"""
|
|
Generic XenForo Forum Client for Paid Content
|
|
|
|
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
|
|
treating each celebrity name as a "creator" and each matching thread as a post.
|
|
|
|
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
|
|
and resolved via ImageHostHandler from forum_downloader.
|
|
"""
|
|
|
|
import asyncio
|
|
import html
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set
|
|
from urllib.parse import urlparse, unquote_plus
|
|
|
|
import aiohttp
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Post, Attachment
|
|
|
|
|
|
class XenForoForumClient(LoggingMixin):
    """Generic client for scraping XenForo-based forum threads.

    Searches a XenForo forum for threads by keyword, scrapes thread pages
    for links to external image hosts, and resolves those host pages to
    direct image URLs (via ImageHostHandler where available, with manual
    regex fallbacks). Authentication uses Playwright-format cookies;
    Cloudflare-protected pages fall back to a local FlareSolverr instance.
    """
|
|
|
    # Local FlareSolverr endpoint used to bypass Cloudflare challenges
    # (see _fetch_via_flaresolverr).
    FLARESOLVERR_URL = 'http://localhost:8191/v1'

    # Browser-like baseline headers sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Lowercase file extensions (no dot) treated as image files
    # by _is_direct_image_url / _get_extension.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    # External image host domains to look for in post links
    IMAGE_HOST_DOMAINS = [
        'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
        'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
        'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
        'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
    ]
|
|
|
|
    def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
        """Set up a client for one XenForo forum.

        Args:
            service_id: Stable identifier for this forum; also used as the
                logger's default module name.
            base_url: Forum root URL; a trailing slash is stripped.
            cookie_path: Path to a Playwright-format cookie JSON file.
            log_callback: Optional callable forwarded to the logging mixin.
        """
        self.SERVICE_ID = service_id
        self.BASE_URL = base_url.rstrip('/')
        self.COOKIE_PATH = cookie_path
        self._init_logger('PaidContent', log_callback, default_module=service_id)
        # Lazily-populated caches:
        self._cookies: Optional[Dict[str, str]] = None  # {name: value} once loaded
        self._image_host_handler = None  # handler class, False sentinel, or None (not tried)
|
|
|
# ------------------------------------------------------------------
|
|
# Cookie handling
|
|
# ------------------------------------------------------------------
|
|
|
|
def _load_cookies(self) -> Dict[str, str]:
|
|
"""Load Playwright-format cookies and convert to {name: value} dict."""
|
|
if self._cookies is not None:
|
|
return self._cookies
|
|
|
|
try:
|
|
cookie_path = Path(self.COOKIE_PATH)
|
|
if cookie_path.exists():
|
|
with open(cookie_path, 'r') as f:
|
|
raw_cookies = json.load(f)
|
|
self._cookies = {c['name']: c['value'] for c in raw_cookies}
|
|
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
|
|
else:
|
|
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
|
|
self._cookies = {}
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies: {e}", 'warning')
|
|
self._cookies = {}
|
|
|
|
return self._cookies
|
|
|
|
def _get_cookie_header(self) -> str:
|
|
"""Build Cookie header string from loaded cookies."""
|
|
cookies = self._load_cookies()
|
|
return '; '.join(f'{k}={v}' for k, v in cookies.items())
|
|
|
|
def _get_request_headers(self) -> Dict[str, str]:
|
|
"""Get headers with cookies for authenticated requests."""
|
|
headers = dict(self.HEADERS)
|
|
cookie_str = self._get_cookie_header()
|
|
if cookie_str:
|
|
headers['Cookie'] = cookie_str
|
|
return headers
|
|
|
|
# ------------------------------------------------------------------
|
|
# Image host handling
|
|
# ------------------------------------------------------------------
|
|
|
|
def _get_image_host_handler(self):
|
|
"""Get or create ImageHostHandler instance."""
|
|
if self._image_host_handler is None:
|
|
try:
|
|
from modules.forum_downloader import ImageHostHandler
|
|
self._image_host_handler = ImageHostHandler
|
|
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
|
|
except ImportError:
|
|
self.log("ImageHostHandler not available", 'warning')
|
|
self._image_host_handler = False # sentinel to avoid retrying
|
|
return self._image_host_handler if self._image_host_handler is not False else None
|
|
|
|
# ------------------------------------------------------------------
|
|
# HTTP helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
|
|
"""Fetch a page with cookies. Falls back to FlareSolverr on 403."""
|
|
headers = self._get_request_headers()
|
|
try:
|
|
async with session.get(url, headers=headers, allow_redirects=True) as resp:
|
|
if resp.status == 200:
|
|
return await resp.text()
|
|
if resp.status == 403:
|
|
self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
|
|
return await self._fetch_via_flaresolverr(url)
|
|
self.log(f"HTTP {resp.status} for {url}", 'warning')
|
|
return None
|
|
except Exception as e:
|
|
self.log(f"Error fetching {url}: {e}", 'warning')
|
|
return await self._fetch_via_flaresolverr(url)
|
|
|
|
    async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
        """Fetch a page using FlareSolverr to bypass Cloudflare.

        Creates a throwaway FlareSolverr session, issues a ``request.get``
        with our forum cookies forwarded, and destroys the session in a
        ``finally`` block so browser instances don't leak. Returns the
        solved page HTML, or None on any failure.

        NOTE(review): this uses the synchronous ``requests`` library, so
        it blocks the event loop for the duration of the call (up to ~70s
        per request) — confirm that's acceptable for the calling context.
        """
        try:
            # Imported lazily so the module still works without `requests`.
            import requests as std_requests
        except ImportError:
            self.log("requests library not available for FlareSolverr", 'warning')
            return None

        fs_session_id = None
        try:
            # Create session
            resp = std_requests.post(self.FLARESOLVERR_URL, json={
                'cmd': 'sessions.create'
            }, timeout=30)
            data = resp.json()
            if data.get('status') != 'ok':
                self.log("Failed to create FlareSolverr session", 'warning')
                return None
            fs_session_id = data.get('session')

            # Fetch page, forwarding our forum auth cookies to the
            # FlareSolverr-driven browser
            cookies = self._load_cookies()
            resp = std_requests.post(self.FLARESOLVERR_URL, json={
                'cmd': 'request.get',
                'url': url,
                'session': fs_session_id,
                'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
                'maxTimeout': 60000,  # ms FlareSolverr may spend solving the challenge
            }, timeout=70)
            page_data = resp.json()
            if page_data.get('status') == 'ok':
                return page_data.get('solution', {}).get('response', '')
            self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
            return None

        except Exception as e:
            self.log(f"FlareSolverr error for {url}: {e}", 'warning')
            return None
        finally:
            # Best-effort cleanup: destroy the session even on failure so
            # FlareSolverr doesn't accumulate headless browser instances.
            if fs_session_id:
                try:
                    std_requests.post(self.FLARESOLVERR_URL, json={
                        'cmd': 'sessions.destroy',
                        'session': fs_session_id,
                    }, timeout=10)
                except Exception:
                    pass
|
|
|
# ------------------------------------------------------------------
|
|
# Public API
|
|
# ------------------------------------------------------------------
|
|
|
|
    async def search_threads(self, query: str) -> List[Dict]:
        """Search the forum for threads matching a celebrity name.

        Flow: fetch /search/ to scrape the CSRF token, POST the search
        form (title-only, newest first), parse the first result page,
        then follow "Next" pagination links until exhausted.

        Returns a list of
        {thread_id, title, url, reply_count, published_at} dicts;
        reply_count is always 0 here (search results don't expose it) and
        published_at is the <time datetime> string or None.
        """
        threads = []
        timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            # XenForo search: POST form to /search/search
            search_url = f'{self.BASE_URL}/search/search'
            headers = self._get_request_headers()
            headers['Content-Type'] = 'application/x-www-form-urlencoded'

            # Need CSRF token - fetch search page first
            search_page_url = f'{self.BASE_URL}/search/'
            page_html = await self._fetch_page(session, search_page_url)
            if not page_html:
                self.log("Failed to fetch search page", 'warning')
                return threads

            # Extract CSRF token (falls back to empty string; some forums
            # accept guest searches without one)
            csrf_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', page_html)
            xf_token = csrf_match.group(1) if csrf_match else ''

            form_data = {
                'keywords': query,
                'search_type': 'post',
                'c[title_only]': '1',  # match thread titles only
                'order': 'date',
                '_xfToken': xf_token,
            }

            try:
                # allow_redirects: XenForo redirects to /search/<id>/ results
                async with session.post(search_url, headers=headers, data=form_data,
                                        allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Search returned HTTP {resp.status}", 'warning')
                        return threads
                    result_html = await resp.text()
                    result_url = str(resp.url)
            except Exception as e:
                self.log(f"Search failed: {e}", 'error')
                return threads

            threads = self._parse_search_results(result_html)

            # Handle search result pagination by following the "Next" link
            # embedded in each page (the page counter is informational;
            # _find_next_search_page relies on the HTML alone).
            page = 2
            while True:
                next_url = self._find_next_search_page(result_html, result_url, page)
                if not next_url:
                    break
                await asyncio.sleep(0.3)  # be polite between page fetches
                result_html = await self._fetch_page(session, next_url)
                if not result_html:
                    break
                more = self._parse_search_results(result_html)
                if not more:
                    break
                threads.extend(more)
                page += 1

        self.log(f"Search for '{query}' found {len(threads)} threads", 'info')
        return threads
|
|
|
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
|
|
"""Fetch page 1 of a thread and extract metadata.
|
|
|
|
Returns {thread_id, title, reply_count, page_count, url}.
|
|
"""
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
try:
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
page_html = await self._fetch_page(session, thread_url)
|
|
if not page_html:
|
|
return None
|
|
|
|
title = self._extract_title(page_html)
|
|
page_count = self._extract_page_count(page_html)
|
|
reply_count = self._extract_reply_count(page_html)
|
|
thread_id = self._extract_thread_id(thread_url)
|
|
|
|
return {
|
|
'thread_id': thread_id,
|
|
'title': title or 'Untitled',
|
|
'reply_count': reply_count,
|
|
'page_count': page_count,
|
|
'url': thread_url.split('#')[0].rstrip('/'),
|
|
}
|
|
except Exception as e:
|
|
self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
|
|
return None
|
|
|
|
    async def get_thread_images(self, thread_url: str, page_count: Optional[int] = None,
                                start_page: int = 1) -> List[Dict]:
        """Scrape pages of a thread and extract image host links.

        Args:
            thread_url: Canonical thread URL (page 1).
            page_count: Total page count, if already known (e.g. from
                get_thread_info). When None, page 1 is fetched to detect
                it, and that page's links are harvested at the same time.
            start_page: First page to scrape. Ignored when page_count is
                None (page 1 is always fetched then, and scraping resumes
                at page 2).

        Returns:
            List of {'url': ..., 'host': ...} dicts, deduplicated by URL
            in first-seen order.
        """
        images = []
        seen_urls: Set[str] = set()  # dedupe across pages

        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # If page_count not provided, fetch page 1 to determine it
            if page_count is None:
                page1_html = await self._fetch_page(session, thread_url)
                if not page1_html:
                    return images
                page_count = self._extract_page_count(page1_html)
                page_images = self._extract_image_links(page1_html)
                for img in page_images:
                    if img['url'] not in seen_urls:
                        seen_urls.add(img['url'])
                        images.append(img)
                # Page 1 already harvested above, so resume at page 2
                start_page = 2

            for page_num in range(start_page, page_count + 1):
                page_url = self._build_page_url(thread_url, page_num)
                await asyncio.sleep(0.5)  # Rate limit

                page_html = await self._fetch_page(session, page_url)
                if not page_html:
                    # Abort rather than skip: a mid-thread failure usually
                    # means auth/Cloudflare trouble, not one bad page.
                    self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                    break

                page_images = self._extract_image_links(page_html)
                new_count = 0
                for img in page_images:
                    if img['url'] not in seen_urls:
                        seen_urls.add(img['url'])
                        images.append(img)
                        new_count += 1

                self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

        self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
        return images
|
|
|
    async def resolve_image_url(self, host_page_url: str, session: Optional[aiohttp.ClientSession] = None) -> Optional[str]:
        """Resolve an image host page URL to a direct image URL.

        Resolution order:
          1. ImageHostHandler.extract_direct_url on the URL alone;
          2. fetch-free URL rewrites (imgbox thumbnail → full image);
          3. fetch the host page and retry the handler with its HTML;
          4. manual per-host HTML extraction fallbacks.

        If *session* is None a temporary ClientSession is created and is
        always closed before returning; a caller-provided session is left
        open. Returns None when the URL cannot be resolved.
        """
        handler = self._get_image_host_handler()

        # Try direct extraction without fetching the page
        if handler:
            direct = handler.extract_direct_url(host_page_url)
            if direct:
                return direct

        # imgbox thumbnail → full image conversion (thumbs2 → images2).
        # NOTE(review): always emits an _o.jpg URL regardless of the
        # thumbnail's extension — confirm imgbox serves jpg for all.
        m = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
        if m:
            return f"https://images{m.group(1)}.imgbox.com/{m.group(2)}{m.group(3)}_o.jpg"

        # For hosts that need page content, fetch and parse
        own_session = session is None
        if own_session:
            timeout = aiohttp.ClientTimeout(total=30)
            session = aiohttp.ClientSession(timeout=timeout)

        try:
            # ImageBam requires sfw_inter=1 cookie to bypass consent page
            headers = dict(self.HEADERS)
            if 'imagebam' in host_page_url:
                headers['Cookie'] = 'sfw_inter=1'

            try:
                async with session.get(host_page_url, headers=headers,
                                       allow_redirects=True) as resp:
                    if resp.status != 200:
                        return None
                    page_content = await resp.text()
                    final_url = str(resp.url)  # post-redirect URL, passed to fallbacks
            except Exception as e:
                self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
                return None

            # Try handler with page content
            if handler:
                direct = handler.extract_direct_url(host_page_url, page_content=page_content)
                if direct:
                    return direct

            # Manual extraction fallbacks
            return self._extract_direct_image_from_html(host_page_url, page_content, final_url)

        finally:
            # Only close sessions we created ourselves.
            if own_session:
                await session.close()
|
|
|
# ------------------------------------------------------------------
|
|
# HTML parsing helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
def _parse_search_results(self, html_content: str) -> List[Dict]:
|
|
"""Parse XenForo search results page for thread links."""
|
|
threads = []
|
|
|
|
# Parse each contentRow block to extract title, URL, and date
|
|
for block_match in re.finditer(
|
|
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
|
|
html_content, re.DOTALL
|
|
):
|
|
block = block_match.group(1)
|
|
|
|
# Extract thread URL and title
|
|
title_match = re.search(
|
|
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
|
block, re.DOTALL
|
|
)
|
|
if not title_match:
|
|
continue
|
|
|
|
url = title_match.group(1)
|
|
title_raw = title_match.group(2)
|
|
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
|
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
|
|
|
if not title:
|
|
continue
|
|
|
|
if not url.startswith('http'):
|
|
url = self.BASE_URL + url
|
|
|
|
thread_id = self._extract_thread_id(url)
|
|
if not thread_id:
|
|
continue
|
|
|
|
# Extract date from <time datetime="..."> tag
|
|
published_at = None
|
|
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
|
|
if time_match:
|
|
published_at = time_match.group(1)
|
|
|
|
threads.append({
|
|
'thread_id': thread_id,
|
|
'title': title,
|
|
'url': url.split('#')[0].rstrip('/'),
|
|
'reply_count': 0,
|
|
'published_at': published_at,
|
|
})
|
|
|
|
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
|
|
if not threads:
|
|
for m in re.finditer(
|
|
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
|
html_content, re.DOTALL
|
|
):
|
|
url = m.group(1)
|
|
title_raw = m.group(2)
|
|
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
|
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
|
if not title:
|
|
continue
|
|
if not url.startswith('http'):
|
|
url = self.BASE_URL + url
|
|
thread_id = self._extract_thread_id(url)
|
|
if not thread_id:
|
|
continue
|
|
threads.append({
|
|
'thread_id': thread_id,
|
|
'title': title,
|
|
'url': url.split('#')[0].rstrip('/'),
|
|
'reply_count': 0,
|
|
'published_at': None,
|
|
})
|
|
|
|
# Deduplicate by thread_id
|
|
seen = set()
|
|
unique = []
|
|
for t in threads:
|
|
if t['thread_id'] not in seen:
|
|
seen.add(t['thread_id'])
|
|
unique.append(t)
|
|
|
|
return unique
|
|
|
|
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
|
|
"""Find URL for the next page of search results."""
|
|
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
|
|
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
|
|
m = re.search(pattern, html_content, re.IGNORECASE)
|
|
if m:
|
|
url = m.group(1)
|
|
if not url.startswith('http'):
|
|
url = self.BASE_URL + html.unescape(url)
|
|
return url
|
|
return None
|
|
|
|
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
|
|
JUNK_URL_PATTERNS = [
|
|
'giphy.com', 'tenor.com', 'gfycat.com', # reaction GIFs
|
|
'jsdelivr.net', 'joypixels', 'twemoji', # emoji CDNs
|
|
'wp-content/', # WordPress media (blog graphics, profile pics)
|
|
'/unicode/', '/emoji/', # emoji paths
|
|
'haboodadi.com', # forum signature images
|
|
]
|
|
|
|
# Image hosts that are permanently dead (DNS gone / domain expired)
|
|
DEAD_HOSTS = [
|
|
'someimage.com',
|
|
]
|
|
|
|
def _extract_image_links(self, page_html: str) -> List[Dict]:
|
|
"""Extract image host links from all posts on a page."""
|
|
images = []
|
|
|
|
# Find all message bodies: XenForo uses <article class="message ..."> and
|
|
# <div class="bbWrapper"> for post content
|
|
for content_match in re.finditer(
|
|
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
|
|
page_html, re.DOTALL
|
|
):
|
|
content = content_match.group(1)
|
|
|
|
# Extract links to known image hosts
|
|
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
|
|
link_url = html.unescape(link_match.group(1))
|
|
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
|
|
images.append({'url': link_url, 'host': self._identify_host(link_url)})
|
|
|
|
# Also catch direct image URLs (full-size, not thumbnails)
|
|
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
|
|
# — legitimate gallery images are posted as <a href> links to host pages
|
|
# (handled above), while inline <img> from these hosts are signatures.
|
|
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
|
|
img_url = html.unescape(img_match.group(1))
|
|
# Skip thumbnails, avatars, smilies, and junk
|
|
if any(skip in img_url.lower() for skip in [
|
|
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
|
|
'data/assets', '/styles/', 'xenforo'
|
|
]):
|
|
continue
|
|
if self._is_junk_url(img_url):
|
|
continue
|
|
# Skip inline images from known image hosts — these are signatures,
|
|
# not gallery content (gallery images come through as <a> links above)
|
|
if self._is_image_host_url(img_url):
|
|
continue
|
|
if self._is_direct_image_url(img_url):
|
|
images.append({'url': img_url, 'host': 'direct'})
|
|
|
|
return images
|
|
|
|
def _is_junk_url(self, url: str) -> bool:
|
|
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
|
|
url_lower = url.lower()
|
|
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
|
|
return True
|
|
if any(host in url_lower for host in self.DEAD_HOSTS):
|
|
return True
|
|
return False
|
|
|
|
def _is_image_host_url(self, url: str) -> bool:
|
|
"""Check if a URL belongs to a known image hosting service."""
|
|
try:
|
|
domain = urlparse(url).netloc.lower()
|
|
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
|
|
except Exception:
|
|
return False
|
|
|
|
def _is_direct_image_url(self, url: str) -> bool:
|
|
"""Check if a URL points directly to an image file."""
|
|
try:
|
|
path = urlparse(url).path.lower()
|
|
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
|
|
except Exception:
|
|
return False
|
|
|
|
def _identify_host(self, url: str) -> str:
|
|
"""Identify which image host a URL belongs to."""
|
|
handler = self._get_image_host_handler()
|
|
if handler:
|
|
host = handler.identify_host(url)
|
|
if host:
|
|
return host
|
|
# Fallback
|
|
try:
|
|
domain = urlparse(url).netloc.lower()
|
|
for host_domain in self.IMAGE_HOST_DOMAINS:
|
|
if host_domain in domain:
|
|
return host_domain.split('.')[0]
|
|
except Exception:
|
|
pass
|
|
return 'unknown'
|
|
|
|
    def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
        """Manually extract a direct image URL from a host page's HTML.

        Host-specific regexes are selected by the domain of *url* and
        tried in order; a generic og:image check runs last. The patterns
        and their ordering are empirically tuned per host — preserve the
        order when editing.

        NOTE(review): *final_url* (the post-redirect URL from the fetch)
        is currently unused by every branch — confirm whether it can be
        dropped or is kept for future redirect-aware hosts.
        """
        domain = urlparse(url).netloc.lower()

        # imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
        if 'imagebam' in domain:
            # Fast path: any <img> pointing at the images*.imagebam.com CDN
            m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Alternative: og:image meta tag
            m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # pixhost: <img id="image" src="..."> or img.pixhost.to URL
        if 'pixhost' in domain:
            m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
            m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
            if m:
                return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"

        # imagetwist: <img class="pic" src="...">
        if 'imagetwist' in domain:
            m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Older template: centered <p> wrapping the full-size <img>
            m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
                          page_content)
            if m:
                return html.unescape(m.group(1))

        # imgbox: <img id="img" src="..."> or src before id
        if 'imgbox' in domain:
            m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Attribute order varies between templates
            m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Direct image URL pattern anywhere in the page
            m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
            if m:
                return html.unescape(m.group(1))

        # turboimagehost: <img class="uImage" src="...">
        if 'turboimagehost' in domain:
            m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # acidimg: <img class="centred" src="...">
        if 'acidimg' in domain:
            m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # pixxxels: same pattern as acidimg
        if 'pixxxels' in domain:
            m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # imx.to: <img class="image-show" src="...">
        if 'imx.to' in domain:
            m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # Generic fallback: og:image meta tag, accepted only when it looks
        # like a direct image file (avoids preview/banner images)
        m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
        if m:
            img_url = html.unescape(m.group(1))
            if self._is_direct_image_url(img_url):
                return img_url

        return None
|
|
|
# ------------------------------------------------------------------
|
|
# Utility helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def _extract_title(page_html: str) -> Optional[str]:
|
|
"""Extract thread title from XenForo <h1 class="p-title-value">."""
|
|
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
|
|
if m:
|
|
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
|
|
title = re.sub(r'<[^>]+>', '', m.group(1))
|
|
# Clean up non-breaking spaces and extra whitespace
|
|
title = title.replace('\xa0', ' ')
|
|
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
|
|
title = re.sub(r'\s+', ' ', title).strip()
|
|
return html.unescape(title)
|
|
# Fallback: <title> — strip common XenForo site name suffixes
|
|
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
|
if m:
|
|
title = html.unescape(m.group(1).strip())
|
|
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
|
|
return title
|
|
return None
|
|
|
|
@staticmethod
|
|
def _extract_page_count(page_html: str) -> int:
|
|
"""Extract total page count from XenForo pagination."""
|
|
# <li class="pageNav-page"><a href="...">42</a></li>
|
|
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
|
|
if pages:
|
|
return max(int(p) for p in pages)
|
|
return 1
|
|
|
|
@staticmethod
|
|
def _extract_reply_count(page_html: str) -> int:
|
|
"""Extract reply count from XenForo thread info."""
|
|
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
|
|
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
|
|
if m:
|
|
return int(m.group(1).replace(',', ''))
|
|
return 0
|
|
|
|
@staticmethod
|
|
def _extract_thread_id(url: str) -> Optional[str]:
|
|
"""Extract thread ID from XenForo URL.
|
|
|
|
Handles both formats:
|
|
- /threads/title.12345/
|
|
- /index.php?threads/title.12345/
|
|
"""
|
|
m = re.search(r'threads/[^/]*?\.(\d+)', url)
|
|
if m:
|
|
return m.group(1)
|
|
# Fallback: just /threads/{id}/
|
|
m = re.search(r'threads/(\d+)', url)
|
|
if m:
|
|
return m.group(1)
|
|
return None
|
|
|
|
@staticmethod
|
|
def _build_page_url(thread_url: str, page_num: int) -> str:
|
|
"""Build paginated thread URL for XenForo.
|
|
|
|
Handles: /index.php?threads/slug.12345/page-2
|
|
"""
|
|
# Remove existing page- suffix and fragment
|
|
base = thread_url.split('#')[0].rstrip('/')
|
|
base = re.sub(r'/page-\d+$', '', base)
|
|
if page_num == 1:
|
|
return base + '/'
|
|
return f'{base}/page-{page_num}'
|
|
|
|
@staticmethod
|
|
def _get_extension(filename_or_url: str) -> str:
|
|
"""Get lowercase file extension."""
|
|
clean = filename_or_url.split('?')[0].split('#')[0]
|
|
if '.' in clean.split('/')[-1]:
|
|
return clean.rsplit('.', 1)[-1].lower()
|
|
return ''
|
|
|
|
@staticmethod
|
|
def _filename_from_url(url: str) -> str:
|
|
"""Extract filename from URL path."""
|
|
path = urlparse(url).path
|
|
name = path.rstrip('/').split('/')[-1]
|
|
return name if name else 'unnamed.jpg'
|