744
modules/paid_content/xenforo_forum_client.py
Normal file
744
modules/paid_content/xenforo_forum_client.py
Normal file
@@ -0,0 +1,744 @@
|
||||
"""
|
||||
Generic XenForo Forum Client for Paid Content
|
||||
|
||||
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
|
||||
treating each celebrity name as a "creator" and each matching thread as a post.
|
||||
|
||||
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
|
||||
and resolved via ImageHostHandler from forum_downloader.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse, unquote_plus
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class XenForoForumClient(LoggingMixin):
    """Generic client for scraping XenForo-based forum threads."""

    # Local FlareSolverr instance used to bypass Cloudflare when a plain
    # request returns 403 (see _fetch_via_flaresolverr).
    FLARESOLVERR_URL = 'http://localhost:8191/v1'

    # Browser-like default headers sent with every HTTP request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Lowercase extensions treated as images by _is_direct_image_url.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    # External image host domains to look for in post links
    # (substring-matched against the URL netloc in _is_image_host_url).
    IMAGE_HOST_DOMAINS = [
        'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
        'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
        'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
        'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
    ]
|
||||
|
||||
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
    """Create a client for one XenForo forum.

    Args:
        service_id: Short identifier for this forum; also used as the
            default log module name.
        base_url: Forum root URL (trailing slashes are stripped).
        cookie_path: Path to a Playwright-format JSON cookie file.
        log_callback: Optional callable forwarded to LoggingMixin.
    """
    self.SERVICE_ID = service_id
    self.BASE_URL = base_url.rstrip('/')
    self.COOKIE_PATH = cookie_path
    # LoggingMixin sets up self.log(); must run before any logging below.
    self._init_logger('PaidContent', log_callback, default_module=service_id)
    # Lazy caches: cookie dict (loaded on first use) and the
    # ImageHostHandler class (False sentinel once an import failed).
    self._cookies: Optional[Dict[str, str]] = None
    self._image_host_handler = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _load_cookies(self) -> Dict[str, str]:
|
||||
"""Load Playwright-format cookies and convert to {name: value} dict."""
|
||||
if self._cookies is not None:
|
||||
return self._cookies
|
||||
|
||||
try:
|
||||
cookie_path = Path(self.COOKIE_PATH)
|
||||
if cookie_path.exists():
|
||||
with open(cookie_path, 'r') as f:
|
||||
raw_cookies = json.load(f)
|
||||
self._cookies = {c['name']: c['value'] for c in raw_cookies}
|
||||
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
|
||||
else:
|
||||
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
|
||||
self._cookies = {}
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies: {e}", 'warning')
|
||||
self._cookies = {}
|
||||
|
||||
return self._cookies
|
||||
|
||||
def _get_cookie_header(self) -> str:
|
||||
"""Build Cookie header string from loaded cookies."""
|
||||
cookies = self._load_cookies()
|
||||
return '; '.join(f'{k}={v}' for k, v in cookies.items())
|
||||
|
||||
def _get_request_headers(self) -> Dict[str, str]:
|
||||
"""Get headers with cookies for authenticated requests."""
|
||||
headers = dict(self.HEADERS)
|
||||
cookie_str = self._get_cookie_header()
|
||||
if cookie_str:
|
||||
headers['Cookie'] = cookie_str
|
||||
return headers
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Image host handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_image_host_handler(self):
|
||||
"""Get or create ImageHostHandler instance."""
|
||||
if self._image_host_handler is None:
|
||||
try:
|
||||
from modules.forum_downloader import ImageHostHandler
|
||||
self._image_host_handler = ImageHostHandler
|
||||
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
|
||||
except ImportError:
|
||||
self.log("ImageHostHandler not available", 'warning')
|
||||
self._image_host_handler = False # sentinel to avoid retrying
|
||||
return self._image_host_handler if self._image_host_handler is not False else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch a page with cookies. Falls back to FlareSolverr on 403.

    Args:
        session: Shared aiohttp session to issue the request on.
        url: Absolute URL to fetch.

    Returns:
        Response body text, or None when the status is neither 200 nor a
        403 that FlareSolverr could recover from.
    """
    headers = self._get_request_headers()
    try:
        async with session.get(url, headers=headers, allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            if resp.status == 403:
                # 403 here usually means Cloudflare; retry via the solver.
                self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
                return await self._fetch_via_flaresolverr(url)
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        # Network/timeout errors also get one FlareSolverr attempt.
        self.log(f"Error fetching {url}: {e}", 'warning')
        return await self._fetch_via_flaresolverr(url)
|
||||
|
||||
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch a page using FlareSolverr to bypass Cloudflare.

    Creates a throwaway FlareSolverr session, performs the GET with our
    stored cookies, and always destroys the session afterwards.

    NOTE(review): uses blocking ``requests`` calls inside an async method,
    which stalls the event loop for the duration of each solve — consider
    running in an executor if this becomes a throughput problem.

    Returns:
        The solved page HTML, or None on any failure.
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None

    fs_session_id = None
    try:
        # Create session
        resp = std_requests.post(self.FLARESOLVERR_URL, json={
            'cmd': 'sessions.create'
        }, timeout=30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session", 'warning')
            return None
        fs_session_id = data.get('session')

        # Fetch page (forward our stored cookies so the forum sees us
        # as logged in even through the solver).
        cookies = self._load_cookies()
        resp = std_requests.post(self.FLARESOLVERR_URL, json={
            'cmd': 'request.get',
            'url': url,
            'session': fs_session_id,
            'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
            'maxTimeout': 60000,
        }, timeout=70)
        page_data = resp.json()
        if page_data.get('status') == 'ok':
            return page_data.get('solution', {}).get('response', '')
        self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
        return None

    except Exception as e:
        self.log(f"FlareSolverr error for {url}: {e}", 'warning')
        return None
    finally:
        # Best-effort cleanup: leaked FlareSolverr sessions hold browser
        # instances open, so always try to destroy ours.
        if fs_session_id:
            try:
                std_requests.post(self.FLARESOLVERR_URL, json={
                    'cmd': 'sessions.destroy',
                    'session': fs_session_id,
                }, timeout=10)
            except Exception:
                pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def search_threads(self, query: str) -> List[Dict]:
    """Search for threads matching a celebrity name.

    Performs a title-only XenForo search (POST to /search/search using a
    CSRF token scraped from the search form) and follows "Next" links
    through all result pages.

    Returns list of {thread_id, title, url, reply_count, published_at}.
    """
    threads = []
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        # XenForo search: POST form to /search/search
        search_url = f'{self.BASE_URL}/search/search'
        headers = self._get_request_headers()
        headers['Content-Type'] = 'application/x-www-form-urlencoded'

        # Need CSRF token - fetch search page first
        search_page_url = f'{self.BASE_URL}/search/'
        page_html = await self._fetch_page(session, search_page_url)
        if not page_html:
            self.log("Failed to fetch search page", 'warning')
            return threads

        # Extract CSRF token; an empty token is still submitted (some
        # boards accept anonymous searches without one).
        csrf_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', page_html)
        xf_token = csrf_match.group(1) if csrf_match else ''

        form_data = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',  # match thread titles only
            'order': 'date',
            '_xfToken': xf_token,
        }

        try:
            async with session.post(search_url, headers=headers, data=form_data,
                                    allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return threads
                result_html = await resp.text()
                # XenForo redirects to /search/<id>/ — keep the final URL
                # so pagination below resolves relative to it.
                result_url = str(resp.url)
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return threads

        threads = self._parse_search_results(result_html)

        # Handle search result pagination: stop at the first page that is
        # missing, empty, or has no "Next" link.
        page = 2
        while True:
            next_url = self._find_next_search_page(result_html, result_url, page)
            if not next_url:
                break
            await asyncio.sleep(0.3)  # polite delay between pages
            result_html = await self._fetch_page(session, next_url)
            if not result_html:
                break
            more = self._parse_search_results(result_html)
            if not more:
                break
            threads.extend(more)
            page += 1

    self.log(f"Search for '{query}' found {len(threads)} threads", 'info')
    return threads
|
||||
|
||||
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """Fetch page 1 of a thread and extract metadata.

    Returns {thread_id, title, reply_count, page_count, url}, or None if
    the page could not be fetched or parsing raised. Note thread_id may
    be None when the URL does not match a known XenForo thread format.
    """
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            page_html = await self._fetch_page(session, thread_url)
            if not page_html:
                return None

            title = self._extract_title(page_html)
            page_count = self._extract_page_count(page_html)
            reply_count = self._extract_reply_count(page_html)
            thread_id = self._extract_thread_id(thread_url)

            return {
                'thread_id': thread_id,
                'title': title or 'Untitled',
                'reply_count': reply_count,
                'page_count': page_count,
                # Normalized URL: no fragment, no trailing slash.
                'url': thread_url.split('#')[0].rstrip('/'),
            }
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
|
||||
|
||||
async def get_thread_images(self, thread_url: str, page_count: Optional[int] = None,
                            start_page: int = 1) -> List[Dict]:
    """Scrape all pages of a thread and extract image host links.

    Args:
        thread_url: Thread URL (page 1).
        page_count: Total pages if already known; when None, page 1 is
            fetched to determine it (and its links are collected).
        start_page: First page to walk; ignored (forced to 2) when
            page_count is None since page 1 is consumed above.

    Returns:
        List of {url, host} dicts, deduplicated by url in first-seen order.
    """
    images = []
    seen_urls: Set[str] = set()

    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # If page_count not provided, fetch page 1 to determine it
        if page_count is None:
            page1_html = await self._fetch_page(session, thread_url)
            if not page1_html:
                return images
            page_count = self._extract_page_count(page1_html)
            page_images = self._extract_image_links(page1_html)
            for img in page_images:
                if img['url'] not in seen_urls:
                    seen_urls.add(img['url'])
                    images.append(img)
            start_page = 2  # page 1 already harvested above

        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # Rate limit

            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                # A missing page likely means throttling/outage — stop
                # rather than skip, so gaps are obvious to the caller.
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break

            page_images = self._extract_image_links(page_html)
            new_count = 0
            for img in page_images:
                if img['url'] not in seen_urls:
                    seen_urls.add(img['url'])
                    images.append(img)
                    new_count += 1

            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

    self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
    return images
|
||||
|
||||
async def resolve_image_url(self, host_page_url: str,
                            session: Optional[aiohttp.ClientSession] = None) -> Optional[str]:
    """Resolve an image host page URL to a direct image URL.

    Resolution order:
      1. ImageHostHandler URL-only extraction (no network).
      2. imgbox thumbnail URL rewrite (no network).
      3. Fetch the host page, then ImageHostHandler with page content,
         then the manual per-host regex fallbacks.

    Args:
        host_page_url: Viewer-page URL on an external image host.
        session: Optional shared aiohttp session; a temporary one is
            created (and closed) when omitted.

    Returns:
        Direct image URL, or None if every strategy failed.
    """
    handler = self._get_image_host_handler()

    # Try direct extraction without fetching the page
    if handler:
        direct = handler.extract_direct_url(host_page_url)
        if direct:
            return direct

    # imgbox thumbnail → full image conversion (thumbs2 → images2)
    m = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
    if m:
        return f"https://images{m.group(1)}.imgbox.com/{m.group(2)}{m.group(3)}_o.jpg"

    # For hosts that need page content, fetch and parse
    own_session = session is None
    if own_session:
        timeout = aiohttp.ClientTimeout(total=30)
        session = aiohttp.ClientSession(timeout=timeout)

    try:
        # ImageBam requires sfw_inter=1 cookie to bypass consent page
        headers = dict(self.HEADERS)
        if 'imagebam' in host_page_url:
            headers['Cookie'] = 'sfw_inter=1'

        try:
            async with session.get(host_page_url, headers=headers,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    return None
                page_content = await resp.text()
                final_url = str(resp.url)
        except Exception as e:
            self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
            return None

        # Try handler with page content
        if handler:
            direct = handler.extract_direct_url(host_page_url, page_content=page_content)
            if direct:
                return direct

        # Manual extraction fallbacks
        return self._extract_direct_image_from_html(host_page_url, page_content, final_url)

    finally:
        # Only close sessions we created; never a caller-provided one.
        if own_session:
            await session.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _parse_search_results(self, html_content: str) -> List[Dict]:
|
||||
"""Parse XenForo search results page for thread links."""
|
||||
threads = []
|
||||
|
||||
# Parse each contentRow block to extract title, URL, and date
|
||||
for block_match in re.finditer(
|
||||
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
block = block_match.group(1)
|
||||
|
||||
# Extract thread URL and title
|
||||
title_match = re.search(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
block, re.DOTALL
|
||||
)
|
||||
if not title_match:
|
||||
continue
|
||||
|
||||
url = title_match.group(1)
|
||||
title_raw = title_match.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
|
||||
if not title:
|
||||
continue
|
||||
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
|
||||
# Extract date from <time datetime="..."> tag
|
||||
published_at = None
|
||||
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
|
||||
if time_match:
|
||||
published_at = time_match.group(1)
|
||||
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': published_at,
|
||||
})
|
||||
|
||||
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
|
||||
if not threads:
|
||||
for m in re.finditer(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
url = m.group(1)
|
||||
title_raw = m.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
if not title:
|
||||
continue
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': None,
|
||||
})
|
||||
|
||||
# Deduplicate by thread_id
|
||||
seen = set()
|
||||
unique = []
|
||||
for t in threads:
|
||||
if t['thread_id'] not in seen:
|
||||
seen.add(t['thread_id'])
|
||||
unique.append(t)
|
||||
|
||||
return unique
|
||||
|
||||
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
|
||||
"""Find URL for the next page of search results."""
|
||||
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
|
||||
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
|
||||
m = re.search(pattern, html_content, re.IGNORECASE)
|
||||
if m:
|
||||
url = m.group(1)
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + html.unescape(url)
|
||||
return url
|
||||
return None
|
||||
|
||||
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
# Matched as plain substrings against the lowercased URL in _is_junk_url.
JUNK_URL_PATTERNS = [
    'giphy.com', 'tenor.com', 'gfycat.com',  # reaction GIFs
    'jsdelivr.net', 'joypixels', 'twemoji',  # emoji CDNs
    'wp-content/',  # WordPress media (blog graphics, profile pics)
    '/unicode/', '/emoji/',  # emoji paths
    'haboodadi.com',  # forum signature images
]

# Image hosts that are permanently dead (DNS gone / domain expired) —
# skipping them avoids pointless resolve attempts and timeouts.
DEAD_HOSTS = [
    'someimage.com',
]
|
||||
|
||||
def _extract_image_links(self, page_html: str) -> List[Dict]:
    """Extract image host links from all posts on a page.

    Returns a list of {'url': ..., 'host': ...} dicts — 'host' is the
    identified image host name, or 'direct' for bare <img> URLs.
    Duplicates are NOT removed here; callers dedupe by URL.
    """
    images = []

    # Find all message bodies: XenForo uses <article class="message ..."> and
    # <div class="bbWrapper"> for post content
    for content_match in re.finditer(
        r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
        page_html, re.DOTALL
    ):
        content = content_match.group(1)

        # Extract links to known image hosts
        for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
            link_url = html.unescape(link_match.group(1))
            if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
                images.append({'url': link_url, 'host': self._identify_host(link_url)})

        # Also catch direct image URLs (full-size, not thumbnails)
        # NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
        # — legitimate gallery images are posted as <a href> links to host pages
        # (handled above), while inline <img> from these hosts are signatures.
        for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
            img_url = html.unescape(img_match.group(1))
            # Skip thumbnails, avatars, smilies, and junk
            if any(skip in img_url.lower() for skip in [
                'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
                'data/assets', '/styles/', 'xenforo'
            ]):
                continue
            if self._is_junk_url(img_url):
                continue
            # Skip inline images from known image hosts — these are signatures,
            # not gallery content (gallery images come through as <a> links above)
            if self._is_image_host_url(img_url):
                continue
            if self._is_direct_image_url(img_url):
                images.append({'url': img_url, 'host': 'direct'})

    return images
|
||||
|
||||
def _is_junk_url(self, url: str) -> bool:
|
||||
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
|
||||
url_lower = url.lower()
|
||||
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
|
||||
return True
|
||||
if any(host in url_lower for host in self.DEAD_HOSTS):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_image_host_url(self, url: str) -> bool:
|
||||
"""Check if a URL belongs to a known image hosting service."""
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _is_direct_image_url(self, url: str) -> bool:
|
||||
"""Check if a URL points directly to an image file."""
|
||||
try:
|
||||
path = urlparse(url).path.lower()
|
||||
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _identify_host(self, url: str) -> str:
|
||||
"""Identify which image host a URL belongs to."""
|
||||
handler = self._get_image_host_handler()
|
||||
if handler:
|
||||
host = handler.identify_host(url)
|
||||
if host:
|
||||
return host
|
||||
# Fallback
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
for host_domain in self.IMAGE_HOST_DOMAINS:
|
||||
if host_domain in domain:
|
||||
return host_domain.split('.')[0]
|
||||
except Exception:
|
||||
pass
|
||||
return 'unknown'
|
||||
|
||||
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
    """Manually extract direct image URL from host page HTML.

    Args:
        url: Original host page URL (used for per-host dispatch and
            thumbnail-pattern rewrites).
        page_content: Fetched HTML of the host page.
        final_url: URL after redirects. NOTE(review): currently unused —
            kept for interface stability.

    Returns:
        Direct image URL, or None when no known pattern matched.
    """
    domain = urlparse(url).netloc.lower()

    # imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
    if 'imagebam' in domain:
        m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Alternative: og:image meta tag
        m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # pixhost: <img id="image" src="..."> or img.pixhost.to URL
    if 'pixhost' in domain:
        m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
        m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
        if m:
            return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"

    # imagetwist: <img class="pic" src="...">
    if 'imagetwist' in domain:
        m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
                      page_content)
        if m:
            return html.unescape(m.group(1))

    # imgbox: <img id="img" src="..."> or src before id
    if 'imgbox' in domain:
        m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Direct image URL pattern anywhere in the page
        m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
        if m:
            return html.unescape(m.group(1))

    # turboimagehost: <img class="uImage" src="...">
    if 'turboimagehost' in domain:
        m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # acidimg: <img class="centred" src="...">
    if 'acidimg' in domain:
        m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # pixxxels: same pattern as acidimg
    if 'pixxxels' in domain:
        m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # imx.to: <img class="image-show" src="...">
    if 'imx.to' in domain:
        m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # Generic: try og:image meta tag, but only accept it when it already
    # looks like a direct image file (avoids preview/thumbnail pages).
    m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
    if m:
        img_url = html.unescape(m.group(1))
        if self._is_direct_image_url(img_url):
            return img_url

    return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from XenForo <h1 class="p-title-value">."""
|
||||
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
|
||||
if m:
|
||||
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
|
||||
title = re.sub(r'<[^>]+>', '', m.group(1))
|
||||
# Clean up non-breaking spaces and extra whitespace
|
||||
title = title.replace('\xa0', ' ')
|
||||
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
|
||||
title = re.sub(r'\s+', ' ', title).strip()
|
||||
return html.unescape(title)
|
||||
# Fallback: <title> — strip common XenForo site name suffixes
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from XenForo pagination."""
|
||||
# <li class="pageNav-page"><a href="...">42</a></li>
|
||||
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
|
||||
if pages:
|
||||
return max(int(p) for p in pages)
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _extract_reply_count(page_html: str) -> int:
|
||||
"""Extract reply count from XenForo thread info."""
|
||||
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
|
||||
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
|
||||
if m:
|
||||
return int(m.group(1).replace(',', ''))
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _extract_thread_id(url: str) -> Optional[str]:
|
||||
"""Extract thread ID from XenForo URL.
|
||||
|
||||
Handles both formats:
|
||||
- /threads/title.12345/
|
||||
- /index.php?threads/title.12345/
|
||||
"""
|
||||
m = re.search(r'threads/[^/]*?\.(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
# Fallback: just /threads/{id}/
|
||||
m = re.search(r'threads/(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _build_page_url(thread_url: str, page_num: int) -> str:
|
||||
"""Build paginated thread URL for XenForo.
|
||||
|
||||
Handles: /index.php?threads/slug.12345/page-2
|
||||
"""
|
||||
# Remove existing page- suffix and fragment
|
||||
base = thread_url.split('#')[0].rstrip('/')
|
||||
base = re.sub(r'/page-\d+$', '', base)
|
||||
if page_num == 1:
|
||||
return base + '/'
|
||||
return f'{base}/page-{page_num}'
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
|
||||
"""Get lowercase file extension."""
|
||||
clean = filename_or_url.split('?')[0].split('#')[0]
|
||||
if '.' in clean.split('/')[-1]:
|
||||
return clean.rsplit('.', 1)[-1].lower()
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
|
||||
"""Extract filename from URL path."""
|
||||
path = urlparse(url).path
|
||||
name = path.rstrip('/').split('/')[-1]
|
||||
return name if name else 'unnamed.jpg'
|
||||
Reference in New Issue
Block a user