Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,744 @@
"""
Generic XenForo Forum Client for Paid Content
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
treating each celebrity name as a "creator" and each matching thread as a post.
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
and resolved via ImageHostHandler from forum_downloader.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse, unquote_plus
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class XenForoForumClient(LoggingMixin):
    """Generic client for scraping XenForo-based forum threads.

    Each search query (a celebrity name) is treated as a "creator" and each
    matching thread as a post; gallery images are linked out to external
    image hosts and resolved separately (see resolve_image_url).
    """
    # Local FlareSolverr endpoint, used as a Cloudflare-bypass fallback.
    FLARESOLVERR_URL = 'http://localhost:8191/v1'
    # Browser-like default headers sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # File extensions treated as direct image files (see _is_direct_image_url).
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # External image host domains to look for in post links;
    # matched as substrings of the URL's netloc (see _is_image_host_url).
    IMAGE_HOST_DOMAINS = [
        'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
        'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
        'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
        'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
    ]
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
    """Create a client for one XenForo forum.

    Args:
        service_id: Identifier used for logging and stored as SERVICE_ID.
        base_url: Forum root URL; any trailing slash is stripped.
        cookie_path: Path to a Playwright-format cookie JSON file.
        log_callback: Optional callable forwarded to the logging mixin.
    """
    self.SERVICE_ID = service_id
    self.BASE_URL = base_url.rstrip('/')
    self.COOKIE_PATH = cookie_path
    self._init_logger('PaidContent', log_callback, default_module=service_id)
    # Lazily populated caches: cookie dict and the ImageHostHandler class.
    self._cookies: Optional[Dict[str, str]] = None
    self._image_host_handler = None
# ------------------------------------------------------------------
# Cookie handling
# ------------------------------------------------------------------
def _load_cookies(self) -> Dict[str, str]:
"""Load Playwright-format cookies and convert to {name: value} dict."""
if self._cookies is not None:
return self._cookies
try:
cookie_path = Path(self.COOKIE_PATH)
if cookie_path.exists():
with open(cookie_path, 'r') as f:
raw_cookies = json.load(f)
self._cookies = {c['name']: c['value'] for c in raw_cookies}
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
else:
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
self._cookies = {}
except Exception as e:
self.log(f"Error loading cookies: {e}", 'warning')
self._cookies = {}
return self._cookies
def _get_cookie_header(self) -> str:
"""Build Cookie header string from loaded cookies."""
cookies = self._load_cookies()
return '; '.join(f'{k}={v}' for k, v in cookies.items())
def _get_request_headers(self) -> Dict[str, str]:
"""Get headers with cookies for authenticated requests."""
headers = dict(self.HEADERS)
cookie_str = self._get_cookie_header()
if cookie_str:
headers['Cookie'] = cookie_str
return headers
# ------------------------------------------------------------------
# Image host handling
# ------------------------------------------------------------------
def _get_image_host_handler(self):
"""Get or create ImageHostHandler instance."""
if self._image_host_handler is None:
try:
from modules.forum_downloader import ImageHostHandler
self._image_host_handler = ImageHostHandler
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
except ImportError:
self.log("ImageHostHandler not available", 'warning')
self._image_host_handler = False # sentinel to avoid retrying
return self._image_host_handler if self._image_host_handler is not False else None
# ------------------------------------------------------------------
# HTTP helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """GET *url* with auth cookies; fall back to FlareSolverr on 403 or errors.

    Returns the response body on HTTP 200, None on any other non-403
    status, and the FlareSolverr result for 403s and transport errors.
    """
    try:
        async with session.get(url, headers=self._get_request_headers(),
                               allow_redirects=True) as resp:
            status = resp.status
            if status == 200:
                return await resp.text()
            if status != 403:
                self.log(f"HTTP {status} for {url}", 'warning')
                return None
            self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
    except Exception as exc:
        self.log(f"Error fetching {url}: {exc}", 'warning')
    # Reached on 403 or on a transport-level failure.
    return await self._fetch_via_flaresolverr(url)
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch a page using FlareSolverr to bypass Cloudflare.

    Creates a throwaway FlareSolverr session, requests *url* through it
    (forwarding the stored auth cookies), and destroys the session in the
    ``finally`` block. Returns the response body or None on any failure.
    NOTE(review): uses the synchronous ``requests`` library, so this can
    block the event loop for up to ~70s — confirm callers tolerate that.
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None
    fs_session_id = None
    try:
        # Create a dedicated FlareSolverr session for this single request.
        resp = std_requests.post(self.FLARESOLVERR_URL, json={
            'cmd': 'sessions.create'
        }, timeout=30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session", 'warning')
            return None
        fs_session_id = data.get('session')
        # Fetch the page through that session, forwarding our cookies.
        cookies = self._load_cookies()
        resp = std_requests.post(self.FLARESOLVERR_URL, json={
            'cmd': 'request.get',
            'url': url,
            'session': fs_session_id,
            'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
            'maxTimeout': 60000,  # FlareSolverr-side timeout (milliseconds)
        }, timeout=70)  # HTTP timeout kept above maxTimeout
        page_data = resp.json()
        if page_data.get('status') == 'ok':
            return page_data.get('solution', {}).get('response', '')
        self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
        return None
    except Exception as e:
        self.log(f"FlareSolverr error for {url}: {e}", 'warning')
        return None
    finally:
        # Best-effort cleanup so FlareSolverr doesn't accumulate sessions.
        if fs_session_id:
            try:
                std_requests.post(self.FLARESOLVERR_URL, json={
                    'cmd': 'sessions.destroy',
                    'session': fs_session_id,
                }, timeout=10)
            except Exception:
                pass
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def search_threads(self, query: str) -> List[Dict]:
    """Search the forum for threads matching a celebrity name.

    Performs a XenForo title-only post search (POST to /search/search
    using a CSRF token scraped from the /search/ form), then follows the
    "Next" pagination links until results run out.

    Args:
        query: Search keywords (the celebrity name).

    Returns:
        List of {thread_id, title, url, reply_count, published_at} dicts
        (reply_count is always 0 here — see _parse_search_results).
    """
    threads = []
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # XenForo search: POST form to /search/search
        search_url = f'{self.BASE_URL}/search/search'
        headers = self._get_request_headers()
        headers['Content-Type'] = 'application/x-www-form-urlencoded'
        # XenForo rejects POSTs without a CSRF token, so load the search
        # form first and scrape the hidden _xfToken field from it.
        search_page_url = f'{self.BASE_URL}/search/'
        page_html = await self._fetch_page(session, search_page_url)
        if not page_html:
            self.log("Failed to fetch search page", 'warning')
            return threads
        # Extract CSRF token (empty string if not found; the POST may then fail)
        csrf_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', page_html)
        xf_token = csrf_match.group(1) if csrf_match else ''
        form_data = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',  # restrict matches to thread titles
            'order': 'date',
            '_xfToken': xf_token,
        }
        try:
            async with session.post(search_url, headers=headers, data=form_data,
                                    allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return threads
                result_html = await resp.text()
                result_url = str(resp.url)  # final URL after redirect to the results page
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return threads
        threads = self._parse_search_results(result_html)
        # Handle search result pagination by following "Next" links.
        # NOTE(review): result_url and page are passed along but currently
        # unused by _find_next_search_page, which reads the href from markup.
        page = 2
        while True:
            next_url = self._find_next_search_page(result_html, result_url, page)
            if not next_url:
                break
            await asyncio.sleep(0.3)  # small delay between result pages
            result_html = await self._fetch_page(session, next_url)
            if not result_html:
                break
            more = self._parse_search_results(result_html)
            if not more:
                break
            threads.extend(more)
            page += 1
        self.log(f"Search for '{query}' found {len(threads)} threads", 'info')
        return threads
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """Fetch page 1 of a thread and return its metadata, or None on failure.

    Returns:
        {thread_id, title, reply_count, page_count, url} — title falls
        back to 'Untitled'; url is normalized (fragment and trailing
        slash stripped).
    """
    try:
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            page_html = await self._fetch_page(session, thread_url)
            if not page_html:
                return None
            return {
                'thread_id': self._extract_thread_id(thread_url),
                'title': self._extract_title(page_html) or 'Untitled',
                'reply_count': self._extract_reply_count(page_html),
                'page_count': self._extract_page_count(page_html),
                'url': thread_url.split('#')[0].rstrip('/'),
            }
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
async def get_thread_images(self, thread_url: str, page_count: Optional[int] = None,
                            start_page: int = 1) -> List[Dict]:
    """Scrape thread pages and collect external image links.

    Args:
        thread_url: Canonical thread URL (page 1).
        page_count: Total page count if already known. When None, page 1
            is fetched first to discover it — and that page's links are
            harvested in the same pass so it isn't fetched twice.
        start_page: First page to scrape (reset to 2 when page_count was
            None, since page 1 was just scraped).

    Returns:
        Deduplicated list of {'url': ..., 'host': ...} dicts as produced
        by _extract_image_links. (Fixed: previous docstring claimed a
        'post_number' key that is never produced.)
    """
    images: List[Dict] = []
    seen_urls: Set[str] = set()
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # If page_count not provided, fetch page 1 to determine it.
        if page_count is None:
            page1_html = await self._fetch_page(session, thread_url)
            if not page1_html:
                return images
            page_count = self._extract_page_count(page1_html)
            for img in self._extract_image_links(page1_html):
                if img['url'] not in seen_urls:
                    seen_urls.add(img['url'])
                    images.append(img)
            start_page = 2
        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # polite rate limit between page fetches
            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break
            new_count = 0
            for img in self._extract_image_links(page_html):
                if img['url'] not in seen_urls:
                    seen_urls.add(img['url'])
                    images.append(img)
                    new_count += 1
            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')
        self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
        return images
async def resolve_image_url(self, host_page_url: str, session: aiohttp.ClientSession = None) -> Optional[str]:
"""Resolve an image host page URL to a direct image URL.
Uses ImageHostHandler from forum_downloader where possible.
"""
handler = self._get_image_host_handler()
# Try direct extraction without fetching the page
if handler:
direct = handler.extract_direct_url(host_page_url)
if direct:
return direct
# imgbox thumbnail → full image conversion (thumbs2 → images2)
m = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
if m:
return f"https://images{m.group(1)}.imgbox.com/{m.group(2)}{m.group(3)}_o.jpg"
# For hosts that need page content, fetch and parse
own_session = session is None
if own_session:
timeout = aiohttp.ClientTimeout(total=30)
session = aiohttp.ClientSession(timeout=timeout)
try:
# ImageBam requires sfw_inter=1 cookie to bypass consent page
headers = dict(self.HEADERS)
if 'imagebam' in host_page_url:
headers['Cookie'] = 'sfw_inter=1'
try:
async with session.get(host_page_url, headers=headers,
allow_redirects=True) as resp:
if resp.status != 200:
return None
page_content = await resp.text()
final_url = str(resp.url)
except Exception as e:
self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
return None
# Try handler with page content
if handler:
direct = handler.extract_direct_url(host_page_url, page_content=page_content)
if direct:
return direct
# Manual extraction fallbacks
return self._extract_direct_image_from_html(host_page_url, page_content, final_url)
finally:
if own_session:
await session.close()
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _parse_search_results(self, html_content: str) -> List[Dict]:
"""Parse XenForo search results page for thread links."""
threads = []
# Parse each contentRow block to extract title, URL, and date
for block_match in re.finditer(
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
html_content, re.DOTALL
):
block = block_match.group(1)
# Extract thread URL and title
title_match = re.search(
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
block, re.DOTALL
)
if not title_match:
continue
url = title_match.group(1)
title_raw = title_match.group(2)
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
if not title:
continue
if not url.startswith('http'):
url = self.BASE_URL + url
thread_id = self._extract_thread_id(url)
if not thread_id:
continue
# Extract date from <time datetime="..."> tag
published_at = None
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
if time_match:
published_at = time_match.group(1)
threads.append({
'thread_id': thread_id,
'title': title,
'url': url.split('#')[0].rstrip('/'),
'reply_count': 0,
'published_at': published_at,
})
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
if not threads:
for m in re.finditer(
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
html_content, re.DOTALL
):
url = m.group(1)
title_raw = m.group(2)
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
if not title:
continue
if not url.startswith('http'):
url = self.BASE_URL + url
thread_id = self._extract_thread_id(url)
if not thread_id:
continue
threads.append({
'thread_id': thread_id,
'title': title,
'url': url.split('#')[0].rstrip('/'),
'reply_count': 0,
'published_at': None,
})
# Deduplicate by thread_id
seen = set()
unique = []
for t in threads:
if t['thread_id'] not in seen:
seen.add(t['thread_id'])
unique.append(t)
return unique
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
"""Find URL for the next page of search results."""
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
m = re.search(pattern, html_content, re.IGNORECASE)
if m:
url = m.group(1)
if not url.startswith('http'):
url = self.BASE_URL + html.unescape(url)
return url
return None
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures,
# etc.). Matched case-insensitively as substrings of the URL (see _is_junk_url).
JUNK_URL_PATTERNS = [
    'giphy.com', 'tenor.com', 'gfycat.com',  # reaction GIFs
    'jsdelivr.net', 'joypixels', 'twemoji',  # emoji CDNs
    'wp-content/',  # WordPress media (blog graphics, profile pics)
    '/unicode/', '/emoji/',  # emoji paths
    'haboodadi.com',  # forum signature images
]
# Image hosts that are permanently dead (DNS gone / domain expired);
# links to these are skipped instead of resolved.
DEAD_HOSTS = [
    'someimage.com',
]
def _extract_image_links(self, page_html: str) -> List[Dict]:
"""Extract image host links from all posts on a page."""
images = []
# Find all message bodies: XenForo uses <article class="message ..."> and
# <div class="bbWrapper"> for post content
for content_match in re.finditer(
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
page_html, re.DOTALL
):
content = content_match.group(1)
# Extract links to known image hosts
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
link_url = html.unescape(link_match.group(1))
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
images.append({'url': link_url, 'host': self._identify_host(link_url)})
# Also catch direct image URLs (full-size, not thumbnails)
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
# — legitimate gallery images are posted as <a href> links to host pages
# (handled above), while inline <img> from these hosts are signatures.
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
img_url = html.unescape(img_match.group(1))
# Skip thumbnails, avatars, smilies, and junk
if any(skip in img_url.lower() for skip in [
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
'data/assets', '/styles/', 'xenforo'
]):
continue
if self._is_junk_url(img_url):
continue
# Skip inline images from known image hosts — these are signatures,
# not gallery content (gallery images come through as <a> links above)
if self._is_image_host_url(img_url):
continue
if self._is_direct_image_url(img_url):
images.append({'url': img_url, 'host': 'direct'})
return images
def _is_junk_url(self, url: str) -> bool:
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
url_lower = url.lower()
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
return True
if any(host in url_lower for host in self.DEAD_HOSTS):
return True
return False
def _is_image_host_url(self, url: str) -> bool:
"""Check if a URL belongs to a known image hosting service."""
try:
domain = urlparse(url).netloc.lower()
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
except Exception:
return False
def _is_direct_image_url(self, url: str) -> bool:
"""Check if a URL points directly to an image file."""
try:
path = urlparse(url).path.lower()
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
except Exception:
return False
def _identify_host(self, url: str) -> str:
"""Identify which image host a URL belongs to."""
handler = self._get_image_host_handler()
if handler:
host = handler.identify_host(url)
if host:
return host
# Fallback
try:
domain = urlparse(url).netloc.lower()
for host_domain in self.IMAGE_HOST_DOMAINS:
if host_domain in domain:
return host_domain.split('.')[0]
except Exception:
pass
return 'unknown'
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
    """Manually extract a direct image URL from an image host's page HTML.

    Per-host regex fallbacks used when ImageHostHandler could not resolve
    the page. Branching is keyed off the ORIGINAL URL's domain; within a
    branch, patterns are tried most-specific first. Falls through to a
    generic og:image probe at the end.

    Args:
        url: The host page URL that was fetched.
        page_content: HTML body of that page.
        final_url: URL after redirects. NOTE(review): currently unused —
            kept for interface stability; confirm before removing.

    Returns:
        The direct image URL, or None if nothing matched.
    """
    domain = urlparse(url).netloc.lower()
    # imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
    if 'imagebam' in domain:
        # Fast path: any <img> sourced from images{N}.imagebam.com.
        m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Alternative: og:image meta tag
        m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
    # pixhost: <img id="image" src="..."> or img.pixhost.to URL
    if 'pixhost' in domain:
        m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
        m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
        if m:
            return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"
    # imagetwist: <img class="pic" src="...">
    if 'imagetwist' in domain:
        m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Fallback: centered image paragraph used by some imagetwist skins.
        m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
                      page_content)
        if m:
            return html.unescape(m.group(1))
    # imgbox: <img id="img" src="..."> or src before id
    if 'imgbox' in domain:
        m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Last resort: any bare images{N}.imgbox.com URL in the page text.
        m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
        if m:
            return html.unescape(m.group(1))
    # turboimagehost: <img class="uImage" src="...">
    if 'turboimagehost' in domain:
        m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
    # acidimg: <img class="centred" src="...">
    if 'acidimg' in domain:
        m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
    # pixxxels: same pattern as acidimg
    if 'pixxxels' in domain:
        m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
    # imx.to: <img class="image-show" src="...">
    if 'imx.to' in domain:
        m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
    # Generic: og:image meta tag, accepted only if it points at an image file.
    m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
    if m:
        img_url = html.unescape(m.group(1))
        if self._is_direct_image_url(img_url):
            return img_url
    return None
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
@staticmethod
def _extract_title(page_html: str) -> Optional[str]:
"""Extract thread title from XenForo <h1 class="p-title-value">."""
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
if m:
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
title = re.sub(r'<[^>]+>', '', m.group(1))
# Clean up non-breaking spaces and extra whitespace
title = title.replace('\xa0', ' ')
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
title = re.sub(r'\s+', ' ', title).strip()
return html.unescape(title)
# Fallback: <title> — strip common XenForo site name suffixes
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
return title
return None
@staticmethod
def _extract_page_count(page_html: str) -> int:
"""Extract total page count from XenForo pagination."""
# <li class="pageNav-page"><a href="...">42</a></li>
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
if pages:
return max(int(p) for p in pages)
return 1
@staticmethod
def _extract_reply_count(page_html: str) -> int:
"""Extract reply count from XenForo thread info."""
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
if m:
return int(m.group(1).replace(',', ''))
return 0
@staticmethod
def _extract_thread_id(url: str) -> Optional[str]:
"""Extract thread ID from XenForo URL.
Handles both formats:
- /threads/title.12345/
- /index.php?threads/title.12345/
"""
m = re.search(r'threads/[^/]*?\.(\d+)', url)
if m:
return m.group(1)
# Fallback: just /threads/{id}/
m = re.search(r'threads/(\d+)', url)
if m:
return m.group(1)
return None
@staticmethod
def _build_page_url(thread_url: str, page_num: int) -> str:
"""Build paginated thread URL for XenForo.
Handles: /index.php?threads/slug.12345/page-2
"""
# Remove existing page- suffix and fragment
base = thread_url.split('#')[0].rstrip('/')
base = re.sub(r'/page-\d+$', '', base)
if page_num == 1:
return base + '/'
return f'{base}/page-{page_num}'
@staticmethod
def _get_extension(filename_or_url: str) -> str:
"""Get lowercase file extension."""
clean = filename_or_url.split('?')[0].split('#')[0]
if '.' in clean.split('/')[-1]:
return clean.rsplit('.', 1)[-1].lower()
return ''
@staticmethod
def _filename_from_url(url: str) -> str:
"""Extract filename from URL path."""
path = urlparse(url).path
name = path.rstrip('/').split('/')[-1]
return name if name else 'unnamed.jpg'