"""
|
|
Generic XenForo Forum Client for Paid Content
|
|
|
|
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
|
|
treating each celebrity name as a "creator" and each matching thread as a post.
|
|
|
|
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
|
|
and resolved via ImageHostHandler from forum_downloader.
|
|
"""
|
|
|
|
import asyncio
|
|
import html
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set
|
|
from urllib.parse import urlparse, unquote_plus
|
|
|
|
import aiohttp
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Post, Attachment
|
|
|
|
|
|
class XenForoForumClient(LoggingMixin):
    """Generic client for scraping XenForo-based forum threads.

    Searches a XenForo forum for threads by keyword, scrapes thread pages
    for links to external image hosts, and resolves those host pages to
    direct image URLs (via ImageHostHandler where available, with manual
    regex fallbacks). Authentication uses Playwright-format cookies;
    Cloudflare-protected pages fall back to a local FlareSolverr instance.
    """
|
|
|
    # Local FlareSolverr endpoint used to bypass Cloudflare challenges
    # (see _fetch_via_flaresolverr).
    FLARESOLVERR_URL = 'http://localhost:8191/v1'

    # Browser-like baseline headers sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Lowercase file extensions (no dot) treated as image files
    # by _is_direct_image_url / _get_extension.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    # External image host domains to look for in post links
    IMAGE_HOST_DOMAINS = [
        'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
        'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
        'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
        'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
    ]
|
|
|
|
    def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
        """Set up a client for one XenForo forum.

        Args:
            service_id: Stable identifier for this forum; also used as the
                logger's default module name.
            base_url: Forum root URL; a trailing slash is stripped.
            cookie_path: Path to a Playwright-format cookie JSON file.
            log_callback: Optional callable forwarded to the logging mixin.
        """
        self.SERVICE_ID = service_id
        self.BASE_URL = base_url.rstrip('/')
        self.COOKIE_PATH = cookie_path
        self._init_logger('PaidContent', log_callback, default_module=service_id)
        # Lazily-populated caches:
        self._cookies: Optional[Dict[str, str]] = None  # {name: value} once loaded
        self._image_host_handler = None  # handler class, False sentinel, or None (not tried)
|
|
|
# ------------------------------------------------------------------
|
|
# Cookie handling
|
|
# ------------------------------------------------------------------
|
|
|
|
def _load_cookies(self) -> Dict[str, str]:
|
|
"""Load Playwright-format cookies and convert to {name: value} dict."""
|
|
if self._cookies is not None:
|
|
return self._cookies
|
|
|
|
try:
|
|
cookie_path = Path(self.COOKIE_PATH)
|
|
if cookie_path.exists():
|
|
with open(cookie_path, 'r') as f:
|
|
raw_cookies = json.load(f)
|
|
self._cookies = {c['name']: c['value'] for c in raw_cookies}
|
|
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
|
|
else:
|
|
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
|
|
self._cookies = {}
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies: {e}", 'warning')
|
|
self._cookies = {}
|
|
|
|
return self._cookies
|
|
|
|
def _get_cookie_header(self) -> str:
|
|
"""Build Cookie header string from loaded cookies."""
|
|
cookies = self._load_cookies()
|
|
return '; '.join(f'{k}={v}' for k, v in cookies.items())
|
|
|
|
def _get_request_headers(self) -> Dict[str, str]:
|
|
"""Get headers with cookies for authenticated requests."""
|
|
headers = dict(self.HEADERS)
|
|
cookie_str = self._get_cookie_header()
|
|
if cookie_str:
|
|
headers['Cookie'] = cookie_str
|
|
return headers
|
|
|
|
# ------------------------------------------------------------------
|
|
# Image host handling
|
|
# ------------------------------------------------------------------
|
|
|
|
def _get_image_host_handler(self):
|
|
"""Get or create ImageHostHandler instance."""
|
|
if self._image_host_handler is None:
|
|
try:
|
|
from modules.forum_downloader import ImageHostHandler
|
|
self._image_host_handler = ImageHostHandler
|
|
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
|
|
except ImportError:
|
|
self.log("ImageHostHandler not available", 'warning')
|
|
self._image_host_handler = False # sentinel to avoid retrying
|
|
return self._image_host_handler if self._image_host_handler is not False else None
|
|
|
|
# ------------------------------------------------------------------
|
|
# HTTP helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
|
|
"""Fetch a page with cookies. Falls back to FlareSolverr on 403."""
|
|
headers = self._get_request_headers()
|
|
try:
|
|
async with session.get(url, headers=headers, allow_redirects=True) as resp:
|
|
if resp.status == 200:
|
|
return await resp.text()
|
|
if resp.status == 403:
|
|
self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
|
|
return await self._fetch_via_flaresolverr(url)
|
|
self.log(f"HTTP {resp.status} for {url}", 'warning')
|
|
return None
|
|
except Exception as e:
|
|
self.log(f"Error fetching {url}: {e}", 'warning')
|
|
return await self._fetch_via_flaresolverr(url)
|
|
|
|
    async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
        """Fetch a page using FlareSolverr to bypass Cloudflare.

        Creates a throwaway FlareSolverr session, issues a ``request.get``
        with our forum cookies forwarded, and destroys the session in a
        ``finally`` block so browser instances don't leak. Returns the
        solved page HTML, or None on any failure.

        NOTE(review): this uses the synchronous ``requests`` library, so
        it blocks the event loop for the duration of the call (up to ~70s
        per request) — confirm that's acceptable for the calling context.
        """
        try:
            # Imported lazily so the module still works without `requests`.
            import requests as std_requests
        except ImportError:
            self.log("requests library not available for FlareSolverr", 'warning')
            return None

        fs_session_id = None
        try:
            # Create session
            resp = std_requests.post(self.FLARESOLVERR_URL, json={
                'cmd': 'sessions.create'
            }, timeout=30)
            data = resp.json()
            if data.get('status') != 'ok':
                self.log("Failed to create FlareSolverr session", 'warning')
                return None
            fs_session_id = data.get('session')

            # Fetch page, forwarding our forum auth cookies to the
            # FlareSolverr-driven browser
            cookies = self._load_cookies()
            resp = std_requests.post(self.FLARESOLVERR_URL, json={
                'cmd': 'request.get',
                'url': url,
                'session': fs_session_id,
                'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
                'maxTimeout': 60000,  # ms FlareSolverr may spend solving the challenge
            }, timeout=70)
            page_data = resp.json()
            if page_data.get('status') == 'ok':
                return page_data.get('solution', {}).get('response', '')
            self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
            return None

        except Exception as e:
            self.log(f"FlareSolverr error for {url}: {e}", 'warning')
            return None
        finally:
            # Best-effort cleanup: destroy the session even on failure so
            # FlareSolverr doesn't accumulate headless browser instances.
            if fs_session_id:
                try:
                    std_requests.post(self.FLARESOLVERR_URL, json={
                        'cmd': 'sessions.destroy',
                        'session': fs_session_id,
                    }, timeout=10)
                except Exception:
                    pass
|
|
|
# ------------------------------------------------------------------
|
|
# Public API
|
|
# ------------------------------------------------------------------
|
|
|
|
    async def search_threads(self, query: str) -> List[Dict]:
        """Search the forum for threads matching a celebrity name.

        Flow: fetch /search/ to scrape the CSRF token, POST the search
        form (title-only, newest first), parse the first result page,
        then follow "Next" pagination links until exhausted.

        Returns a list of
        {thread_id, title, url, reply_count, published_at} dicts;
        reply_count is always 0 here (search results don't expose it) and
        published_at is the <time datetime> string or None.
        """
        threads = []
        timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            # XenForo search: POST form to /search/search
            search_url = f'{self.BASE_URL}/search/search'
            headers = self._get_request_headers()
            headers['Content-Type'] = 'application/x-www-form-urlencoded'

            # Need CSRF token - fetch search page first
            search_page_url = f'{self.BASE_URL}/search/'
            page_html = await self._fetch_page(session, search_page_url)
            if not page_html:
                self.log("Failed to fetch search page", 'warning')
                return threads

            # Extract CSRF token (falls back to empty string; some forums
            # accept guest searches without one)
            csrf_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', page_html)
            xf_token = csrf_match.group(1) if csrf_match else ''

            form_data = {
                'keywords': query,
                'search_type': 'post',
                'c[title_only]': '1',  # match thread titles only
                'order': 'date',
                '_xfToken': xf_token,
            }

            try:
                # allow_redirects: XenForo redirects to /search/<id>/ results
                async with session.post(search_url, headers=headers, data=form_data,
                                        allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Search returned HTTP {resp.status}", 'warning')
                        return threads
                    result_html = await resp.text()
                    result_url = str(resp.url)
            except Exception as e:
                self.log(f"Search failed: {e}", 'error')
                return threads

            threads = self._parse_search_results(result_html)

            # Handle search result pagination by following the "Next" link
            # embedded in each page (the page counter is informational;
            # _find_next_search_page relies on the HTML alone).
            page = 2
            while True:
                next_url = self._find_next_search_page(result_html, result_url, page)
                if not next_url:
                    break
                await asyncio.sleep(0.3)  # be polite between page fetches
                result_html = await self._fetch_page(session, next_url)
                if not result_html:
                    break
                more = self._parse_search_results(result_html)
                if not more:
                    break
                threads.extend(more)
                page += 1

        self.log(f"Search for '{query}' found {len(threads)} threads", 'info')
        return threads
|
|
|
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
|
|
"""Fetch page 1 of a thread and extract metadata.
|
|
|
|
Returns {thread_id, title, reply_count, page_count, url}.
|
|
"""
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
try:
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
page_html = await self._fetch_page(session, thread_url)
|
|
if not page_html:
|
|
return None
|
|
|
|
title = self._extract_title(page_html)
|
|
page_count = self._extract_page_count(page_html)
|
|
reply_count = self._extract_reply_count(page_html)
|
|
thread_id = self._extract_thread_id(thread_url)
|
|
|
|
return {
|
|
'thread_id': thread_id,
|
|
'title': title or 'Untitled',
|
|
'reply_count': reply_count,
|
|
'page_count': page_count,
|
|
'url': thread_url.split('#')[0].rstrip('/'),
|
|
}
|
|
except Exception as e:
|
|
self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
|
|
return None
|
|
|
|
    async def get_thread_images(self, thread_url: str, page_count: Optional[int] = None,
                                start_page: int = 1) -> List[Dict]:
        """Scrape pages of a thread and extract image host links.

        Args:
            thread_url: Canonical thread URL (page 1).
            page_count: Total page count, if already known (e.g. from
                get_thread_info). When None, page 1 is fetched to detect
                it, and that page's links are harvested at the same time.
            start_page: First page to scrape. Ignored when page_count is
                None (page 1 is always fetched then, and scraping resumes
                at page 2).

        Returns:
            List of {'url': ..., 'host': ...} dicts, deduplicated by URL
            in first-seen order.
        """
        images = []
        seen_urls: Set[str] = set()  # dedupe across pages

        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # If page_count not provided, fetch page 1 to determine it
            if page_count is None:
                page1_html = await self._fetch_page(session, thread_url)
                if not page1_html:
                    return images
                page_count = self._extract_page_count(page1_html)
                page_images = self._extract_image_links(page1_html)
                for img in page_images:
                    if img['url'] not in seen_urls:
                        seen_urls.add(img['url'])
                        images.append(img)
                # Page 1 already harvested above, so resume at page 2
                start_page = 2

            for page_num in range(start_page, page_count + 1):
                page_url = self._build_page_url(thread_url, page_num)
                await asyncio.sleep(0.5)  # Rate limit

                page_html = await self._fetch_page(session, page_url)
                if not page_html:
                    # Abort rather than skip: a mid-thread failure usually
                    # means auth/Cloudflare trouble, not one bad page.
                    self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                    break

                page_images = self._extract_image_links(page_html)
                new_count = 0
                for img in page_images:
                    if img['url'] not in seen_urls:
                        seen_urls.add(img['url'])
                        images.append(img)
                        new_count += 1

                self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

        self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
        return images
|
|
|
    async def resolve_image_url(self, host_page_url: str, session: Optional[aiohttp.ClientSession] = None) -> Optional[str]:
        """Resolve an image host page URL to a direct image URL.

        Resolution order:
          1. ImageHostHandler.extract_direct_url on the URL alone;
          2. fetch-free URL rewrites (imgbox thumbnail → full image);
          3. fetch the host page and retry the handler with its HTML;
          4. manual per-host HTML extraction fallbacks.

        If *session* is None a temporary ClientSession is created and is
        always closed before returning; a caller-provided session is left
        open. Returns None when the URL cannot be resolved.
        """
        handler = self._get_image_host_handler()

        # Try direct extraction without fetching the page
        if handler:
            direct = handler.extract_direct_url(host_page_url)
            if direct:
                return direct

        # imgbox thumbnail → full image conversion (thumbs2 → images2).
        # NOTE(review): always emits an _o.jpg URL regardless of the
        # thumbnail's extension — confirm imgbox serves jpg for all.
        m = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
        if m:
            return f"https://images{m.group(1)}.imgbox.com/{m.group(2)}{m.group(3)}_o.jpg"

        # For hosts that need page content, fetch and parse
        own_session = session is None
        if own_session:
            timeout = aiohttp.ClientTimeout(total=30)
            session = aiohttp.ClientSession(timeout=timeout)

        try:
            # ImageBam requires sfw_inter=1 cookie to bypass consent page
            headers = dict(self.HEADERS)
            if 'imagebam' in host_page_url:
                headers['Cookie'] = 'sfw_inter=1'

            try:
                async with session.get(host_page_url, headers=headers,
                                       allow_redirects=True) as resp:
                    if resp.status != 200:
                        return None
                    page_content = await resp.text()
                    final_url = str(resp.url)  # post-redirect URL, passed to fallbacks
            except Exception as e:
                self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
                return None

            # Try handler with page content
            if handler:
                direct = handler.extract_direct_url(host_page_url, page_content=page_content)
                if direct:
                    return direct

            # Manual extraction fallbacks
            return self._extract_direct_image_from_html(host_page_url, page_content, final_url)

        finally:
            # Only close sessions we created ourselves.
            if own_session:
                await session.close()
|
|
|
# ------------------------------------------------------------------
|
|
# HTML parsing helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
def _parse_search_results(self, html_content: str) -> List[Dict]:
|
|
"""Parse XenForo search results page for thread links."""
|
|
threads = []
|
|
|
|
# Parse each contentRow block to extract title, URL, and date
|
|
for block_match in re.finditer(
|
|
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
|
|
html_content, re.DOTALL
|
|
):
|
|
block = block_match.group(1)
|
|
|
|
# Extract thread URL and title
|
|
title_match = re.search(
|
|
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
|
block, re.DOTALL
|
|
)
|
|
if not title_match:
|
|
continue
|
|
|
|
url = title_match.group(1)
|
|
title_raw = title_match.group(2)
|
|
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
|
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
|
|
|
if not title:
|
|
continue
|
|
|
|
if not url.startswith('http'):
|
|
url = self.BASE_URL + url
|
|
|
|
thread_id = self._extract_thread_id(url)
|
|
if not thread_id:
|
|
continue
|
|
|
|
# Extract date from <time datetime="..."> tag
|
|
published_at = None
|
|
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
|
|
if time_match:
|
|
published_at = time_match.group(1)
|
|
|
|
threads.append({
|
|
'thread_id': thread_id,
|
|
'title': title,
|
|
'url': url.split('#')[0].rstrip('/'),
|
|
'reply_count': 0,
|
|
'published_at': published_at,
|
|
})
|
|
|
|
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
|
|
if not threads:
|
|
for m in re.finditer(
|
|
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
|
html_content, re.DOTALL
|
|
):
|
|
url = m.group(1)
|
|
title_raw = m.group(2)
|
|
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
|
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
|
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
|
if not title:
|
|
continue
|
|
if not url.startswith('http'):
|
|
url = self.BASE_URL + url
|
|
thread_id = self._extract_thread_id(url)
|
|
if not thread_id:
|
|
continue
|
|
threads.append({
|
|
'thread_id': thread_id,
|
|
'title': title,
|
|
'url': url.split('#')[0].rstrip('/'),
|
|
'reply_count': 0,
|
|
'published_at': None,
|
|
})
|
|
|
|
# Deduplicate by thread_id
|
|
seen = set()
|
|
unique = []
|
|
for t in threads:
|
|
if t['thread_id'] not in seen:
|
|
seen.add(t['thread_id'])
|
|
unique.append(t)
|
|
|
|
return unique
|
|
|
|
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
|
|
"""Find URL for the next page of search results."""
|
|
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
|
|
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
|
|
m = re.search(pattern, html_content, re.IGNORECASE)
|
|
if m:
|
|
url = m.group(1)
|
|
if not url.startswith('http'):
|
|
url = self.BASE_URL + html.unescape(url)
|
|
return url
|
|
return None
|
|
|
|
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
|
|
JUNK_URL_PATTERNS = [
|
|
'giphy.com', 'tenor.com', 'gfycat.com', # reaction GIFs
|
|
'jsdelivr.net', 'joypixels', 'twemoji', # emoji CDNs
|
|
'wp-content/', # WordPress media (blog graphics, profile pics)
|
|
'/unicode/', '/emoji/', # emoji paths
|
|
'haboodadi.com', # forum signature images
|
|
]
|
|
|
|
# Image hosts that are permanently dead (DNS gone / domain expired)
|
|
DEAD_HOSTS = [
|
|
'someimage.com',
|
|
]
|
|
|
|
def _extract_image_links(self, page_html: str) -> List[Dict]:
|
|
"""Extract image host links from all posts on a page."""
|
|
images = []
|
|
|
|
# Find all message bodies: XenForo uses <article class="message ..."> and
|
|
# <div class="bbWrapper"> for post content
|
|
for content_match in re.finditer(
|
|
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
|
|
page_html, re.DOTALL
|
|
):
|
|
content = content_match.group(1)
|
|
|
|
# Extract links to known image hosts
|
|
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
|
|
link_url = html.unescape(link_match.group(1))
|
|
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
|
|
images.append({'url': link_url, 'host': self._identify_host(link_url)})
|
|
|
|
# Also catch direct image URLs (full-size, not thumbnails)
|
|
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
|
|
# — legitimate gallery images are posted as <a href> links to host pages
|
|
# (handled above), while inline <img> from these hosts are signatures.
|
|
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
|
|
img_url = html.unescape(img_match.group(1))
|
|
# Skip thumbnails, avatars, smilies, and junk
|
|
if any(skip in img_url.lower() for skip in [
|
|
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
|
|
'data/assets', '/styles/', 'xenforo'
|
|
]):
|
|
continue
|
|
if self._is_junk_url(img_url):
|
|
continue
|
|
# Skip inline images from known image hosts — these are signatures,
|
|
# not gallery content (gallery images come through as <a> links above)
|
|
if self._is_image_host_url(img_url):
|
|
continue
|
|
if self._is_direct_image_url(img_url):
|
|
images.append({'url': img_url, 'host': 'direct'})
|
|
|
|
return images
|
|
|
|
def _is_junk_url(self, url: str) -> bool:
|
|
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
|
|
url_lower = url.lower()
|
|
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
|
|
return True
|
|
if any(host in url_lower for host in self.DEAD_HOSTS):
|
|
return True
|
|
return False
|
|
|
|
def _is_image_host_url(self, url: str) -> bool:
|
|
"""Check if a URL belongs to a known image hosting service."""
|
|
try:
|
|
domain = urlparse(url).netloc.lower()
|
|
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
|
|
except Exception:
|
|
return False
|
|
|
|
def _is_direct_image_url(self, url: str) -> bool:
|
|
"""Check if a URL points directly to an image file."""
|
|
try:
|
|
path = urlparse(url).path.lower()
|
|
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
|
|
except Exception:
|
|
return False
|
|
|
|
def _identify_host(self, url: str) -> str:
|
|
"""Identify which image host a URL belongs to."""
|
|
handler = self._get_image_host_handler()
|
|
if handler:
|
|
host = handler.identify_host(url)
|
|
if host:
|
|
return host
|
|
# Fallback
|
|
try:
|
|
domain = urlparse(url).netloc.lower()
|
|
for host_domain in self.IMAGE_HOST_DOMAINS:
|
|
if host_domain in domain:
|
|
return host_domain.split('.')[0]
|
|
except Exception:
|
|
pass
|
|
return 'unknown'
|
|
|
|
    def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
        """Manually extract a direct image URL from a host page's HTML.

        Host-specific regexes are selected by the domain of *url* and
        tried in order; a generic og:image check runs last. The patterns
        and their ordering are empirically tuned per host — preserve the
        order when editing.

        NOTE(review): *final_url* (the post-redirect URL from the fetch)
        is currently unused by every branch — confirm whether it can be
        dropped or is kept for future redirect-aware hosts.
        """
        domain = urlparse(url).netloc.lower()

        # imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
        if 'imagebam' in domain:
            # Fast path: any <img> pointing at the images*.imagebam.com CDN
            m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Alternative: og:image meta tag
            m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # pixhost: <img id="image" src="..."> or img.pixhost.to URL
        if 'pixhost' in domain:
            m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
            m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
            if m:
                return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"

        # imagetwist: <img class="pic" src="...">
        if 'imagetwist' in domain:
            m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Older template: centered <p> wrapping the full-size <img>
            m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
                          page_content)
            if m:
                return html.unescape(m.group(1))

        # imgbox: <img id="img" src="..."> or src before id
        if 'imgbox' in domain:
            m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Attribute order varies between templates
            m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
            if m:
                return html.unescape(m.group(1))
            # Direct image URL pattern anywhere in the page
            m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
            if m:
                return html.unescape(m.group(1))

        # turboimagehost: <img class="uImage" src="...">
        if 'turboimagehost' in domain:
            m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # acidimg: <img class="centred" src="...">
        if 'acidimg' in domain:
            m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # pixxxels: same pattern as acidimg
        if 'pixxxels' in domain:
            m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # imx.to: <img class="image-show" src="...">
        if 'imx.to' in domain:
            m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
            if m:
                return html.unescape(m.group(1))

        # Generic fallback: og:image meta tag, accepted only when it looks
        # like a direct image file (avoids preview/banner images)
        m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
        if m:
            img_url = html.unescape(m.group(1))
            if self._is_direct_image_url(img_url):
                return img_url

        return None
|
|
|
# ------------------------------------------------------------------
|
|
# Utility helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def _extract_title(page_html: str) -> Optional[str]:
|
|
"""Extract thread title from XenForo <h1 class="p-title-value">."""
|
|
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
|
|
if m:
|
|
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
|
|
title = re.sub(r'<[^>]+>', '', m.group(1))
|
|
# Clean up non-breaking spaces and extra whitespace
|
|
title = title.replace('\xa0', ' ')
|
|
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
|
|
title = re.sub(r'\s+', ' ', title).strip()
|
|
return html.unescape(title)
|
|
# Fallback: <title> — strip common XenForo site name suffixes
|
|
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
|
if m:
|
|
title = html.unescape(m.group(1).strip())
|
|
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
|
|
return title
|
|
return None
|
|
|
|
@staticmethod
|
|
def _extract_page_count(page_html: str) -> int:
|
|
"""Extract total page count from XenForo pagination."""
|
|
# <li class="pageNav-page"><a href="...">42</a></li>
|
|
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
|
|
if pages:
|
|
return max(int(p) for p in pages)
|
|
return 1
|
|
|
|
@staticmethod
|
|
def _extract_reply_count(page_html: str) -> int:
|
|
"""Extract reply count from XenForo thread info."""
|
|
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
|
|
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
|
|
if m:
|
|
return int(m.group(1).replace(',', ''))
|
|
return 0
|
|
|
|
@staticmethod
|
|
def _extract_thread_id(url: str) -> Optional[str]:
|
|
"""Extract thread ID from XenForo URL.
|
|
|
|
Handles both formats:
|
|
- /threads/title.12345/
|
|
- /index.php?threads/title.12345/
|
|
"""
|
|
m = re.search(r'threads/[^/]*?\.(\d+)', url)
|
|
if m:
|
|
return m.group(1)
|
|
# Fallback: just /threads/{id}/
|
|
m = re.search(r'threads/(\d+)', url)
|
|
if m:
|
|
return m.group(1)
|
|
return None
|
|
|
|
@staticmethod
|
|
def _build_page_url(thread_url: str, page_num: int) -> str:
|
|
"""Build paginated thread URL for XenForo.
|
|
|
|
Handles: /index.php?threads/slug.12345/page-2
|
|
"""
|
|
# Remove existing page- suffix and fragment
|
|
base = thread_url.split('#')[0].rstrip('/')
|
|
base = re.sub(r'/page-\d+$', '', base)
|
|
if page_num == 1:
|
|
return base + '/'
|
|
return f'{base}/page-{page_num}'
|
|
|
|
@staticmethod
|
|
def _get_extension(filename_or_url: str) -> str:
|
|
"""Get lowercase file extension."""
|
|
clean = filename_or_url.split('?')[0].split('#')[0]
|
|
if '.' in clean.split('/')[-1]:
|
|
return clean.rsplit('.', 1)[-1].lower()
|
|
return ''
|
|
|
|
@staticmethod
|
|
def _filename_from_url(url: str) -> str:
|
|
"""Extract filename from URL path."""
|
|
path = urlparse(url).path
|
|
name = path.rstrip('/').split('/')[-1]
|
|
return name if name else 'unnamed.jpg'
|