744
modules/paid_content/xenforo_forum_client.py
Normal file
744
modules/paid_content/xenforo_forum_client.py
Normal file
@@ -0,0 +1,744 @@
|
||||
"""
|
||||
Generic XenForo Forum Client for Paid Content
|
||||
|
||||
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
|
||||
treating each celebrity name as a "creator" and each matching thread as a post.
|
||||
|
||||
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
|
||||
and resolved via ImageHostHandler from forum_downloader.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse, unquote_plus
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class XenForoForumClient(LoggingMixin):
    """Generic client for scraping XenForo-based forum threads."""

    # Local FlareSolverr instance used to bypass Cloudflare when a plain
    # request returns 403 (see _fetch_via_flaresolverr).
    FLARESOLVERR_URL = 'http://localhost:8191/v1'

    # Browser-like default headers sent with every HTTP request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Lowercase extensions treated as images by _is_direct_image_url.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    # External image host domains to look for in post links
    # (substring-matched against the URL netloc in _is_image_host_url).
    IMAGE_HOST_DOMAINS = [
        'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
        'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
        'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
        'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
    ]
|
||||
|
||||
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
    """Create a client for one XenForo forum.

    Args:
        service_id: Short identifier for this forum; also used as the
            default log module name.
        base_url: Forum root URL (trailing slashes are stripped).
        cookie_path: Path to a Playwright-format JSON cookie file.
        log_callback: Optional callable forwarded to LoggingMixin.
    """
    self.SERVICE_ID = service_id
    self.BASE_URL = base_url.rstrip('/')
    self.COOKIE_PATH = cookie_path
    # LoggingMixin sets up self.log(); must run before any logging below.
    self._init_logger('PaidContent', log_callback, default_module=service_id)
    # Lazy caches: cookie dict (loaded on first use) and the
    # ImageHostHandler class (False sentinel once an import failed).
    self._cookies: Optional[Dict[str, str]] = None
    self._image_host_handler = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _load_cookies(self) -> Dict[str, str]:
|
||||
"""Load Playwright-format cookies and convert to {name: value} dict."""
|
||||
if self._cookies is not None:
|
||||
return self._cookies
|
||||
|
||||
try:
|
||||
cookie_path = Path(self.COOKIE_PATH)
|
||||
if cookie_path.exists():
|
||||
with open(cookie_path, 'r') as f:
|
||||
raw_cookies = json.load(f)
|
||||
self._cookies = {c['name']: c['value'] for c in raw_cookies}
|
||||
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
|
||||
else:
|
||||
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
|
||||
self._cookies = {}
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies: {e}", 'warning')
|
||||
self._cookies = {}
|
||||
|
||||
return self._cookies
|
||||
|
||||
def _get_cookie_header(self) -> str:
|
||||
"""Build Cookie header string from loaded cookies."""
|
||||
cookies = self._load_cookies()
|
||||
return '; '.join(f'{k}={v}' for k, v in cookies.items())
|
||||
|
||||
def _get_request_headers(self) -> Dict[str, str]:
|
||||
"""Get headers with cookies for authenticated requests."""
|
||||
headers = dict(self.HEADERS)
|
||||
cookie_str = self._get_cookie_header()
|
||||
if cookie_str:
|
||||
headers['Cookie'] = cookie_str
|
||||
return headers
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Image host handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_image_host_handler(self):
|
||||
"""Get or create ImageHostHandler instance."""
|
||||
if self._image_host_handler is None:
|
||||
try:
|
||||
from modules.forum_downloader import ImageHostHandler
|
||||
self._image_host_handler = ImageHostHandler
|
||||
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
|
||||
except ImportError:
|
||||
self.log("ImageHostHandler not available", 'warning')
|
||||
self._image_host_handler = False # sentinel to avoid retrying
|
||||
return self._image_host_handler if self._image_host_handler is not False else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch a page with cookies. Falls back to FlareSolverr on 403.

    Args:
        session: Shared aiohttp session to issue the request on.
        url: Absolute URL to fetch.

    Returns:
        Response body text, or None when the status is neither 200 nor a
        403 that FlareSolverr could recover from.
    """
    headers = self._get_request_headers()
    try:
        async with session.get(url, headers=headers, allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            if resp.status == 403:
                # 403 here usually means Cloudflare; retry via the solver.
                self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
                return await self._fetch_via_flaresolverr(url)
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        # Network/timeout errors also get one FlareSolverr attempt.
        self.log(f"Error fetching {url}: {e}", 'warning')
        return await self._fetch_via_flaresolverr(url)
|
||||
|
||||
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch a page using FlareSolverr to bypass Cloudflare.

    Creates a throwaway FlareSolverr session, performs the GET with our
    stored cookies, and always destroys the session afterwards.

    NOTE(review): uses blocking ``requests`` calls inside an async method,
    which stalls the event loop for the duration of each solve — consider
    running in an executor if this becomes a throughput problem.

    Returns:
        The solved page HTML, or None on any failure.
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None

    fs_session_id = None
    try:
        # Create session
        resp = std_requests.post(self.FLARESOLVERR_URL, json={
            'cmd': 'sessions.create'
        }, timeout=30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session", 'warning')
            return None
        fs_session_id = data.get('session')

        # Fetch page (forward our stored cookies so the forum sees us
        # as logged in even through the solver).
        cookies = self._load_cookies()
        resp = std_requests.post(self.FLARESOLVERR_URL, json={
            'cmd': 'request.get',
            'url': url,
            'session': fs_session_id,
            'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
            'maxTimeout': 60000,
        }, timeout=70)
        page_data = resp.json()
        if page_data.get('status') == 'ok':
            return page_data.get('solution', {}).get('response', '')
        self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
        return None

    except Exception as e:
        self.log(f"FlareSolverr error for {url}: {e}", 'warning')
        return None
    finally:
        # Best-effort cleanup: leaked FlareSolverr sessions hold browser
        # instances open, so always try to destroy ours.
        if fs_session_id:
            try:
                std_requests.post(self.FLARESOLVERR_URL, json={
                    'cmd': 'sessions.destroy',
                    'session': fs_session_id,
                }, timeout=10)
            except Exception:
                pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def search_threads(self, query: str) -> List[Dict]:
    """Search for threads matching a celebrity name.

    Performs a title-only XenForo search (POST to /search/search using a
    CSRF token scraped from the search form) and follows "Next" links
    through all result pages.

    Returns list of {thread_id, title, url, reply_count, published_at}.
    """
    threads = []
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        # XenForo search: POST form to /search/search
        search_url = f'{self.BASE_URL}/search/search'
        headers = self._get_request_headers()
        headers['Content-Type'] = 'application/x-www-form-urlencoded'

        # Need CSRF token - fetch search page first
        search_page_url = f'{self.BASE_URL}/search/'
        page_html = await self._fetch_page(session, search_page_url)
        if not page_html:
            self.log("Failed to fetch search page", 'warning')
            return threads

        # Extract CSRF token; an empty token is still submitted (some
        # boards accept anonymous searches without one).
        csrf_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', page_html)
        xf_token = csrf_match.group(1) if csrf_match else ''

        form_data = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',  # match thread titles only
            'order': 'date',
            '_xfToken': xf_token,
        }

        try:
            async with session.post(search_url, headers=headers, data=form_data,
                                    allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return threads
                result_html = await resp.text()
                # XenForo redirects to /search/<id>/ — keep the final URL
                # so pagination below resolves relative to it.
                result_url = str(resp.url)
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return threads

        threads = self._parse_search_results(result_html)

        # Handle search result pagination: stop at the first page that is
        # missing, empty, or has no "Next" link.
        page = 2
        while True:
            next_url = self._find_next_search_page(result_html, result_url, page)
            if not next_url:
                break
            await asyncio.sleep(0.3)  # polite delay between pages
            result_html = await self._fetch_page(session, next_url)
            if not result_html:
                break
            more = self._parse_search_results(result_html)
            if not more:
                break
            threads.extend(more)
            page += 1

    self.log(f"Search for '{query}' found {len(threads)} threads", 'info')
    return threads
|
||||
|
||||
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """Fetch page 1 of a thread and extract metadata.

    Returns {thread_id, title, reply_count, page_count, url}, or None if
    the page could not be fetched or parsing raised. Note thread_id may
    be None when the URL does not match a known XenForo thread format.
    """
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            page_html = await self._fetch_page(session, thread_url)
            if not page_html:
                return None

            title = self._extract_title(page_html)
            page_count = self._extract_page_count(page_html)
            reply_count = self._extract_reply_count(page_html)
            thread_id = self._extract_thread_id(thread_url)

            return {
                'thread_id': thread_id,
                'title': title or 'Untitled',
                'reply_count': reply_count,
                'page_count': page_count,
                # Normalized URL: no fragment, no trailing slash.
                'url': thread_url.split('#')[0].rstrip('/'),
            }
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
|
||||
|
||||
async def get_thread_images(self, thread_url: str, page_count: Optional[int] = None,
                            start_page: int = 1) -> List[Dict]:
    """Scrape all pages of a thread and extract image host links.

    Args:
        thread_url: Thread URL (page 1).
        page_count: Total pages if already known; when None, page 1 is
            fetched to determine it (and its links are collected).
        start_page: First page to walk; ignored (forced to 2) when
            page_count is None since page 1 is consumed above.

    Returns:
        List of {url, host} dicts, deduplicated by url in first-seen order.
    """
    images = []
    seen_urls: Set[str] = set()

    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # If page_count not provided, fetch page 1 to determine it
        if page_count is None:
            page1_html = await self._fetch_page(session, thread_url)
            if not page1_html:
                return images
            page_count = self._extract_page_count(page1_html)
            page_images = self._extract_image_links(page1_html)
            for img in page_images:
                if img['url'] not in seen_urls:
                    seen_urls.add(img['url'])
                    images.append(img)
            start_page = 2  # page 1 already harvested above

        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # Rate limit

            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                # A missing page likely means throttling/outage — stop
                # rather than skip, so gaps are obvious to the caller.
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break

            page_images = self._extract_image_links(page_html)
            new_count = 0
            for img in page_images:
                if img['url'] not in seen_urls:
                    seen_urls.add(img['url'])
                    images.append(img)
                    new_count += 1

            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

    self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
    return images
|
||||
|
||||
async def resolve_image_url(self, host_page_url: str,
                            session: Optional[aiohttp.ClientSession] = None) -> Optional[str]:
    """Resolve an image host page URL to a direct image URL.

    Resolution order:
      1. ImageHostHandler URL-only extraction (no network).
      2. imgbox thumbnail URL rewrite (no network).
      3. Fetch the host page, then ImageHostHandler with page content,
         then the manual per-host regex fallbacks.

    Args:
        host_page_url: Viewer-page URL on an external image host.
        session: Optional shared aiohttp session; a temporary one is
            created (and closed) when omitted.

    Returns:
        Direct image URL, or None if every strategy failed.
    """
    handler = self._get_image_host_handler()

    # Try direct extraction without fetching the page
    if handler:
        direct = handler.extract_direct_url(host_page_url)
        if direct:
            return direct

    # imgbox thumbnail → full image conversion (thumbs2 → images2)
    m = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
    if m:
        return f"https://images{m.group(1)}.imgbox.com/{m.group(2)}{m.group(3)}_o.jpg"

    # For hosts that need page content, fetch and parse
    own_session = session is None
    if own_session:
        timeout = aiohttp.ClientTimeout(total=30)
        session = aiohttp.ClientSession(timeout=timeout)

    try:
        # ImageBam requires sfw_inter=1 cookie to bypass consent page
        headers = dict(self.HEADERS)
        if 'imagebam' in host_page_url:
            headers['Cookie'] = 'sfw_inter=1'

        try:
            async with session.get(host_page_url, headers=headers,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    return None
                page_content = await resp.text()
                final_url = str(resp.url)
        except Exception as e:
            self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
            return None

        # Try handler with page content
        if handler:
            direct = handler.extract_direct_url(host_page_url, page_content=page_content)
            if direct:
                return direct

        # Manual extraction fallbacks
        return self._extract_direct_image_from_html(host_page_url, page_content, final_url)

    finally:
        # Only close sessions we created; never a caller-provided one.
        if own_session:
            await session.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _parse_search_results(self, html_content: str) -> List[Dict]:
|
||||
"""Parse XenForo search results page for thread links."""
|
||||
threads = []
|
||||
|
||||
# Parse each contentRow block to extract title, URL, and date
|
||||
for block_match in re.finditer(
|
||||
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
block = block_match.group(1)
|
||||
|
||||
# Extract thread URL and title
|
||||
title_match = re.search(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
block, re.DOTALL
|
||||
)
|
||||
if not title_match:
|
||||
continue
|
||||
|
||||
url = title_match.group(1)
|
||||
title_raw = title_match.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
|
||||
if not title:
|
||||
continue
|
||||
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
|
||||
# Extract date from <time datetime="..."> tag
|
||||
published_at = None
|
||||
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
|
||||
if time_match:
|
||||
published_at = time_match.group(1)
|
||||
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': published_at,
|
||||
})
|
||||
|
||||
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
|
||||
if not threads:
|
||||
for m in re.finditer(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
url = m.group(1)
|
||||
title_raw = m.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
if not title:
|
||||
continue
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': None,
|
||||
})
|
||||
|
||||
# Deduplicate by thread_id
|
||||
seen = set()
|
||||
unique = []
|
||||
for t in threads:
|
||||
if t['thread_id'] not in seen:
|
||||
seen.add(t['thread_id'])
|
||||
unique.append(t)
|
||||
|
||||
return unique
|
||||
|
||||
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
|
||||
"""Find URL for the next page of search results."""
|
||||
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
|
||||
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
|
||||
m = re.search(pattern, html_content, re.IGNORECASE)
|
||||
if m:
|
||||
url = m.group(1)
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + html.unescape(url)
|
||||
return url
|
||||
return None
|
||||
|
||||
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
# Matched as plain substrings against the lowercased URL in _is_junk_url.
JUNK_URL_PATTERNS = [
    'giphy.com', 'tenor.com', 'gfycat.com',  # reaction GIFs
    'jsdelivr.net', 'joypixels', 'twemoji',  # emoji CDNs
    'wp-content/',  # WordPress media (blog graphics, profile pics)
    '/unicode/', '/emoji/',  # emoji paths
    'haboodadi.com',  # forum signature images
]

# Image hosts that are permanently dead (DNS gone / domain expired) —
# skipping them avoids pointless resolve attempts and timeouts.
DEAD_HOSTS = [
    'someimage.com',
]
|
||||
|
||||
def _extract_image_links(self, page_html: str) -> List[Dict]:
    """Extract image host links from all posts on a page.

    Returns a list of {'url': ..., 'host': ...} dicts — 'host' is the
    identified image host name, or 'direct' for bare <img> URLs.
    Duplicates are NOT removed here; callers dedupe by URL.
    """
    images = []

    # Find all message bodies: XenForo uses <article class="message ..."> and
    # <div class="bbWrapper"> for post content
    for content_match in re.finditer(
        r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
        page_html, re.DOTALL
    ):
        content = content_match.group(1)

        # Extract links to known image hosts
        for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
            link_url = html.unescape(link_match.group(1))
            if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
                images.append({'url': link_url, 'host': self._identify_host(link_url)})

        # Also catch direct image URLs (full-size, not thumbnails)
        # NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
        # — legitimate gallery images are posted as <a href> links to host pages
        # (handled above), while inline <img> from these hosts are signatures.
        for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
            img_url = html.unescape(img_match.group(1))
            # Skip thumbnails, avatars, smilies, and junk
            if any(skip in img_url.lower() for skip in [
                'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
                'data/assets', '/styles/', 'xenforo'
            ]):
                continue
            if self._is_junk_url(img_url):
                continue
            # Skip inline images from known image hosts — these are signatures,
            # not gallery content (gallery images come through as <a> links above)
            if self._is_image_host_url(img_url):
                continue
            if self._is_direct_image_url(img_url):
                images.append({'url': img_url, 'host': 'direct'})

    return images
|
||||
|
||||
def _is_junk_url(self, url: str) -> bool:
|
||||
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
|
||||
url_lower = url.lower()
|
||||
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
|
||||
return True
|
||||
if any(host in url_lower for host in self.DEAD_HOSTS):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_image_host_url(self, url: str) -> bool:
|
||||
"""Check if a URL belongs to a known image hosting service."""
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _is_direct_image_url(self, url: str) -> bool:
|
||||
"""Check if a URL points directly to an image file."""
|
||||
try:
|
||||
path = urlparse(url).path.lower()
|
||||
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _identify_host(self, url: str) -> str:
|
||||
"""Identify which image host a URL belongs to."""
|
||||
handler = self._get_image_host_handler()
|
||||
if handler:
|
||||
host = handler.identify_host(url)
|
||||
if host:
|
||||
return host
|
||||
# Fallback
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
for host_domain in self.IMAGE_HOST_DOMAINS:
|
||||
if host_domain in domain:
|
||||
return host_domain.split('.')[0]
|
||||
except Exception:
|
||||
pass
|
||||
return 'unknown'
|
||||
|
||||
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
    """Manually extract direct image URL from host page HTML.

    Args:
        url: Original host page URL (used for per-host dispatch and
            thumbnail-pattern rewrites).
        page_content: Fetched HTML of the host page.
        final_url: URL after redirects. NOTE(review): currently unused —
            kept for interface stability.

    Returns:
        Direct image URL, or None when no known pattern matched.
    """
    domain = urlparse(url).netloc.lower()

    # imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
    if 'imagebam' in domain:
        m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Alternative: og:image meta tag
        m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # pixhost: <img id="image" src="..."> or img.pixhost.to URL
    if 'pixhost' in domain:
        m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
        m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
        if m:
            return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"

    # imagetwist: <img class="pic" src="...">
    if 'imagetwist' in domain:
        m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
                      page_content)
        if m:
            return html.unescape(m.group(1))

    # imgbox: <img id="img" src="..."> or src before id
    if 'imgbox' in domain:
        m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))
        m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
        if m:
            return html.unescape(m.group(1))
        # Direct image URL pattern anywhere in the page
        m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
        if m:
            return html.unescape(m.group(1))

    # turboimagehost: <img class="uImage" src="...">
    if 'turboimagehost' in domain:
        m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # acidimg: <img class="centred" src="...">
    if 'acidimg' in domain:
        m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # pixxxels: same pattern as acidimg
    if 'pixxxels' in domain:
        m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # imx.to: <img class="image-show" src="...">
    if 'imx.to' in domain:
        m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
        if m:
            return html.unescape(m.group(1))

    # Generic: try og:image meta tag, but only accept it when it already
    # looks like a direct image file (avoids preview/thumbnail pages).
    m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
    if m:
        img_url = html.unescape(m.group(1))
        if self._is_direct_image_url(img_url):
            return img_url

    return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from XenForo <h1 class="p-title-value">."""
|
||||
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
|
||||
if m:
|
||||
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
|
||||
title = re.sub(r'<[^>]+>', '', m.group(1))
|
||||
# Clean up non-breaking spaces and extra whitespace
|
||||
title = title.replace('\xa0', ' ')
|
||||
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
|
||||
title = re.sub(r'\s+', ' ', title).strip()
|
||||
return html.unescape(title)
|
||||
# Fallback: <title> — strip common XenForo site name suffixes
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from XenForo pagination."""
|
||||
# <li class="pageNav-page"><a href="...">42</a></li>
|
||||
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
|
||||
if pages:
|
||||
return max(int(p) for p in pages)
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _extract_reply_count(page_html: str) -> int:
|
||||
"""Extract reply count from XenForo thread info."""
|
||||
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
|
||||
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
|
||||
if m:
|
||||
return int(m.group(1).replace(',', ''))
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _extract_thread_id(url: str) -> Optional[str]:
|
||||
"""Extract thread ID from XenForo URL.
|
||||
|
||||
Handles both formats:
|
||||
- /threads/title.12345/
|
||||
- /index.php?threads/title.12345/
|
||||
"""
|
||||
m = re.search(r'threads/[^/]*?\.(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
# Fallback: just /threads/{id}/
|
||||
m = re.search(r'threads/(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _build_page_url(thread_url: str, page_num: int) -> str:
|
||||
"""Build paginated thread URL for XenForo.
|
||||
|
||||
Handles: /index.php?threads/slug.12345/page-2
|
||||
"""
|
||||
# Remove existing page- suffix and fragment
|
||||
base = thread_url.split('#')[0].rstrip('/')
|
||||
base = re.sub(r'/page-\d+$', '', base)
|
||||
if page_num == 1:
|
||||
return base + '/'
|
||||
return f'{base}/page-{page_num}'
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
|
||||
"""Get lowercase file extension."""
|
||||
clean = filename_or_url.split('?')[0].split('#')[0]
|
||||
if '.' in clean.split('/')[-1]:
|
||||
return clean.rsplit('.', 1)[-1].lower()
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
|
||||
"""Extract filename from URL path."""
|
||||
path = urlparse(url).path
|
||||
name = path.rstrip('/').split('/')[-1]
|
||||
return name if name else 'unnamed.jpg'
|
||||
Reference in New Issue
Block a user