"""
|
|
Coppermine Gallery scraper client.
|
|
|
|
Coppermine is a PHP photo gallery with a nested structure:
|
|
categories > sub-categories > albums > photos
|
|
|
|
One album maps to one Post with N Attachments.
|
|
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
|
|
"""
|
|
|
|
import asyncio
import html
import re
from datetime import datetime
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs

import aiohttp

from modules.base_module import LoggingMixin
from .models import Post, Attachment

|
class CoppermineClient(LoggingMixin):
    """Scraper client for Coppermine PHP photo galleries.

    Crawls categories -> sub-categories -> albums and maps each album
    to one Post carrying its photos as Attachments.
    """

    # Identifiers stamped on every Post built by this client.
    SERVICE_ID = 'coppermine'
    PLATFORM = 'coppermine'

    # Browser-like request headers sent with every fetch (see _fetch_page).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # File extensions classified as 'image' when building Attachments.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    def __init__(self, log_callback=None):
        """Initialize logging via LoggingMixin.

        Args:
            log_callback: Optional callable forwarded to the logger setup.
        """
        self._init_logger('PaidContent', log_callback, default_module='Coppermine')
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
|
|
"""Fetch gallery root and extract profile metadata.
|
|
|
|
Args:
|
|
gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)
|
|
|
|
Returns:
|
|
Dict with username, display_name, post_count, gallery_url or None on failure
|
|
"""
|
|
root_url = self._build_url(gallery_url, 'index.php')
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
try:
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
html = await self._fetch_page(session, root_url)
|
|
if not html:
|
|
return None
|
|
|
|
# Extract site title from <title> tag
|
|
title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
|
|
site_title = title_match.group(1).strip() if title_match else 'Coppermine Gallery'
|
|
# Clean HTML entities
|
|
site_title = re.sub(r'&', '&', site_title)
|
|
site_title = re.sub(r'<', '<', site_title)
|
|
site_title = re.sub(r'>', '>', site_title)
|
|
site_title = re.sub(r'&#\d+;', '', site_title)
|
|
site_title = re.sub(r'&\w+;', '', site_title)
|
|
|
|
# Try to extract stats: "N files in M albums"
|
|
total_files = 0
|
|
total_albums = 0
|
|
stats_match = re.search(
|
|
r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
|
|
html, re.IGNORECASE
|
|
)
|
|
if stats_match:
|
|
total_files = int(stats_match.group(1).replace(',', ''))
|
|
total_albums = int(stats_match.group(2).replace(',', ''))
|
|
|
|
# Use domain as username
|
|
parsed = urlparse(gallery_url)
|
|
domain = parsed.netloc.replace('www.', '')
|
|
|
|
return {
|
|
'username': domain,
|
|
'display_name': site_title,
|
|
'post_count': total_albums,
|
|
'gallery_url': gallery_url,
|
|
}
|
|
except Exception as e:
|
|
self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
|
|
return None
|
|
|
|
    async def get_posts(self, gallery_url: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None,
                        post_callback=None):
        """Crawl the gallery, yielding new albums as Post objects incrementally.

        Phase 1: Fetch root, extract top-level category links
        Phase 2: Recursively crawl categories until album links found
        Phase 3: For each album, fetch thumbnails and call post_callback immediately

        Args:
            gallery_url: Base gallery URL
            known_post_ids: Set of post IDs already in DB (album_NNN)
            progress_callback: Called with status message strings
            post_callback: async callable(post) — called for each album as it's fetched.
                If provided, posts are streamed instead of collected.

        Returns:
            List of Post objects when post_callback is None ([] on failure);
            None when streaming via post_callback.
        """
        known = known_post_ids or set()
        # No total timeout: a large gallery crawl can legitimately run long;
        # only individual connect/read operations are bounded.
        timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
        posts_collected = [] if post_callback is None else None

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # Phase 1: Get all category links from root
                root_url = self._build_url(gallery_url, 'index.php')
                root_html = await self._fetch_page(session, root_url)
                if not root_html:
                    self.log("Failed to fetch gallery root", 'error')
                    return [] if post_callback is None else None

                category_ids = self._extract_category_ids(root_html)
                self.log(f"Found {len(category_ids)} top-level categories", 'info')

                if progress_callback:
                    progress_callback(f'Found {len(category_ids)} categories, crawling...')

                # Phase 2: Recursively crawl categories to find album IDs.
                # visited_cats is shared across calls so a category reachable
                # from multiple parents is only crawled once.
                album_ids = set()
                visited_cats = set()
                for cat_id in category_ids:
                    new_albums = await self._crawl_category(
                        session, gallery_url, cat_id, visited_cats, known, progress_callback
                    )
                    album_ids.update(new_albums)

                # Filter out known albums (post_id format is "album_<id>")
                new_album_ids = {aid for aid in album_ids
                                 if f"album_{aid}" not in known}

                self.log(f"Found {len(new_album_ids)} new albums "
                         f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)",
                         'info')

                if progress_callback:
                    progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...')

                # Phase 3: Fetch each new album and deliver Post objects
                parsed = urlparse(gallery_url)
                domain = parsed.netloc.replace('www.', '')
                fetched = 0

                for i, album_id in enumerate(sorted(new_album_ids)):
                    # Progress update every 5 albums to avoid callback spam.
                    if progress_callback and (i + 1) % 5 == 0:
                        progress_callback(
                            f'Fetching album {i + 1}/{len(new_album_ids)}...'
                        )

                    post = await self._fetch_album(session, gallery_url, album_id, domain)
                    # Albums that yield no attachments are skipped entirely.
                    if post and post.attachments:
                        fetched += 1
                        if post_callback:
                            await post_callback(post)
                        else:
                            posts_collected.append(post)

                    # Rate limit: 2s between album fetches
                    await asyncio.sleep(2)

                self.log(f"Fetched {fetched} albums with attachments", 'info')
                return posts_collected

        except Exception as e:
            self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
            return [] if post_callback is None else None
# ------------------------------------------------------------------
|
|
# Internal helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
def _build_url(self, gallery_url: str, page: str) -> str:
|
|
"""Build a full URL from the gallery base and a page name."""
|
|
base = gallery_url.rstrip('/')
|
|
return f"{base}/{page}"
|
|
|
|
    async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                          max_retries: int = 3) -> Optional[str]:
        """Fetch a page and return its HTML text, or None on failure.

        Retries with linear backoff on connection errors / server
        disconnects. HTTP 429 waits and retries; any other non-200
        status gives up immediately.

        Args:
            session: Shared aiohttp session.
            url: Absolute URL to fetch.
            max_retries: Attempts before giving up (default 3).
        """
        for attempt in range(max_retries):
            try:
                async with session.get(url, headers=self.HEADERS) as resp:
                    if resp.status == 429:
                        # Wait longer on each successive rate-limit hit (5s, 10s, 15s).
                        wait = 5 * (attempt + 1)
                        self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
                        await asyncio.sleep(wait)
                        continue
                    if resp.status != 200:
                        # Non-retryable status (403, 404, 500, ...): give up now.
                        self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                        return None
                    return await resp.text()
            except (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                    aiohttp.ClientPayloadError, ConnectionResetError) as e:
                # Transient transport errors: retry with increasing delay (3s, 6s, ...).
                wait = 3 * (attempt + 1)
                if attempt < max_retries - 1:
                    self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}",
                             'warning')
                    await asyncio.sleep(wait)
                else:
                    self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                    return None
            except Exception as e:
                # Anything else (bad URL, decode failure, ...) is not retried.
                self.log(f"Error fetching {url}: {e}", 'warning')
                return None
        # All attempts were consumed by 429 responses.
        return None
def _extract_category_ids(self, html: str) -> List[str]:
|
|
"""Extract category IDs from index.php page.
|
|
|
|
Looks for links like: index.php?cat=N
|
|
"""
|
|
cat_ids = []
|
|
seen = set()
|
|
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
|
|
cat_id = match.group(1)
|
|
if cat_id not in seen:
|
|
seen.add(cat_id)
|
|
cat_ids.append(cat_id)
|
|
return cat_ids
|
|
|
|
def _extract_album_ids(self, html: str) -> List[str]:
|
|
"""Extract album IDs from a category page.
|
|
|
|
Looks for links like: thumbnails.php?album=N
|
|
"""
|
|
album_ids = []
|
|
seen = set()
|
|
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
|
|
album_id = match.group(1)
|
|
if album_id not in seen:
|
|
seen.add(album_id)
|
|
album_ids.append(album_id)
|
|
return album_ids
|
|
|
|
def _extract_page_count(self, html: str) -> int:
|
|
"""Extract total page count from Coppermine pagination text.
|
|
|
|
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
|
|
"""
|
|
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
|
|
if match:
|
|
return int(match.group(1))
|
|
return 1
|
|
|
|
    async def _crawl_category(self, session: aiohttp.ClientSession,
                              gallery_url: str, cat_id: str,
                              visited: Set[str], known: Set[str],
                              progress_callback=None,
                              depth: int = 0) -> Set[str]:
        """Recursively crawl a category to find all album IDs.

        Categories can contain sub-categories or albums. We recurse
        until we find album links (thumbnails.php?album=N).
        Handles pagination within category pages (index.php?cat=N&page=M).

        Args:
            session: aiohttp session
            gallery_url: Base gallery URL
            cat_id: Category ID to crawl
            visited: Set of already-visited category IDs (prevents loops);
                mutated in place and shared across the whole crawl
            known: Set of known post_ids. NOTE(review): currently unused in
                this method body — kept for signature stability; confirm
                before removing.
            progress_callback: Status callback
            depth: Recursion depth (max 10)

        Returns:
            Set of album ID strings
        """
        # Guard against cycles in the category graph and runaway recursion.
        if cat_id in visited or depth > 10:
            return set()
        visited.add(cat_id)

        # Fetch first page
        cat_url = self._build_url(gallery_url, f'index.php?cat={cat_id}')
        html = await self._fetch_page(session, cat_url)
        if not html:
            return set()

        # Rate limit: 2s between page fetches.
        await asyncio.sleep(2)

        album_ids = set(self._extract_album_ids(html))
        sub_cat_ids = self._extract_category_ids(html)

        # Handle pagination: fetch remaining pages (albums only)
        total_pages = self._extract_page_count(html)
        if total_pages > 1:
            for page_num in range(2, total_pages + 1):
                page_url = self._build_url(
                    gallery_url, f'index.php?cat={cat_id}&page={page_num}'
                )
                page_html = await self._fetch_page(session, page_url)
                if page_html:
                    album_ids.update(self._extract_album_ids(page_html))
                # Sub-categories are the same on every page, no need to re-extract
                await asyncio.sleep(2)

        # Filter out the current category and already-visited ones
        sub_cat_ids = [c for c in sub_cat_ids if c != cat_id and c not in visited]

        if progress_callback:
            progress_callback(
                f'Category {cat_id}: {len(album_ids)} albums, '
                f'{len(sub_cat_ids)} sub-categories'
                + (f' ({total_pages} pages)' if total_pages > 1 else '')
            )

        # Recurse into sub-categories, accumulating their albums
        for sub_id in sub_cat_ids:
            sub_albums = await self._crawl_category(
                session, gallery_url, sub_id, visited, known,
                progress_callback, depth + 1
            )
            album_ids.update(sub_albums)

        return album_ids
    async def _fetch_album(self, session: aiohttp.ClientSession,
                           gallery_url: str, album_id: str,
                           domain: str) -> Optional[Post]:
        """Fetch an album page (all pages) and build a Post object.

        Handles pagination within albums (thumbnails.php?album=N&page=M).

        Args:
            session: aiohttp session
            gallery_url: Base gallery URL
            album_id: Album ID to fetch
            domain: Domain name for creator_id

        Returns:
            Post object with attachments; None on fetch failure or when
            the album yields no attachments.
        """
        album_url = self._build_url(gallery_url, f'thumbnails.php?album={album_id}')
        html = await self._fetch_page(session, album_url)
        if not html:
            return None

        # Extract album title from first page
        title = self._extract_album_title(html)
        if not title:
            title = f"Album {album_id}"

        # Extract attachments from first page
        attachments = self._extract_attachments(html, gallery_url)

        # Handle pagination within album
        total_pages = self._extract_page_count(html)
        if total_pages > 1:
            for page_num in range(2, total_pages + 1):
                page_url = self._build_url(
                    gallery_url, f'thumbnails.php?album={album_id}&page={page_num}'
                )
                page_html = await self._fetch_page(session, page_url)
                if page_html:
                    attachments.extend(self._extract_attachments(page_html, gallery_url))
                # Rate limit: 2s between page fetches
                await asyncio.sleep(2)

        if not attachments:
            return None

        # Extract album date from breadcrumb + title (first page's HTML only)
        album_date = self._extract_album_date(html, title)

        post_id = f"album_{album_id}"
        return Post(
            post_id=post_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=domain,
            # Album name goes into `content`, with `title` left None —
            # NOTE(review): presumably downstream reads it from `content`;
            # confirm against Post consumers.
            title=None,
            content=title,
            published_at=album_date,
            attachments=attachments,
        )
def _extract_album_title(self, html: str) -> Optional[str]:
|
|
"""Extract album title from page HTML.
|
|
|
|
Priority: breadcrumb last item > <h1>/<h2> heading > <title> last segment
|
|
"""
|
|
# Try breadcrumb: last text segment after the last ">"
|
|
# Coppermine breadcrumbs: "Home > Category > Sub > Album Title"
|
|
bc_match = re.search(
|
|
r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
|
|
html, re.DOTALL | re.IGNORECASE
|
|
)
|
|
if bc_match:
|
|
bc_text = bc_match.group(1)
|
|
# Strip HTML tags, split on ">", take last segment
|
|
bc_text = re.sub(r'<[^>]+>', ' ', bc_text)
|
|
parts = [p.strip() for p in bc_text.split('>') if p.strip()]
|
|
if parts:
|
|
title = self._clean_text(parts[-1])
|
|
if title and title.lower() not in ('home', 'index', 'gallery'):
|
|
return title
|
|
|
|
# Try headings
|
|
for tag in ('h1', 'h2', 'h3'):
|
|
h_match = re.search(
|
|
rf'<{tag}[^>]*>(.*?)</{tag}>', html, re.DOTALL | re.IGNORECASE
|
|
)
|
|
if h_match:
|
|
title = self._clean_text(h_match.group(1))
|
|
if title and len(title) > 2:
|
|
return title
|
|
|
|
# Fallback: <title> tag — take the last segment before the site name
|
|
title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
|
|
if title_match:
|
|
title = title_match.group(1).strip()
|
|
# Usually "Site Name - Album Title" or "Album Title - Site Name"
|
|
# The album-specific part is typically not the site name;
|
|
# use the longest segment as a heuristic
|
|
if ' - ' in title:
|
|
parts = [p.strip() for p in title.split(' - ')]
|
|
# Pick the longest part (album names tend to be longer than site names)
|
|
title = max(parts, key=len)
|
|
if title:
|
|
return self._clean_text(title)
|
|
|
|
return None
|
|
|
|
def _extract_album_date(self, html: str, title: str) -> str:
|
|
"""Extract album date from breadcrumb year + title month/day.
|
|
|
|
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
|
|
Title: "January 11 - Leaving Golden Globes afterparty..."
|
|
|
|
Returns ISO date string, or current datetime as fallback.
|
|
"""
|
|
MONTHS = {
|
|
'january': 1, 'february': 2, 'march': 3, 'april': 4,
|
|
'may': 5, 'june': 6, 'july': 7, 'august': 8,
|
|
'september': 9, 'october': 10, 'november': 11, 'december': 12,
|
|
}
|
|
|
|
# Extract year from breadcrumb path (look for 4-digit year in links)
|
|
year = None
|
|
# Breadcrumb links: index.php?cat=155">2026</a>
|
|
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
|
|
year = int(m.group(1))
|
|
|
|
# Also try path segments in albums/ URLs for year
|
|
if not year:
|
|
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
|
|
if path_match:
|
|
year = int(path_match.group(1))
|
|
|
|
# Extract month and day from album title
|
|
month, day = None, None
|
|
if title:
|
|
# "January 11 - ..." or "March 3 - ..."
|
|
date_match = re.match(
|
|
r'(\w+)\s+(\d{1,2})\b', title
|
|
)
|
|
if date_match:
|
|
month_name = date_match.group(1).lower()
|
|
if month_name in MONTHS:
|
|
month = MONTHS[month_name]
|
|
day = int(date_match.group(2))
|
|
|
|
# Build date from breadcrumb year + title month/day
|
|
if year and month and day:
|
|
try:
|
|
return datetime(year, month, day).isoformat()
|
|
except ValueError:
|
|
pass
|
|
if year and month:
|
|
try:
|
|
return datetime(year, month, 1).isoformat()
|
|
except ValueError:
|
|
pass
|
|
if year:
|
|
return datetime(year, 1, 1).isoformat()
|
|
|
|
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
|
|
MONTH_ABBR = {
|
|
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
|
|
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
|
|
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
|
|
}
|
|
added_match = re.search(
|
|
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
|
|
)
|
|
if added_match:
|
|
m_abbr = added_match.group(1).lower()
|
|
if m_abbr in MONTH_ABBR:
|
|
try:
|
|
return datetime(
|
|
int(added_match.group(3)),
|
|
MONTH_ABBR[m_abbr],
|
|
int(added_match.group(2))
|
|
).isoformat()
|
|
except ValueError:
|
|
pass
|
|
|
|
# Also try "last one added on Jan 13, 2026" from album_stat
|
|
stat_match = re.search(
|
|
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
|
|
)
|
|
if stat_match:
|
|
m_abbr = stat_match.group(1).lower()
|
|
if m_abbr in MONTH_ABBR:
|
|
try:
|
|
return datetime(
|
|
int(stat_match.group(3)),
|
|
MONTH_ABBR[m_abbr],
|
|
int(stat_match.group(2))
|
|
).isoformat()
|
|
except ValueError:
|
|
pass
|
|
|
|
return datetime.now().isoformat()
|
|
|
|
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
|
|
"""Extract photo attachments from album page HTML.
|
|
|
|
Finds thumbnail images and converts them to full-res URLs by
|
|
stripping the `thumb_` prefix from the filename.
|
|
"""
|
|
attachments = []
|
|
seen_urls = set()
|
|
|
|
# Pattern: thumbnail images in album pages
|
|
# Common patterns:
|
|
# <img src="albums/path/thumb_filename.jpg" ...>
|
|
# <img src="albums/path/normal_filename.jpg" ...>
|
|
for match in re.finditer(
|
|
r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
|
|
html, re.IGNORECASE
|
|
):
|
|
thumb_src = match.group(1)
|
|
full_url = self._thumb_to_fullres(thumb_src, gallery_url)
|
|
if full_url and full_url not in seen_urls:
|
|
seen_urls.add(full_url)
|
|
filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url
|
|
ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
|
|
|
|
attachments.append(Attachment(
|
|
name=filename,
|
|
server_path=full_url, # use as dedup key
|
|
file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
|
|
extension=ext or None,
|
|
download_url=full_url,
|
|
))
|
|
|
|
# Also try: <a href="displayimage.php?..."><img src="albums/...">
|
|
# Some themes wrap thumbnails in links
|
|
if not attachments:
|
|
for match in re.finditer(
|
|
r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
|
|
r'\s*<img[^>]+src=["\']([^"\']+)["\']',
|
|
html, re.IGNORECASE | re.DOTALL
|
|
):
|
|
thumb_src = match.group(1)
|
|
full_url = self._thumb_to_fullres(thumb_src, gallery_url)
|
|
if full_url and full_url not in seen_urls:
|
|
seen_urls.add(full_url)
|
|
filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url
|
|
ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
|
|
|
|
attachments.append(Attachment(
|
|
name=filename,
|
|
server_path=full_url,
|
|
file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
|
|
extension=ext or None,
|
|
download_url=full_url,
|
|
))
|
|
|
|
return attachments
|
|
|
|
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
|
|
"""Convert a thumbnail URL to a full-resolution URL.
|
|
|
|
Strips `thumb_` or `normal_` prefix from the filename and
|
|
prepends the gallery base URL if needed.
|
|
|
|
Args:
|
|
thumb_src: Thumbnail src attribute value
|
|
gallery_url: Base gallery URL
|
|
|
|
Returns:
|
|
Full-resolution image URL, or None if conversion fails
|
|
"""
|
|
if not thumb_src:
|
|
return None
|
|
|
|
# Strip thumb_ or normal_ prefix from filename
|
|
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
|
|
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
|
|
|
|
# If the path is already absolute (starts with http), return as-is
|
|
if fullres_path.startswith(('http://', 'https://')):
|
|
return fullres_path
|
|
|
|
# Otherwise, make it absolute relative to gallery URL
|
|
base = gallery_url.rstrip('/')
|
|
fullres_path = fullres_path.lstrip('./')
|
|
return f"{base}/{fullres_path}"
|
|
|
|
def _clean_text(self, text: str) -> str:
|
|
"""Clean HTML entities and whitespace from text."""
|
|
text = re.sub(r'&', '&', text)
|
|
text = re.sub(r'<', '<', text)
|
|
text = re.sub(r'>', '>', text)
|
|
text = re.sub(r'"', '"', text)
|
|
text = re.sub(r'&#\d+;', '', text)
|
|
text = re.sub(r'&\w+;', '', text)
|
|
text = re.sub(r'<[^>]+>', '', text)
|
|
return text.strip()
|