Files
media-downloader/modules/paid_content/coppermine_client.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

623 lines
25 KiB
Python

"""
Coppermine Gallery scraper client.
Coppermine is a PHP photo gallery with a nested structure:
categories > sub-categories > albums > photos
One album maps to one Post with N Attachments.
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
"""
import asyncio
import re
from datetime import datetime
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class CoppermineClient(LoggingMixin):
    """Scraper client for Coppermine PHP photo galleries.

    Crawls categories recursively down to albums; each album becomes one
    Post whose photos are Attachments (see module docstring for the
    gallery structure and the thumbnail-to-full-res convention).
    """

    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'coppermine'
    PLATFORM = 'coppermine'

    # Browser-like request headers sent with every fetch — presumably to
    # avoid rejection of non-browser clients by gallery hosts (TODO confirm).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # Extensions classified as 'image' when building Attachments
    # (see _extract_attachments).
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    def __init__(self, log_callback=None):
        # LoggingMixin supplies self.log(); tag messages with the
        # 'Coppermine' module name under the 'PaidContent' logger.
        self._init_logger('PaidContent', log_callback, default_module='Coppermine')
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
    """Fetch gallery root and extract profile metadata.

    Args:
        gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)

    Returns:
        Dict with username, display_name, post_count, gallery_url or None on failure
    """
    root_url = self._build_url(gallery_url, 'index.php')
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            html = await self._fetch_page(session, root_url)
            if not html:
                return None
            # Site title from the <title> tag; generic fallback if absent.
            title_match = re.search(r'<title[^>]*>(.*?)</title>', html,
                                    re.DOTALL | re.IGNORECASE)
            site_title = title_match.group(1).strip() if title_match else 'Coppermine Gallery'
            # Use the shared cleaner instead of duplicating the entity
            # substitutions inline; it additionally handles &quot; and
            # strips any stray markup inside the title.
            site_title = self._clean_text(site_title)
            # Coppermine stats line: "N files in M albums" — only the
            # album count is surfaced (as post_count).
            total_albums = 0
            stats_match = re.search(
                r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
                html, re.IGNORECASE
            )
            if stats_match:
                total_albums = int(stats_match.group(2).replace(',', ''))
            # Use the bare domain (minus "www.") as the creator username.
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')
            return {
                'username': domain,
                'display_name': site_title,
                'post_count': total_albums,
                'gallery_url': gallery_url,
            }
    except Exception as e:
        self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
        return None
async def get_posts(self, gallery_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None,
                    post_callback=None):
    """Crawl the gallery, yielding new albums as Post objects incrementally.

    Phase 1: Fetch root, extract top-level category links
    Phase 2: Recursively crawl categories until album links found
    Phase 3: For each album, fetch thumbnails and call post_callback immediately

    Args:
        gallery_url: Base gallery URL
        known_post_ids: Set of post IDs already in DB (album_NNN)
        progress_callback: Called with status message strings
        post_callback: async callable(post) — called for each album as it's fetched.
            If provided, posts are streamed instead of collected.

    Returns:
        List of Post objects (only if post_callback is None); None in
        streaming mode, both on success and on error.
    """
    known = known_post_ids or set()
    # No overall timeout: a full crawl can legitimately take a long time.
    # Only the per-socket connect/read operations are capped.
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
    # In streaming mode (post_callback given) nothing is accumulated.
    posts_collected = [] if post_callback is None else None
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Phase 1: Get all category links from root
            root_url = self._build_url(gallery_url, 'index.php')
            root_html = await self._fetch_page(session, root_url)
            if not root_html:
                self.log("Failed to fetch gallery root", 'error')
                return [] if post_callback is None else None
            category_ids = self._extract_category_ids(root_html)
            self.log(f"Found {len(category_ids)} top-level categories", 'info')
            if progress_callback:
                progress_callback(f'Found {len(category_ids)} categories, crawling...')
            # Phase 2: Recursively crawl categories to find album IDs.
            # visited_cats is shared across all calls so a category linked
            # from several parents is fetched at most once.
            album_ids = set()
            visited_cats = set()
            for cat_id in category_ids:
                new_albums = await self._crawl_category(
                    session, gallery_url, cat_id, visited_cats, known, progress_callback
                )
                album_ids.update(new_albums)
            # Filter out albums already in the DB; post_id format
            # "album_<id>" matches what _fetch_album produces.
            new_album_ids = {aid for aid in album_ids
                             if f"album_{aid}" not in known}
            self.log(f"Found {len(new_album_ids)} new albums "
                     f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)",
                     'info')
            if progress_callback:
                progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...')
            # Phase 3: Fetch each new album and deliver Post objects
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')
            fetched = 0
            # NOTE(review): album IDs are strings, so sorted() here is
            # lexicographic ('10' < '2'), not numeric.
            for i, album_id in enumerate(sorted(new_album_ids)):
                # Progress update every 5 albums to avoid callback spam.
                if progress_callback and (i + 1) % 5 == 0:
                    progress_callback(
                        f'Fetching album {i + 1}/{len(new_album_ids)}...'
                    )
                post = await self._fetch_album(session, gallery_url, album_id, domain)
                # Albums with no extractable photos are silently skipped.
                if post and post.attachments:
                    fetched += 1
                    if post_callback:
                        await post_callback(post)
                    else:
                        posts_collected.append(post)
                # Rate limit: 2s between album fetches (polite crawling)
                await asyncio.sleep(2)
            self.log(f"Fetched {fetched} albums with attachments", 'info')
            return posts_collected
    except Exception as e:
        self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
        return [] if post_callback is None else None
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _build_url(self, gallery_url: str, page: str) -> str:
"""Build a full URL from the gallery base and a page name."""
base = gallery_url.rstrip('/')
return f"{base}/{page}"
async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                      max_retries: int = 3) -> Optional[str]:
    """Fetch a page and return its HTML text, or None on failure.

    Retries with linear backoff on connection errors, server disconnects
    and socket timeouts; honours HTTP 429 by waiting before retrying.

    Args:
        session: Open aiohttp session (carries the configured timeouts)
        url: Absolute URL to fetch
        max_retries: Total attempts before giving up

    Returns:
        Response body as text, or None on any unrecoverable failure.
    """
    for attempt in range(max_retries):
        try:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 429:
                    # Server-side rate limiting: back off progressively
                    # and consume one attempt.
                    wait = 5 * (attempt + 1)
                    self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
                    await asyncio.sleep(wait)
                    continue
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                    return None
                return await resp.text()
        except (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                aiohttp.ClientPayloadError, ConnectionResetError,
                asyncio.TimeoutError) as e:
            # Transient network failure. asyncio.TimeoutError covers
            # aiohttp's sock_connect/sock_read timeouts, which previously
            # fell through to the generic handler and were never retried.
            wait = 3 * (attempt + 1)
            if attempt < max_retries - 1:
                self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}",
                         'warning')
                await asyncio.sleep(wait)
            else:
                self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                return None
        except Exception as e:
            # Anything else (invalid URL, decoding error, ...) is not
            # worth retrying.
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
    # All attempts were consumed by 429 responses.
    return None
def _extract_category_ids(self, html: str) -> List[str]:
"""Extract category IDs from index.php page.
Looks for links like: index.php?cat=N
"""
cat_ids = []
seen = set()
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
cat_id = match.group(1)
if cat_id not in seen:
seen.add(cat_id)
cat_ids.append(cat_id)
return cat_ids
def _extract_album_ids(self, html: str) -> List[str]:
"""Extract album IDs from a category page.
Looks for links like: thumbnails.php?album=N
"""
album_ids = []
seen = set()
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
album_id = match.group(1)
if album_id not in seen:
seen.add(album_id)
album_ids.append(album_id)
return album_ids
def _extract_page_count(self, html: str) -> int:
"""Extract total page count from Coppermine pagination text.
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
"""
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
if match:
return int(match.group(1))
return 1
async def _crawl_category(self, session: aiohttp.ClientSession,
                          gallery_url: str, cat_id: str,
                          visited: Set[str], known: Set[str],
                          progress_callback=None,
                          depth: int = 0) -> Set[str]:
    """Recursively crawl a category to find all album IDs.

    Categories can contain sub-categories or albums. We recurse
    until we find album links (thumbnails.php?album=N).
    Handles pagination within category pages (index.php?cat=N&page=M).

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        cat_id: Category ID to crawl
        visited: Set of already-visited category IDs (prevents loops);
            mutated in place and shared across the entire crawl
        known: Set of known post_ids — not consulted in this method,
            only threaded through the recursion
        progress_callback: Status callback
        depth: Current recursion depth; recursion stops beyond 10 levels

    Returns:
        Set of album ID strings found in this category and all of its
        sub-categories
    """
    # Guard against cycles in the category graph and runaway recursion.
    if cat_id in visited or depth > 10:
        return set()
    visited.add(cat_id)
    # Fetch first page
    cat_url = self._build_url(gallery_url, f'index.php?cat={cat_id}')
    html = await self._fetch_page(session, cat_url)
    if not html:
        return set()
    # Polite crawling: 2s pause after every page fetch.
    await asyncio.sleep(2)
    album_ids = set(self._extract_album_ids(html))
    sub_cat_ids = self._extract_category_ids(html)
    # Handle pagination: fetch remaining pages
    total_pages = self._extract_page_count(html)
    if total_pages > 1:
        for page_num in range(2, total_pages + 1):
            page_url = self._build_url(
                gallery_url, f'index.php?cat={cat_id}&page={page_num}'
            )
            page_html = await self._fetch_page(session, page_url)
            if page_html:
                album_ids.update(self._extract_album_ids(page_html))
                # Sub-categories are the same on every page, no need to re-extract
            await asyncio.sleep(2)
    # Filter out self-links and anything already visited from the
    # sub-category list before recursing.
    sub_cat_ids = [c for c in sub_cat_ids if c != cat_id and c not in visited]
    if progress_callback:
        progress_callback(
            f'Category {cat_id}: {len(album_ids)} albums, '
            f'{len(sub_cat_ids)} sub-categories'
            + (f' ({total_pages} pages)' if total_pages > 1 else '')
        )
    # Recurse into sub-categories
    for sub_id in sub_cat_ids:
        sub_albums = await self._crawl_category(
            session, gallery_url, sub_id, visited, known,
            progress_callback, depth + 1
        )
        album_ids.update(sub_albums)
    return album_ids
async def _fetch_album(self, session: aiohttp.ClientSession,
                       gallery_url: str, album_id: str,
                       domain: str) -> Optional[Post]:
    """Fetch an album page (all pages) and build a Post object.

    Handles pagination within albums (thumbnails.php?album=N&page=M).

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        album_id: Album ID to fetch
        domain: Domain name used as creator_id

    Returns:
        Post object with attachments, or None on failure or when no
        attachments could be extracted
    """
    album_url = self._build_url(gallery_url, f'thumbnails.php?album={album_id}')
    html = await self._fetch_page(session, album_url)
    if not html:
        return None
    # Extract album title from first page
    title = self._extract_album_title(html)
    if not title:
        title = f"Album {album_id}"
    # Extract attachments from first page
    attachments = self._extract_attachments(html, gallery_url)
    # Handle pagination within album
    total_pages = self._extract_page_count(html)
    if total_pages > 1:
        for page_num in range(2, total_pages + 1):
            page_url = self._build_url(
                gallery_url, f'thumbnails.php?album={album_id}&page={page_num}'
            )
            page_html = await self._fetch_page(session, page_url)
            if page_html:
                attachments.extend(self._extract_attachments(page_html, gallery_url))
            await asyncio.sleep(2)
    if not attachments:
        return None
    # Extract album date from breadcrumb + title. Note: only the FIRST
    # page's HTML is consulted here.
    album_date = self._extract_album_date(html, title)
    post_id = f"album_{album_id}"
    return Post(
        post_id=post_id,
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=domain,
        # NOTE(review): title is deliberately None — the album name is
        # stored in `content`; confirm against Post consumers.
        title=None,
        content=title,
        published_at=album_date,
        attachments=attachments,
    )
def _extract_album_title(self, html: str) -> Optional[str]:
    """Extract the album title from page HTML.

    Tries, in order: the last breadcrumb segment, an <h1>/<h2>/<h3>
    heading, then the longest " - "-separated segment of the <title> tag.
    Returns None when nothing usable is found.
    """
    flags = re.DOTALL | re.IGNORECASE
    # 1) Breadcrumb trail ("Home > Category > Sub > Album Title"):
    #    take the last segment unless it's just the gallery root.
    crumb = re.search(
        r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
        html, flags
    )
    if crumb:
        plain = re.sub(r'<[^>]+>', ' ', crumb.group(1))
        segments = [s.strip() for s in plain.split('>') if s.strip()]
        if segments:
            candidate = self._clean_text(segments[-1])
            if candidate and candidate.lower() not in ('home', 'index', 'gallery'):
                return candidate
    # 2) First heading tag whose cleaned text is non-trivial (> 2 chars).
    for level in ('h1', 'h2', 'h3'):
        heading = re.search(rf'<{level}[^>]*>(.*?)</{level}>', html, flags)
        if heading:
            candidate = self._clean_text(heading.group(1))
            if candidate and len(candidate) > 2:
                return candidate
    # 3) <title> fallback. Pages are usually "Site Name - Album Title"
    #    (or reversed); heuristically the album name is the longest part.
    doc_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if doc_title:
        text = doc_title.group(1).strip()
        if ' - ' in text:
            text = max((p.strip() for p in text.split(' - ')), key=len)
        if text:
            return self._clean_text(text)
    return None
def _extract_album_date(self, html: str, title: str) -> str:
"""Extract album date from breadcrumb year + title month/day.
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
Title: "January 11 - Leaving Golden Globes afterparty..."
Returns ISO date string, or current datetime as fallback.
"""
MONTHS = {
'january': 1, 'february': 2, 'march': 3, 'april': 4,
'may': 5, 'june': 6, 'july': 7, 'august': 8,
'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
# Extract year from breadcrumb path (look for 4-digit year in links)
year = None
# Breadcrumb links: index.php?cat=155">2026</a>
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
year = int(m.group(1))
# Also try path segments in albums/ URLs for year
if not year:
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
if path_match:
year = int(path_match.group(1))
# Extract month and day from album title
month, day = None, None
if title:
# "January 11 - ..." or "March 3 - ..."
date_match = re.match(
r'(\w+)\s+(\d{1,2})\b', title
)
if date_match:
month_name = date_match.group(1).lower()
if month_name in MONTHS:
month = MONTHS[month_name]
day = int(date_match.group(2))
# Build date from breadcrumb year + title month/day
if year and month and day:
try:
return datetime(year, month, day).isoformat()
except ValueError:
pass
if year and month:
try:
return datetime(year, month, 1).isoformat()
except ValueError:
pass
if year:
return datetime(year, 1, 1).isoformat()
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
MONTH_ABBR = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
added_match = re.search(
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if added_match:
m_abbr = added_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(added_match.group(3)),
MONTH_ABBR[m_abbr],
int(added_match.group(2))
).isoformat()
except ValueError:
pass
# Also try "last one added on Jan 13, 2026" from album_stat
stat_match = re.search(
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if stat_match:
m_abbr = stat_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(stat_match.group(3)),
MONTH_ABBR[m_abbr],
int(stat_match.group(2))
).isoformat()
except ValueError:
pass
return datetime.now().isoformat()
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
    """Extract photo attachments from album page HTML.

    Finds thumbnail images and converts them to full-res URLs by
    stripping the `thumb_` prefix from the filename. Duplicate full-res
    URLs are skipped. The Attachment construction is shared between the
    primary and fallback scans (previously duplicated inline).
    """
    attachments: List[Attachment] = []
    seen_urls: Set[str] = set()

    def add_from_thumb(thumb_src: str) -> None:
        # Resolve the thumbnail to a full-res URL and append an
        # Attachment, de-duplicating on the resolved URL.
        full_url = self._thumb_to_fullres(thumb_src, gallery_url)
        if not full_url or full_url in seen_urls:
            return
        seen_urls.add(full_url)
        filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url
        ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
        attachments.append(Attachment(
            name=filename,
            server_path=full_url,  # use as dedup key
            file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
            extension=ext or None,
            download_url=full_url,
        ))

    # Primary pattern: thumbnail <img> tags under an albums/ path, e.g.
    #   <img src="albums/path/thumb_filename.jpg" ...>
    #   <img src="albums/path/normal_filename.jpg" ...>
    for match in re.finditer(
        r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
        html, re.IGNORECASE
    ):
        add_from_thumb(match.group(1))
    # Fallback for themes that wrap thumbnails in displayimage.php links;
    # only consulted when the primary pattern found nothing.
    if not attachments:
        for match in re.finditer(
            r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
            r'\s*<img[^>]+src=["\']([^"\']+)["\']',
            html, re.IGNORECASE | re.DOTALL
        ):
            add_from_thumb(match.group(1))
    return attachments
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
"""Convert a thumbnail URL to a full-resolution URL.
Strips `thumb_` or `normal_` prefix from the filename and
prepends the gallery base URL if needed.
Args:
thumb_src: Thumbnail src attribute value
gallery_url: Base gallery URL
Returns:
Full-resolution image URL, or None if conversion fails
"""
if not thumb_src:
return None
# Strip thumb_ or normal_ prefix from filename
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
# If the path is already absolute (starts with http), return as-is
if fullres_path.startswith(('http://', 'https://')):
return fullres_path
# Otherwise, make it absolute relative to gallery URL
base = gallery_url.rstrip('/')
fullres_path = fullres_path.lstrip('./')
return f"{base}/{fullres_path}"
def _clean_text(self, text: str) -> str:
"""Clean HTML entities and whitespace from text."""
text = re.sub(r'&amp;', '&', text)
text = re.sub(r'&lt;', '<', text)
text = re.sub(r'&gt;', '>', text)
text = re.sub(r'&quot;', '"', text)
text = re.sub(r'&#\d+;', '', text)
text = re.sub(r'&\w+;', '', text)
text = re.sub(r'<[^>]+>', '', text)
return text.strip()