""" Coppermine Gallery scraper client. Coppermine is a PHP photo gallery with a nested structure: categories > sub-categories > albums > photos One album maps to one Post with N Attachments. Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix. """ import asyncio import re from datetime import datetime from typing import Dict, List, Optional, Set from urllib.parse import urljoin, urlparse, parse_qs import aiohttp from modules.base_module import LoggingMixin from .models import Post, Attachment class CoppermineClient(LoggingMixin): SERVICE_ID = 'coppermine' PLATFORM = 'coppermine' HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'} def __init__(self, log_callback=None): self._init_logger('PaidContent', log_callback, default_module='Coppermine') async def get_profile_info(self, gallery_url: str) -> Optional[Dict]: """Fetch gallery root and extract profile metadata. Args: gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery) Returns: Dict with username, display_name, post_count, gallery_url or None on failure """ root_url = self._build_url(gallery_url, 'index.php') timeout = aiohttp.ClientTimeout(total=30) try: async with aiohttp.ClientSession(timeout=timeout) as session: html = await self._fetch_page(session, root_url) if not html: return None # Extract site title from tag title_match = re.search(r'<title[^>]*>(.*?)', html, re.DOTALL | re.IGNORECASE) site_title = title_match.group(1).strip() if title_match else 'Coppermine Gallery' # Clean HTML entities site_title = re.sub(r'&', '&', site_title) site_title = re.sub(r'<', '<', site_title) site_title = re.sub(r'>', '>', site_title) site_title = re.sub(r'&#\d+;', '', site_title) site_title = re.sub(r'&\w+;', '', site_title) # Try to extract stats: "N files in M albums" total_files = 0 total_albums = 0 stats_match = re.search( r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?', html, re.IGNORECASE ) if stats_match: total_files = int(stats_match.group(1).replace(',', '')) total_albums = int(stats_match.group(2).replace(',', '')) # Use domain as username parsed = urlparse(gallery_url) domain = parsed.netloc.replace('www.', '') return { 'username': domain, 'display_name': site_title, 'post_count': total_albums, 'gallery_url': gallery_url, } except Exception as e: self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error') return None async def get_posts(self, gallery_url: str, known_post_ids: Optional[Set[str]] = None, progress_callback=None, post_callback=None): """Crawl the gallery, yielding new albums as Post objects incrementally. Phase 1: Fetch root, extract top-level category links Phase 2: Recursively crawl categories until album links found Phase 3: For each album, fetch thumbnails and call post_callback immediately Args: gallery_url: Base gallery URL known_post_ids: Set of post IDs already in DB (album_NNN) progress_callback: Called with status message strings post_callback: async callable(post) — called for each album as it's fetched. If provided, posts are streamed instead of collected. Returns: List of Post objects (only if post_callback is None) """ known = known_post_ids or set() timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60) posts_collected = [] if post_callback is None else None try: async with aiohttp.ClientSession(timeout=timeout) as session: # Phase 1: Get all category links from root root_url = self._build_url(gallery_url, 'index.php') root_html = await self._fetch_page(session, root_url) if not root_html: self.log("Failed to fetch gallery root", 'error') return [] if post_callback is None else None category_ids = self._extract_category_ids(root_html) self.log(f"Found {len(category_ids)} top-level categories", 'info') if progress_callback: progress_callback(f'Found {len(category_ids)} categories, crawling...') # Phase 2: Recursively crawl categories to find album IDs album_ids = set() visited_cats = set() for cat_id in category_ids: new_albums = await self._crawl_category( session, gallery_url, cat_id, visited_cats, known, progress_callback ) album_ids.update(new_albums) # Filter out known albums new_album_ids = {aid for aid in album_ids if f"album_{aid}" not in known} self.log(f"Found {len(new_album_ids)} new albums " f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)", 'info') if progress_callback: progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...') # Phase 3: Fetch each new album and deliver Post objects parsed = urlparse(gallery_url) domain = parsed.netloc.replace('www.', '') fetched = 0 for i, album_id in enumerate(sorted(new_album_ids)): if progress_callback and (i + 1) % 5 == 0: progress_callback( f'Fetching album {i + 1}/{len(new_album_ids)}...' ) post = await self._fetch_album(session, gallery_url, album_id, domain) if post and post.attachments: fetched += 1 if post_callback: await post_callback(post) else: posts_collected.append(post) # Rate limit: 1s between page fetches await asyncio.sleep(2) self.log(f"Fetched {fetched} albums with attachments", 'info') return posts_collected except Exception as e: self.log(f"Error crawling gallery {gallery_url}: {e}", 'error') return [] if post_callback is None else None # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _build_url(self, gallery_url: str, page: str) -> str: """Build a full URL from the gallery base and a page name.""" base = gallery_url.rstrip('/') return f"{base}/{page}" async def _fetch_page(self, session: aiohttp.ClientSession, url: str, max_retries: int = 3) -> Optional[str]: """Fetch a page and return its HTML text, or None on failure. Retries with exponential backoff on connection errors / server disconnects. """ for attempt in range(max_retries): try: async with session.get(url, headers=self.HEADERS) as resp: if resp.status == 429: wait = 5 * (attempt + 1) self.log(f"Rate limited on {url}, waiting {wait}s", 'warning') await asyncio.sleep(wait) continue if resp.status != 200: self.log(f"HTTP {resp.status} fetching {url}", 'warning') return None return await resp.text() except (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError, aiohttp.ClientPayloadError, ConnectionResetError) as e: wait = 3 * (attempt + 1) if attempt < max_retries - 1: self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}", 'warning') await asyncio.sleep(wait) else: self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning') return None except Exception as e: self.log(f"Error fetching {url}: {e}", 'warning') return None return None def _extract_category_ids(self, html: str) -> List[str]: """Extract category IDs from index.php page. Looks for links like: index.php?cat=N """ cat_ids = [] seen = set() for match in re.finditer(r'index\.php\?cat=(\d+)', html): cat_id = match.group(1) if cat_id not in seen: seen.add(cat_id) cat_ids.append(cat_id) return cat_ids def _extract_album_ids(self, html: str) -> List[str]: """Extract album IDs from a category page. Looks for links like: thumbnails.php?album=N """ album_ids = [] seen = set() for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html): album_id = match.group(1) if album_id not in seen: seen.add(album_id) album_ids.append(album_id) return album_ids def _extract_page_count(self, html: str) -> int: """Extract total page count from Coppermine pagination text. Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)". """ match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE) if match: return int(match.group(1)) return 1 async def _crawl_category(self, session: aiohttp.ClientSession, gallery_url: str, cat_id: str, visited: Set[str], known: Set[str], progress_callback=None, depth: int = 0) -> Set[str]: """Recursively crawl a category to find all album IDs. Categories can contain sub-categories or albums. We recurse until we find album links (thumbnails.php?album=N). Handles pagination within category pages (index.php?cat=N&page=M). Args: session: aiohttp session gallery_url: Base gallery URL cat_id: Category ID to crawl visited: Set of already-visited category IDs (prevents loops) known: Set of known post_ids (for logging only) progress_callback: Status callback depth: Recursion depth (max 10) Returns: Set of album ID strings """ if cat_id in visited or depth > 10: return set() visited.add(cat_id) # Fetch first page cat_url = self._build_url(gallery_url, f'index.php?cat={cat_id}') html = await self._fetch_page(session, cat_url) if not html: return set() await asyncio.sleep(2) album_ids = set(self._extract_album_ids(html)) sub_cat_ids = self._extract_category_ids(html) # Handle pagination: fetch remaining pages total_pages = self._extract_page_count(html) if total_pages > 1: for page_num in range(2, total_pages + 1): page_url = self._build_url( gallery_url, f'index.php?cat={cat_id}&page={page_num}' ) page_html = await self._fetch_page(session, page_url) if page_html: album_ids.update(self._extract_album_ids(page_html)) # Sub-categories are the same on every page, no need to re-extract await asyncio.sleep(2) # Filter out the current category from sub-categories sub_cat_ids = [c for c in sub_cat_ids if c != cat_id and c not in visited] if progress_callback: progress_callback( f'Category {cat_id}: {len(album_ids)} albums, ' f'{len(sub_cat_ids)} sub-categories' + (f' ({total_pages} pages)' if total_pages > 1 else '') ) # Recurse into sub-categories for sub_id in sub_cat_ids: sub_albums = await self._crawl_category( session, gallery_url, sub_id, visited, known, progress_callback, depth + 1 ) album_ids.update(sub_albums) return album_ids async def _fetch_album(self, session: aiohttp.ClientSession, gallery_url: str, album_id: str, domain: str) -> Optional[Post]: """Fetch an album page (all pages) and build a Post object. Handles pagination within albums (thumbnails.php?album=N&page=M). Args: session: aiohttp session gallery_url: Base gallery URL album_id: Album ID to fetch domain: Domain name for creator_id Returns: Post object with attachments, or None on failure """ album_url = self._build_url(gallery_url, f'thumbnails.php?album={album_id}') html = await self._fetch_page(session, album_url) if not html: return None # Extract album title from first page title = self._extract_album_title(html) if not title: title = f"Album {album_id}" # Extract attachments from first page attachments = self._extract_attachments(html, gallery_url) # Handle pagination within album total_pages = self._extract_page_count(html) if total_pages > 1: for page_num in range(2, total_pages + 1): page_url = self._build_url( gallery_url, f'thumbnails.php?album={album_id}&page={page_num}' ) page_html = await self._fetch_page(session, page_url) if page_html: attachments.extend(self._extract_attachments(page_html, gallery_url)) await asyncio.sleep(2) if not attachments: return None # Extract album date from breadcrumb + title album_date = self._extract_album_date(html, title) post_id = f"album_{album_id}" return Post( post_id=post_id, service_id=self.SERVICE_ID, platform=self.PLATFORM, creator_id=domain, title=None, content=title, published_at=album_date, attachments=attachments, ) def _extract_album_title(self, html: str) -> Optional[str]: """Extract album title from page HTML. Priority: breadcrumb last item >

/

heading > last segment """ # Try breadcrumb: last text segment after the last ">" # Coppermine breadcrumbs: "Home > Category > Sub > Album Title" bc_match = re.search( r'class="[^"]breadcrumb[^"]"[^>]>(.?)</(?:div|span|td|p)', html, re.DOTALL | re.IGNORECASE ) if bc_match: bc_text = bc_match.group(1) # Strip HTML tags, split on ">", take last segment bc_text = re.sub(r'<[^>]+>', ' ', bc_text) parts = [p.strip() for p in bc_text.split('>') if p.strip()] if parts: title = self._clean_text(parts[-1]) if title and title.lower() not in ('home', 'index', 'gallery'): return title # Try headings for tag in ('h1', 'h2', 'h3'): h_match = re.search( rf'<{tag}[^>]>(.?)</{tag}>', html, re.DOTALL | re.IGNORECASE ) if h_match: title = self._clean_text(h_match.group(1)) if title and len(title) > 2: return title # Fallback: <title> tag — take the last segment before the site name title_match = re.search(r'<title[^>]>(.?)', html, re.DOTALL | re.IGNORECASE) if title_match: title = title_match.group(1).strip() # Usually "Site Name - Album Title" or "Album Title - Site Name" # The album-specific part is typically not the site name; # use the longest segment as a heuristic if ' - ' in title: parts = [p.strip() for p in title.split(' - ')] # Pick the longest part (album names tend to be longer than site names) title = max(parts, key=len) if title: return self._clean_text(title) return None def _extract_album_date(self, html: str, title: str) -> str: """Extract album date from breadcrumb year + title month/day. Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..." Title: "January 11 - Leaving Golden Globes afterparty..." Returns ISO date string, or current datetime as fallback. """ MONTHS = { 'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12, } # Extract year from breadcrumb path (look for 4-digit year in links) year = None # Breadcrumb links: index.php?cat=155">2026 for m in re.finditer(r'>\s((?:19|20)\d{2})\s List[Attachment]: """Extract photo attachments from album page HTML. Finds thumbnail images and converts them to full-res URLs by stripping the `thumb_` prefix from the filename. """ attachments = [] seen_urls = set() # Pattern: thumbnail images in album pages # Common patterns: # # for match in re.finditer( r']+src=["\']([^"\']?albums/[^"\']?(?:thumb_|normal_)[^"\']+)["\']', html, re.IGNORECASE ): thumb_src = match.group(1) full_url = self._thumb_to_fullres(thumb_src, gallery_url) if full_url and full_url not in seen_urls: seen_urls.add(full_url) filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else '' attachments.append(Attachment( name=filename, server_path=full_url, # use as dedup key file_type='image' if ext in self.IMAGE_EXTS else 'unknown', extension=ext or None, download_url=full_url, )) # Also try: # Some themes wrap thumbnails in links if not attachments: for match in re.finditer( r']+href=["\'][^"\']displayimage\.php[^"\']["\'][^>]>' r'\s]+src=["\']([^"\']+)["\']', html, re.IGNORECASE | re.DOTALL ): thumb_src = match.group(1) full_url = self._thumb_to_fullres(thumb_src, gallery_url) if full_url and full_url not in seen_urls: seen_urls.add(full_url) filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else '' attachments.append(Attachment( name=filename, server_path=full_url, file_type='image' if ext in self.IMAGE_EXTS else 'unknown', extension=ext or None, download_url=full_url, )) return attachments def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]: """Convert a thumbnail URL to a full-resolution URL. Strips `thumb_` or `normal_` prefix from the filename and prepends the gallery base URL if needed. Args: thumb_src: Thumbnail src attribute value gallery_url: Base gallery URL Returns: Full-resolution image URL, or None if conversion fails """ if not thumb_src: return None # Strip thumb_ or normal_ prefix from filename # e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src) # If the path is already absolute (starts with http), return as-is if fullres_path.startswith(('http://', 'https://')): return fullres_path # Otherwise, make it absolute relative to gallery URL base = gallery_url.rstrip('/') fullres_path = fullres_path.lstrip('./') return f"{base}/{fullres_path}" def _clean_text(self, text: str) -> str: """Clean HTML entities and whitespace from text.""" text = re.sub(r'&', '&', text) text = re.sub(r'<', '<', text) text = re.sub(r'>', '>', text) text = re.sub(r'"', '"', text) text = re.sub(r'&#\d+;', '', text) text = re.sub(r'&\w+;', '', text) text = re.sub(r'<[^>]+>', '', text) return text.strip()