Files
media-downloader/modules/paid_content/coppermine_client.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

623 lines
25 KiB
Python

"""
Coppermine Gallery scraper client.
Coppermine is a PHP photo gallery with a nested structure:
categories > sub-categories > albums > photos
One album maps to one Post with N Attachments.
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
"""
import asyncio
import re
from datetime import datetime
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class CoppermineClient(LoggingMixin):
    """Scraper client for Coppermine PHP photo galleries.

    Crawls categories recursively down to albums; each album becomes one
    Post whose photos are Attachments (see module docstring for the
    gallery structure and the thumbnail-to-full-res convention).
    """

    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'coppermine'
    PLATFORM = 'coppermine'

    # Browser-like request headers sent with every fetch — presumably to
    # avoid rejection of non-browser clients by gallery hosts (TODO confirm).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # Extensions classified as 'image' when building Attachments
    # (see _extract_attachments).
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    def __init__(self, log_callback=None):
        # LoggingMixin supplies self.log(); tag messages with the
        # 'Coppermine' module name under the 'PaidContent' logger.
        self._init_logger('PaidContent', log_callback, default_module='Coppermine')
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
    """Fetch gallery root and extract profile metadata.

    Args:
        gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)

    Returns:
        Dict with username, display_name, post_count, gallery_url or None on failure
    """
    root_url = self._build_url(gallery_url, 'index.php')
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            html = await self._fetch_page(session, root_url)
            if not html:
                return None
            # Site title from the <title> tag; generic fallback if absent.
            title_match = re.search(r'<title[^>]*>(.*?)</title>', html,
                                    re.DOTALL | re.IGNORECASE)
            site_title = title_match.group(1).strip() if title_match else 'Coppermine Gallery'
            # Use the shared cleaner instead of duplicating the entity
            # substitutions inline; it additionally handles &quot; and
            # strips any stray markup inside the title.
            site_title = self._clean_text(site_title)
            # Coppermine stats line: "N files in M albums" — only the
            # album count is surfaced (as post_count).
            total_albums = 0
            stats_match = re.search(
                r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
                html, re.IGNORECASE
            )
            if stats_match:
                total_albums = int(stats_match.group(2).replace(',', ''))
            # Use the bare domain (minus "www.") as the creator username.
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')
            return {
                'username': domain,
                'display_name': site_title,
                'post_count': total_albums,
                'gallery_url': gallery_url,
            }
    except Exception as e:
        self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
        return None
async def get_posts(self, gallery_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None,
                    post_callback=None):
    """Crawl the gallery, yielding new albums as Post objects incrementally.

    Phase 1: Fetch root, extract top-level category links
    Phase 2: Recursively crawl categories until album links found
    Phase 3: For each album, fetch thumbnails and call post_callback immediately

    Args:
        gallery_url: Base gallery URL
        known_post_ids: Set of post IDs already in DB (album_NNN)
        progress_callback: Called with status message strings
        post_callback: async callable(post) — called for each album as it's fetched.
            If provided, posts are streamed instead of collected.

    Returns:
        List of Post objects (only if post_callback is None); None in
        streaming mode, both on success and on error.
    """
    known = known_post_ids or set()
    # No overall timeout: a full crawl can legitimately take a long time.
    # Only the per-socket connect/read operations are capped.
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
    # In streaming mode (post_callback given) nothing is accumulated.
    posts_collected = [] if post_callback is None else None
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Phase 1: Get all category links from root
            root_url = self._build_url(gallery_url, 'index.php')
            root_html = await self._fetch_page(session, root_url)
            if not root_html:
                self.log("Failed to fetch gallery root", 'error')
                return [] if post_callback is None else None
            category_ids = self._extract_category_ids(root_html)
            self.log(f"Found {len(category_ids)} top-level categories", 'info')
            if progress_callback:
                progress_callback(f'Found {len(category_ids)} categories, crawling...')
            # Phase 2: Recursively crawl categories to find album IDs.
            # visited_cats is shared across all calls so a category linked
            # from several parents is fetched at most once.
            album_ids = set()
            visited_cats = set()
            for cat_id in category_ids:
                new_albums = await self._crawl_category(
                    session, gallery_url, cat_id, visited_cats, known, progress_callback
                )
                album_ids.update(new_albums)
            # Filter out albums already in the DB; post_id format
            # "album_<id>" matches what _fetch_album produces.
            new_album_ids = {aid for aid in album_ids
                             if f"album_{aid}" not in known}
            self.log(f"Found {len(new_album_ids)} new albums "
                     f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)",
                     'info')
            if progress_callback:
                progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...')
            # Phase 3: Fetch each new album and deliver Post objects
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')
            fetched = 0
            # NOTE(review): album IDs are strings, so sorted() here is
            # lexicographic ('10' < '2'), not numeric.
            for i, album_id in enumerate(sorted(new_album_ids)):
                # Progress update every 5 albums to avoid callback spam.
                if progress_callback and (i + 1) % 5 == 0:
                    progress_callback(
                        f'Fetching album {i + 1}/{len(new_album_ids)}...'
                    )
                post = await self._fetch_album(session, gallery_url, album_id, domain)
                # Albums with no extractable photos are silently skipped.
                if post and post.attachments:
                    fetched += 1
                    if post_callback:
                        await post_callback(post)
                    else:
                        posts_collected.append(post)
                # Rate limit: 2s between album fetches (polite crawling)
                await asyncio.sleep(2)
            self.log(f"Fetched {fetched} albums with attachments", 'info')
            return posts_collected
    except Exception as e:
        self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
        return [] if post_callback is None else None
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _build_url(self, gallery_url: str, page: str) -> str:
"""Build a full URL from the gallery base and a page name."""
base = gallery_url.rstrip('/')
return f"{base}/{page}"
async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                      max_retries: int = 3) -> Optional[str]:
    """Fetch a page and return its HTML text, or None on failure.

    Retries with linear backoff on connection errors, server disconnects
    and socket timeouts; honours HTTP 429 by waiting before retrying.

    Args:
        session: Open aiohttp session (carries the configured timeouts)
        url: Absolute URL to fetch
        max_retries: Total attempts before giving up

    Returns:
        Response body as text, or None on any unrecoverable failure.
    """
    for attempt in range(max_retries):
        try:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 429:
                    # Server-side rate limiting: back off progressively
                    # and consume one attempt.
                    wait = 5 * (attempt + 1)
                    self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
                    await asyncio.sleep(wait)
                    continue
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                    return None
                return await resp.text()
        except (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                aiohttp.ClientPayloadError, ConnectionResetError,
                asyncio.TimeoutError) as e:
            # Transient network failure. asyncio.TimeoutError covers
            # aiohttp's sock_connect/sock_read timeouts, which previously
            # fell through to the generic handler and were never retried.
            wait = 3 * (attempt + 1)
            if attempt < max_retries - 1:
                self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}",
                         'warning')
                await asyncio.sleep(wait)
            else:
                self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                return None
        except Exception as e:
            # Anything else (invalid URL, decoding error, ...) is not
            # worth retrying.
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
    # All attempts were consumed by 429 responses.
    return None
def _extract_category_ids(self, html: str) -> List[str]:
"""Extract category IDs from index.php page.
Looks for links like: index.php?cat=N
"""
cat_ids = []
seen = set()
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
cat_id = match.group(1)
if cat_id not in seen:
seen.add(cat_id)
cat_ids.append(cat_id)
return cat_ids
def _extract_album_ids(self, html: str) -> List[str]:
"""Extract album IDs from a category page.
Looks for links like: thumbnails.php?album=N
"""
album_ids = []
seen = set()
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
album_id = match.group(1)
if album_id not in seen:
seen.add(album_id)
album_ids.append(album_id)
return album_ids
def _extract_page_count(self, html: str) -> int:
"""Extract total page count from Coppermine pagination text.
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
"""
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
if match:
return int(match.group(1))
return 1
async def _crawl_category(self, session: aiohttp.ClientSession,
                          gallery_url: str, cat_id: str,
                          visited: Set[str], known: Set[str],
                          progress_callback=None,
                          depth: int = 0) -> Set[str]:
    """Recursively crawl a category to find all album IDs.

    Categories can contain sub-categories or albums. We recurse
    until we find album links (thumbnails.php?album=N).
    Handles pagination within category pages (index.php?cat=N&page=M).

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        cat_id: Category ID to crawl
        visited: Set of already-visited category IDs (prevents loops);
            mutated in place and shared across the entire crawl
        known: Set of known post_ids — not consulted in this method,
            only threaded through the recursion
        progress_callback: Status callback
        depth: Current recursion depth; recursion stops beyond 10 levels

    Returns:
        Set of album ID strings found in this category and all of its
        sub-categories
    """
    # Guard against cycles in the category graph and runaway recursion.
    if cat_id in visited or depth > 10:
        return set()
    visited.add(cat_id)
    # Fetch first page
    cat_url = self._build_url(gallery_url, f'index.php?cat={cat_id}')
    html = await self._fetch_page(session, cat_url)
    if not html:
        return set()
    # Polite crawling: 2s pause after every page fetch.
    await asyncio.sleep(2)
    album_ids = set(self._extract_album_ids(html))
    sub_cat_ids = self._extract_category_ids(html)
    # Handle pagination: fetch remaining pages
    total_pages = self._extract_page_count(html)
    if total_pages > 1:
        for page_num in range(2, total_pages + 1):
            page_url = self._build_url(
                gallery_url, f'index.php?cat={cat_id}&page={page_num}'
            )
            page_html = await self._fetch_page(session, page_url)
            if page_html:
                album_ids.update(self._extract_album_ids(page_html))
                # Sub-categories are the same on every page, no need to re-extract
            await asyncio.sleep(2)
    # Filter out self-links and anything already visited from the
    # sub-category list before recursing.
    sub_cat_ids = [c for c in sub_cat_ids if c != cat_id and c not in visited]
    if progress_callback:
        progress_callback(
            f'Category {cat_id}: {len(album_ids)} albums, '
            f'{len(sub_cat_ids)} sub-categories'
            + (f' ({total_pages} pages)' if total_pages > 1 else '')
        )
    # Recurse into sub-categories
    for sub_id in sub_cat_ids:
        sub_albums = await self._crawl_category(
            session, gallery_url, sub_id, visited, known,
            progress_callback, depth + 1
        )
        album_ids.update(sub_albums)
    return album_ids
async def _fetch_album(self, session: aiohttp.ClientSession,
                       gallery_url: str, album_id: str,
                       domain: str) -> Optional[Post]:
    """Fetch an album page (all pages) and build a Post object.

    Handles pagination within albums (thumbnails.php?album=N&page=M).

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        album_id: Album ID to fetch
        domain: Domain name used as creator_id

    Returns:
        Post object with attachments, or None on failure or when no
        attachments could be extracted
    """
    album_url = self._build_url(gallery_url, f'thumbnails.php?album={album_id}')
    html = await self._fetch_page(session, album_url)
    if not html:
        return None
    # Extract album title from first page
    title = self._extract_album_title(html)
    if not title:
        title = f"Album {album_id}"
    # Extract attachments from first page
    attachments = self._extract_attachments(html, gallery_url)
    # Handle pagination within album
    total_pages = self._extract_page_count(html)
    if total_pages > 1:
        for page_num in range(2, total_pages + 1):
            page_url = self._build_url(
                gallery_url, f'thumbnails.php?album={album_id}&page={page_num}'
            )
            page_html = await self._fetch_page(session, page_url)
            if page_html:
                attachments.extend(self._extract_attachments(page_html, gallery_url))
            await asyncio.sleep(2)
    if not attachments:
        return None
    # Extract album date from breadcrumb + title. Note: only the FIRST
    # page's HTML is consulted here.
    album_date = self._extract_album_date(html, title)
    post_id = f"album_{album_id}"
    return Post(
        post_id=post_id,
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=domain,
        # NOTE(review): title is deliberately None — the album name is
        # stored in `content`; confirm against Post consumers.
        title=None,
        content=title,
        published_at=album_date,
        attachments=attachments,
    )
def _extract_album_title(self, html: str) -> Optional[str]:
    """Extract the album title from page HTML.

    Tries, in order: the last breadcrumb segment, an <h1>/<h2>/<h3>
    heading, then the longest " - "-separated segment of the <title> tag.
    Returns None when nothing usable is found.
    """
    flags = re.DOTALL | re.IGNORECASE
    # 1) Breadcrumb trail ("Home > Category > Sub > Album Title"):
    #    take the last segment unless it's just the gallery root.
    crumb = re.search(
        r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
        html, flags
    )
    if crumb:
        plain = re.sub(r'<[^>]+>', ' ', crumb.group(1))
        segments = [s.strip() for s in plain.split('>') if s.strip()]
        if segments:
            candidate = self._clean_text(segments[-1])
            if candidate and candidate.lower() not in ('home', 'index', 'gallery'):
                return candidate
    # 2) First heading tag whose cleaned text is non-trivial (> 2 chars).
    for level in ('h1', 'h2', 'h3'):
        heading = re.search(rf'<{level}[^>]*>(.*?)</{level}>', html, flags)
        if heading:
            candidate = self._clean_text(heading.group(1))
            if candidate and len(candidate) > 2:
                return candidate
    # 3) <title> fallback. Pages are usually "Site Name - Album Title"
    #    (or reversed); heuristically the album name is the longest part.
    doc_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if doc_title:
        text = doc_title.group(1).strip()
        if ' - ' in text:
            text = max((p.strip() for p in text.split(' - ')), key=len)
        if text:
            return self._clean_text(text)
    return None
def _extract_album_date(self, html: str, title: str) -> str:
"""Extract album date from breadcrumb year + title month/day.
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
Title: "January 11 - Leaving Golden Globes afterparty..."
Returns ISO date string, or current datetime as fallback.
"""
MONTHS = {
'january': 1, 'february': 2, 'march': 3, 'april': 4,
'may': 5, 'june': 6, 'july': 7, 'august': 8,
'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
# Extract year from breadcrumb path (look for 4-digit year in links)
year = None
# Breadcrumb links: index.php?cat=155">2026</a>
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
year = int(m.group(1))
# Also try path segments in albums/ URLs for year
if not year:
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
if path_match:
year = int(path_match.group(1))
# Extract month and day from album title
month, day = None, None
if title:
# "January 11 - ..." or "March 3 - ..."
date_match = re.match(
r'(\w+)\s+(\d{1,2})\b', title
)
if date_match:
month_name = date_match.group(1).lower()
if month_name in MONTHS:
month = MONTHS[month_name]
day = int(date_match.group(2))
# Build date from breadcrumb year + title month/day
if year and month and day:
try:
return datetime(year, month, day).isoformat()
except ValueError:
pass
if year and month:
try:
return datetime(year, month, 1).isoformat()
except ValueError:
pass
if year:
return datetime(year, 1, 1).isoformat()
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
MONTH_ABBR = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
added_match = re.search(
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if added_match:
m_abbr = added_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(added_match.group(3)),
MONTH_ABBR[m_abbr],
int(added_match.group(2))
).isoformat()
except ValueError:
pass
# Also try "last one added on Jan 13, 2026" from album_stat
stat_match = re.search(
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if stat_match:
m_abbr = stat_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(stat_match.group(3)),
MONTH_ABBR[m_abbr],
int(stat_match.group(2))
).isoformat()
except ValueError:
pass
return datetime.now().isoformat()
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
    """Extract photo attachments from album page HTML.

    Finds thumbnail images and converts them to full-res URLs by
    stripping the `thumb_` prefix from the filename. Duplicate full-res
    URLs are skipped. The Attachment construction is shared between the
    primary and fallback scans (previously duplicated inline).
    """
    attachments: List[Attachment] = []
    seen_urls: Set[str] = set()

    def add_from_thumb(thumb_src: str) -> None:
        # Resolve the thumbnail to a full-res URL and append an
        # Attachment, de-duplicating on the resolved URL.
        full_url = self._thumb_to_fullres(thumb_src, gallery_url)
        if not full_url or full_url in seen_urls:
            return
        seen_urls.add(full_url)
        filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url
        ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
        attachments.append(Attachment(
            name=filename,
            server_path=full_url,  # use as dedup key
            file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
            extension=ext or None,
            download_url=full_url,
        ))

    # Primary pattern: thumbnail <img> tags under an albums/ path, e.g.
    #   <img src="albums/path/thumb_filename.jpg" ...>
    #   <img src="albums/path/normal_filename.jpg" ...>
    for match in re.finditer(
        r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
        html, re.IGNORECASE
    ):
        add_from_thumb(match.group(1))
    # Fallback for themes that wrap thumbnails in displayimage.php links;
    # only consulted when the primary pattern found nothing.
    if not attachments:
        for match in re.finditer(
            r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
            r'\s*<img[^>]+src=["\']([^"\']+)["\']',
            html, re.IGNORECASE | re.DOTALL
        ):
            add_from_thumb(match.group(1))
    return attachments
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
"""Convert a thumbnail URL to a full-resolution URL.
Strips `thumb_` or `normal_` prefix from the filename and
prepends the gallery base URL if needed.
Args:
thumb_src: Thumbnail src attribute value
gallery_url: Base gallery URL
Returns:
Full-resolution image URL, or None if conversion fails
"""
if not thumb_src:
return None
# Strip thumb_ or normal_ prefix from filename
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
# If the path is already absolute (starts with http), return as-is
if fullres_path.startswith(('http://', 'https://')):
return fullres_path
# Otherwise, make it absolute relative to gallery URL
base = gallery_url.rstrip('/')
fullres_path = fullres_path.lstrip('./')
return f"{base}/{fullres_path}"
def _clean_text(self, text: str) -> str:
"""Clean HTML entities and whitespace from text."""
text = re.sub(r'&amp;', '&', text)
text = re.sub(r'&lt;', '<', text)
text = re.sub(r'&gt;', '>', text)
text = re.sub(r'&quot;', '"', text)
text = re.sub(r'&#\d+;', '', text)
text = re.sub(r'&\w+;', '', text)
text = re.sub(r'<[^>]+>', '', text)
return text.strip()