"""
Soundgasm + Liltsome Archive Client for Paid Content

Handles:
- Soundgasm profile scraping (no auth/Cloudflare needed)
- Liltsome archive (liltsome.yerf.org) as supplementary source
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
- Direct HTTP audio downloads (.m4a)
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
from urllib.parse import quote
|
|
|
|
import aiohttp
|
|
import aiofiles
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Creator, Post, Attachment
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Bracket tag helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Extract [bracket] tags from a title, normalize, return (clean_title, tags)."""
    raw_tags = re.findall(r'\[([^\]]+)\]', title)
    stripped_title = re.sub(r'\s*\[[^\]]+\]\s*', ' ', title).strip()

    # Normalize to lowercase, preserving first-seen order, dropping duplicates.
    ordered: List[str] = []
    already: Set[str] = set()
    for raw in raw_tags:
        candidate = raw.strip().lower()
        if not candidate or candidate in already:
            continue
        already.add(candidate)
        ordered.append(candidate)
    return stripped_title, ordered
|
|
|
|
|
|
def format_tag_display(tag_lower: str) -> str:
    """Format a normalized lowercase tag for display.

    Gender tags (f4m, m4f, f4a …) → uppercase.
    Everything else → title case.
    """
    # Gender codes look like letters-digit-letters (e.g. "f4m").
    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower) is None:
        return tag_lower.title()
    return tag_lower.upper()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SoundgasmClient
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class SoundgasmClient(LoggingMixin):
    """Client for fetching audio from Soundgasm and the Liltsome archive."""

    # Identifiers stamped onto every Post/Attachment this client produces.
    SERVICE_ID = 'soundgasm'
    PLATFORM = 'soundgasm'

    SOUNDGASM_BASE = 'https://soundgasm.net'
    LILTSOME_BASE = 'https://liltsome.yerf.org'
    # Full archive dump; large (~131MB per the download timeout note below).
    LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
    # On-disk cache of the library and the ETag used for cache invalidation.
    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')

    # Browser-like headers; Soundgasm serves plain HTML without auth.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
        self._liltsome_data: Optional[Dict] = None  # cached in-memory per sync run
|
|
# ------------------------------------------------------------------
|
|
# Public API
|
|
# ------------------------------------------------------------------
|
|
|
|
async def get_profile_info(self, username: str) -> Optional[Dict]:
|
|
"""Return basic profile info (post count) from Soundgasm and/or Liltsome."""
|
|
post_count = 0
|
|
source = None
|
|
|
|
# Try Soundgasm profile page first
|
|
try:
|
|
sg_posts = await self._fetch_soundgasm_profile(username)
|
|
if sg_posts is not None:
|
|
post_count = len(sg_posts)
|
|
source = 'soundgasm'
|
|
except Exception as e:
|
|
self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')
|
|
|
|
# Also check Liltsome for additional posts
|
|
try:
|
|
lt_entries = await self._get_liltsome_entries(username)
|
|
if lt_entries:
|
|
post_count = max(post_count, len(lt_entries))
|
|
if source is None:
|
|
source = 'liltsome'
|
|
except Exception as e:
|
|
self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')
|
|
|
|
if post_count == 0 and source is None:
|
|
return None
|
|
|
|
return {
|
|
'username': username,
|
|
'post_count': post_count,
|
|
'source': source,
|
|
}
|
|
|
|
async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
|
|
progress_callback=None) -> List[Post]:
|
|
"""Fetch posts from both Soundgasm and Liltsome, deduplicating by post_id."""
|
|
known = known_post_ids or set()
|
|
posts: List[Post] = []
|
|
seen_ids: Set[str] = set(known)
|
|
|
|
# 1. Soundgasm (may fail if account deleted — that's OK)
|
|
try:
|
|
sg_posts = await self._fetch_soundgasm_posts(username, seen_ids)
|
|
for p in sg_posts:
|
|
if p.post_id not in seen_ids:
|
|
seen_ids.add(p.post_id)
|
|
posts.append(p)
|
|
self.log(f"Soundgasm: {len(sg_posts)} new posts for {username}", 'info')
|
|
except Exception as e:
|
|
self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')
|
|
|
|
if progress_callback:
|
|
progress_callback(len(posts))
|
|
|
|
# 2. Liltsome archive (always)
|
|
try:
|
|
lt_posts = await self._fetch_liltsome_posts(username, seen_ids)
|
|
for p in lt_posts:
|
|
if p.post_id not in seen_ids:
|
|
seen_ids.add(p.post_id)
|
|
posts.append(p)
|
|
self.log(f"Liltsome: {len(lt_posts)} new posts for {username}", 'info')
|
|
except Exception as e:
|
|
self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')
|
|
|
|
if progress_callback:
|
|
progress_callback(len(posts))
|
|
|
|
return posts
|
|
|
|
async def download_audio(self, download_url: str, output_path: Path) -> Dict:
|
|
"""Download an audio file via direct HTTP GET."""
|
|
try:
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
timeout = aiohttp.ClientTimeout(total=300)
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
async with session.get(download_url, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
return {'success': False, 'error': f'HTTP {resp.status}'}
|
|
|
|
async with aiofiles.open(str(output_path), 'wb') as f:
|
|
total = 0
|
|
async for chunk in resp.content.iter_chunked(65536):
|
|
await f.write(chunk)
|
|
total += len(chunk)
|
|
|
|
return {
|
|
'success': True,
|
|
'file_path': str(output_path),
|
|
'file_size': total,
|
|
}
|
|
|
|
except Exception as e:
|
|
self.log(f"Download failed for {download_url}: {e}", 'error')
|
|
return {'success': False, 'error': str(e)}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Soundgasm scraping
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
|
|
"""Scrape the Soundgasm profile page, return list of {slug, title, plays}."""
|
|
url = f'{self.SOUNDGASM_BASE}/u/{username}'
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
async with session.get(url, headers=self.HEADERS) as resp:
|
|
if resp.status == 404:
|
|
return None
|
|
if resp.status != 200:
|
|
self.log(f"Soundgasm profile returned {resp.status}", 'warning')
|
|
return None
|
|
html = await resp.text()
|
|
|
|
# Parse .sound-details divs for links
|
|
entries: List[Dict] = []
|
|
# Pattern: <a href="https://soundgasm.net/u/{username}/{slug}">title</a>
|
|
# (profile page uses absolute URLs)
|
|
for m in re.finditer(
|
|
r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
|
|
html, re.IGNORECASE
|
|
):
|
|
slug = m.group(1).strip()
|
|
title = m.group(2).strip()
|
|
entries.append({'slug': slug, 'title': title})
|
|
|
|
return entries
|
|
|
|
async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
|
|
"""Fetch full post details from Soundgasm for new posts."""
|
|
profile_entries = await self._fetch_soundgasm_profile(username)
|
|
if not profile_entries:
|
|
return []
|
|
|
|
posts: List[Post] = []
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
for entry in profile_entries:
|
|
slug = entry['slug']
|
|
if slug in seen_ids:
|
|
continue
|
|
|
|
try:
|
|
detail = await self._fetch_soundgasm_detail(session, username, slug)
|
|
if detail is None:
|
|
continue
|
|
|
|
title_raw = detail.get('title', entry.get('title', slug))
|
|
clean_title, tags = parse_bracket_tags(title_raw)
|
|
description = detail.get('description', '')
|
|
audio_url = detail.get('audio_url')
|
|
|
|
if not audio_url:
|
|
continue
|
|
|
|
# Determine extension from URL
|
|
ext = '.m4a'
|
|
if audio_url:
|
|
url_path = audio_url.split('?')[0]
|
|
if '.' in url_path.split('/')[-1]:
|
|
ext = '.' + url_path.split('/')[-1].rsplit('.', 1)[1]
|
|
|
|
filename = f"{slug}{ext}"
|
|
|
|
attachment = Attachment(
|
|
name=filename,
|
|
file_type='audio',
|
|
extension=ext.lstrip('.'),
|
|
server_path=f'/u/{username}/{slug}',
|
|
download_url=audio_url,
|
|
)
|
|
|
|
post = Post(
|
|
post_id=slug,
|
|
service_id='soundgasm',
|
|
platform='soundgasm',
|
|
creator_id=username,
|
|
title=clean_title or None,
|
|
content=description or None,
|
|
published_at=None, # Soundgasm has no dates
|
|
attachments=[attachment],
|
|
auto_tags=tags,
|
|
)
|
|
posts.append(post)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')
|
|
|
|
return posts
|
|
|
|
async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
|
|
username: str, slug: str) -> Optional[Dict]:
|
|
"""Fetch a single Soundgasm audio detail page and extract metadata."""
|
|
url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'
|
|
|
|
async with session.get(url, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
return None
|
|
html = await resp.text()
|
|
|
|
# Title: <div aria-label="title"...>Title Text</div>
|
|
# or from the page title tag
|
|
title = None
|
|
title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', html)
|
|
if title_match:
|
|
title = title_match.group(1).strip()
|
|
if not title:
|
|
title_match = re.search(r'<title>([^<]+)</title>', html, re.IGNORECASE)
|
|
if title_match:
|
|
title = title_match.group(1).strip()
|
|
# Remove " - Soundgasm" suffix if present
|
|
title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()
|
|
|
|
# Description: <div class="jp-description">...</div>
|
|
description = None
|
|
desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', html, re.DOTALL)
|
|
if desc_match:
|
|
desc_html = desc_match.group(1)
|
|
# Strip HTML tags
|
|
description = re.sub(r'<br\s*/?>', '\n', desc_html)
|
|
description = re.sub(r'<[^>]+>', '', description).strip()
|
|
|
|
# Audio URL: m4a: "https://..."
|
|
audio_url = None
|
|
audio_match = re.search(r'm4a:\s*"([^"]+)"', html)
|
|
if audio_match:
|
|
audio_url = audio_match.group(1)
|
|
|
|
if not audio_url:
|
|
return None
|
|
|
|
return {
|
|
'title': title or slug,
|
|
'description': description,
|
|
'audio_url': audio_url,
|
|
}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Liltsome archive
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _ensure_liltsome_cache(self) -> bool:
|
|
"""Download/refresh the Liltsome library.json using ETag-based invalidation.
|
|
|
|
Returns True if cache is available (fresh or existing), False otherwise.
|
|
"""
|
|
etag_file = self.LILTSOME_ETAG_PATH
|
|
cache_file = self.LILTSOME_CACHE_PATH
|
|
|
|
stored_etag = None
|
|
if etag_file.exists():
|
|
try:
|
|
stored_etag = etag_file.read_text().strip()
|
|
except Exception:
|
|
pass
|
|
|
|
timeout = aiohttp.ClientTimeout(total=600) # 131MB can take a while
|
|
try:
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
# HEAD request to check ETag
|
|
async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
|
|
return cache_file.exists()
|
|
|
|
remote_etag = resp.headers.get('ETag', '').strip()
|
|
|
|
if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
|
|
self.log("Liltsome cache is fresh (ETag match)", 'debug')
|
|
return True
|
|
|
|
# Download the full library
|
|
self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
|
|
async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
self.log(f"Liltsome GET returned {resp.status}", 'warning')
|
|
return cache_file.exists()
|
|
|
|
cache_file.parent.mkdir(parents=True, exist_ok=True)
|
|
async with aiofiles.open(str(cache_file), 'wb') as f:
|
|
async for chunk in resp.content.iter_chunked(262144):
|
|
await f.write(chunk)
|
|
|
|
new_etag = resp.headers.get('ETag', remote_etag or '').strip()
|
|
|
|
if new_etag:
|
|
etag_file.write_text(new_etag)
|
|
|
|
self.log("Liltsome library.json downloaded successfully", 'info')
|
|
self._liltsome_data = None # force re-parse
|
|
return True
|
|
|
|
except Exception as e:
|
|
self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
|
|
return cache_file.exists()
|
|
|
|
async def _load_liltsome_data(self) -> Optional[Dict]:
|
|
"""Load and cache the Liltsome library data in memory."""
|
|
if self._liltsome_data is not None:
|
|
return self._liltsome_data
|
|
|
|
cache_file = self.LILTSOME_CACHE_PATH
|
|
if not cache_file.exists():
|
|
return None
|
|
|
|
try:
|
|
data = await asyncio.to_thread(self._read_liltsome_json, cache_file)
|
|
self._liltsome_data = data
|
|
return data
|
|
except Exception as e:
|
|
self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
|
|
return None
|
|
|
|
@staticmethod
|
|
def _read_liltsome_json(path: Path) -> Dict:
|
|
"""Read and parse the Liltsome JSON file (blocking, run in thread)."""
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
|
|
"""Find artist entries in Liltsome data by username (case-insensitive).
|
|
|
|
library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}
|
|
"""
|
|
await self._ensure_liltsome_cache()
|
|
data = await self._load_liltsome_data()
|
|
if not data:
|
|
return None
|
|
|
|
username_lower = username.lower()
|
|
|
|
# Top-level is {"artists": [...]}
|
|
artists = data.get('artists', []) if isinstance(data, dict) else data
|
|
|
|
for artist in artists:
|
|
artist_id = str(artist.get('id', '')).lower()
|
|
artist_name = str(artist.get('name', '')).lower()
|
|
if artist_id == username_lower or artist_name == username_lower:
|
|
# Audio entries are in files.audio
|
|
files = artist.get('files', {})
|
|
if isinstance(files, dict):
|
|
return files.get('audio', [])
|
|
return []
|
|
|
|
return None
|
|
|
|
async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
|
|
"""Convert Liltsome archive entries to Post objects."""
|
|
entries = await self._get_liltsome_entries(username)
|
|
if not entries:
|
|
return []
|
|
|
|
posts: List[Post] = []
|
|
for entry in entries:
|
|
filename = entry.get('filename', '')
|
|
path = entry.get('path', '')
|
|
title_raw = entry.get('title', filename)
|
|
entry_tags = entry.get('tags', []) # already lowercase in Liltsome
|
|
duration = None
|
|
file_size = entry.get('size')
|
|
|
|
if isinstance(entry.get('metadata'), dict):
|
|
duration = entry['metadata'].get('duration')
|
|
|
|
# Build post_id: prefix with liltsome- to avoid collision
|
|
sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
|
|
post_id = f'liltsome-{sanitized_name}'
|
|
|
|
if post_id in seen_ids:
|
|
continue
|
|
|
|
# Parse bracket tags from title for clean_title
|
|
clean_title, title_tags = parse_bracket_tags(title_raw)
|
|
|
|
# Merge: use Liltsome's pre-parsed tags + any extra from title
|
|
all_tags_set: Set[str] = set()
|
|
all_tags: List[str] = []
|
|
for t in entry_tags:
|
|
t_lower = t.strip().lower()
|
|
if t_lower and t_lower not in all_tags_set:
|
|
all_tags_set.add(t_lower)
|
|
all_tags.append(t_lower)
|
|
for t in title_tags:
|
|
if t not in all_tags_set:
|
|
all_tags_set.add(t)
|
|
all_tags.append(t)
|
|
|
|
# Build download URL
|
|
download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None
|
|
|
|
# Determine extension
|
|
ext = 'm4a'
|
|
if filename and '.' in filename:
|
|
ext = filename.rsplit('.', 1)[1].lower()
|
|
elif path and '.' in path:
|
|
ext = path.rsplit('.', 1)[1].lower()
|
|
|
|
attachment = Attachment(
|
|
name=f"{sanitized_name}.{ext}" if not filename.endswith(f'.{ext}') else filename,
|
|
file_type='audio',
|
|
extension=ext,
|
|
server_path=path or filename,
|
|
download_url=download_url,
|
|
file_size=file_size,
|
|
duration=duration,
|
|
)
|
|
|
|
post = Post(
|
|
post_id=post_id,
|
|
service_id='soundgasm',
|
|
platform='soundgasm',
|
|
creator_id=username,
|
|
title=clean_title or None,
|
|
content=None,
|
|
published_at=None,
|
|
attachments=[attachment],
|
|
auto_tags=all_tags,
|
|
)
|
|
posts.append(post)
|
|
|
|
return posts
|