Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions
--- a/modules/paid_content/soundgasm_client.py
+++ b/modules/paid_content/soundgasm_client.py
@@ -0,0 +1,508 @@
+"""
+Soundgasm + Liltsome Archive Client for Paid Content
+
+Handles:
+- Soundgasm profile scraping (no auth/Cloudflare needed)
+- Liltsome archive (liltsome.yerf.org) as supplementary source
+- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
+- Direct HTTP audio downloads (.m4a)
+"""
+
+import asyncio
+import json
+import os
+import re
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+from urllib.parse import quote
+
+import aiohttp
+import aiofiles
+
+from modules.base_module import LoggingMixin
+from .models import Creator, Post, Attachment
+
+
+# ---------------------------------------------------------------------------
+# Bracket tag helpers
+# ---------------------------------------------------------------------------
+
+def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
+    """Split a title into its plain text and its ``[bracket]`` tags.
+
+    Tags are stripped, lowercased and de-duplicated, keeping first-seen order.
+    A whole run of bracket groups (with the whitespace around it) collapses to
+    one space, so ``"Intro [F4M] [Whisper] outro"`` yields ``"Intro outro"``
+    rather than leaving a double space where the tags were.
+
+    Args:
+        title: Raw title, e.g. ``"[F4M] Hello [Whisper]"``.
+
+    Returns:
+        ``(clean_title, tags)`` - the title without bracket groups, and the
+        normalized tags.
+    """
+    raw_tags = re.findall(r'\[([^\]]+)\]', title)
+    # The outer group consumes consecutive bracket groups in a single match.
+    clean_title = re.sub(r'(?:\s*\[[^\]]+\]\s*)+', ' ', title).strip()
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    unique = dict.fromkeys(tag.strip().lower() for tag in raw_tags)
+    unique.pop('', None)  # a whitespace-only tag such as "[ ]" carries no information
+    return clean_title, list(unique)
+
+
+def format_tag_display(tag_lower: str) -> str:
+    """Format a normalized lowercase tag for display.
+
+    Gender tags (f4m, m4f, f4a …) → uppercase.
+    Everything else → title case, with an apostrophe kept inside its word
+    ("girlfriend's pov" → "Girlfriend's Pov"; ``str.title()`` would give
+    "Girlfriend'S Pov").
+
+    Args:
+        tag_lower: Tag text, expected to be lowercase already.
+
+    Returns:
+        The display form of the tag.
+    """
+    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower):
+        return tag_lower.upper()
+    # Capitalise each run of letters; a single apostrophe may sit inside a run.
+    # Any other character (hyphen, digit, space) still starts a new word, which
+    # matches what str.title() does for those separators.
+    return re.sub(
+        r"[^\W\d_]+(?:['’][^\W\d_]+)?",
+        lambda word: word.group(0).capitalize(),
+        tag_lower,
+    )
+
+
+# ---------------------------------------------------------------------------
+# SoundgasmClient
+# ---------------------------------------------------------------------------
+
+class SoundgasmClient(LoggingMixin):
+    """Client for fetching audio from Soundgasm and the Liltsome archive.
+
+    Soundgasm pages are scraped with regular expressions. The Liltsome archive
+    is read from a copy of its ``library.json`` that is cached on disk and
+    refreshed using the server's ETag.
+    """
+
+    # Source identifiers. The Post objects built in this class use literal
+    # 'soundgasm' strings rather than these constants.
+    # NOTE(review): presumably read by callers elsewhere - confirm.
+    SERVICE_ID = 'soundgasm'
+    PLATFORM = 'soundgasm'
+
+    SOUNDGASM_BASE = 'https://soundgasm.net'
+    LILTSOME_BASE = 'https://liltsome.yerf.org'
+    LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
+    # On-disk copy of library.json and the ETag it was downloaded with.
+    # These are absolute, hard-coded paths.
+    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
+    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')
+
+    # Browser-like request headers, passed on every HTTP request this client makes.
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                       '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.9',
+    }
+
+    def __init__(self, log_callback=None):
+        """Set up logging and start with an empty in-memory Liltsome cache.
+
+        Args:
+            log_callback: Forwarded to ``_init_logger``.
+                NOTE(review): its contract is presumably defined by
+                ``LoggingMixin``, which is not visible here - confirm.
+        """
+        self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
+        # Parsed library.json; filled lazily by _load_liltsome_data.
+        self._liltsome_data: Optional[Dict] = None  # cached in-memory per sync run
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    async def get_profile_info(self, username: str) -> Optional[Dict]:
+        """Summarize what is known about *username* across both sources.
+
+        The Soundgasm profile page is consulted first, then the Liltsome
+        archive. A failure in either lookup is logged at debug level and does
+        not stop the other one.
+
+        Returns:
+            ``{'username', 'post_count', 'source'}`` where ``post_count`` is the
+            larger of the two counts and ``source`` names the first source that
+            answered, or ``None`` when neither source knows the user.
+        """
+        count = 0
+        origin = None
+
+        # Soundgasm: an existing profile counts even when it lists no audio.
+        try:
+            profile_entries = await self._fetch_soundgasm_profile(username)
+            if profile_entries is not None:
+                count, origin = len(profile_entries), 'soundgasm'
+        except Exception as e:
+            self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')
+
+        # Liltsome: only a non-empty result contributes.
+        try:
+            archive_entries = await self._get_liltsome_entries(username)
+            if archive_entries:
+                count = max(count, len(archive_entries))
+                origin = origin if origin is not None else 'liltsome'
+        except Exception as e:
+            self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')
+
+        if origin is None and count == 0:
+            return None
+
+        return {'username': username, 'post_count': count, 'source': origin}
+
+    async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
+                        progress_callback=None) -> List[Post]:
+        """Collect new posts for *username* from Soundgasm, then from Liltsome.
+
+        Posts whose ``post_id`` is in ``known_post_ids`` or was already collected
+        during this call are dropped. A failing source is logged as a warning and
+        skipped. ``progress_callback``, when given, is called with the running
+        total after each source.
+        """
+        collected: List[Post] = []
+        seen_ids: Set[str] = set(known_post_ids or set())
+
+        def absorb(batch: List[Post]) -> None:
+            # Keep only posts whose id has not been seen yet.
+            for post in batch:
+                if post.post_id in seen_ids:
+                    continue
+                seen_ids.add(post.post_id)
+                collected.append(post)
+
+        def report() -> None:
+            if progress_callback:
+                progress_callback(len(collected))
+
+        # Source 1: Soundgasm itself (the account may no longer exist).
+        try:
+            soundgasm_batch = await self._fetch_soundgasm_posts(username, seen_ids)
+            absorb(soundgasm_batch)
+            self.log(f"Soundgasm: {len(soundgasm_batch)} new posts for {username}", 'info')
+        except Exception as e:
+            self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')
+        report()
+
+        # Source 2: the Liltsome archive, always consulted.
+        try:
+            liltsome_batch = await self._fetch_liltsome_posts(username, seen_ids)
+            absorb(liltsome_batch)
+            self.log(f"Liltsome: {len(liltsome_batch)} new posts for {username}", 'info')
+        except Exception as e:
+            self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')
+        report()
+
+        return collected
+
+    async def download_audio(self, download_url: str, output_path: Path) -> Dict:
+        """Download an audio file via direct HTTP GET.
+
+        The response is streamed to ``<output_path>.part`` and moved onto
+        ``output_path`` only after the whole body has been written, so an
+        interrupted transfer never leaves a truncated file at the final path.
+
+        Args:
+            download_url: URL of the audio file.
+            output_path: Destination file; parent directories are created.
+
+        Returns:
+            ``{'success': True, 'file_path': str, 'file_size': int}`` on success,
+            otherwise ``{'success': False, 'error': str}``. Errors are logged and
+            reported through the return value rather than raised.
+        """
+        partial_path = output_path.parent / (output_path.name + '.part')
+        try:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+
+            timeout = aiohttp.ClientTimeout(total=300)
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(download_url, headers=self.HEADERS) as resp:
+                    if resp.status != 200:
+                        return {'success': False, 'error': f'HTTP {resp.status}'}
+
+                    total = 0
+                    async with aiofiles.open(str(partial_path), 'wb') as f:
+                        async for chunk in resp.content.iter_chunked(65536):
+                            await f.write(chunk)
+                            total += len(chunk)
+
+            # Same directory, so the rename cannot cross filesystems.
+            partial_path.replace(output_path)
+
+            return {
+                'success': True,
+                'file_path': str(output_path),
+                'file_size': total,
+            }
+
+        except Exception as e:
+            self.log(f"Download failed for {download_url}: {e}", 'error')
+            return {'success': False, 'error': str(e)}
+
+        finally:
+            # No-op after a successful replace; also runs if the task is cancelled.
+            try:
+                partial_path.unlink(missing_ok=True)
+            except OSError as cleanup_error:
+                self.log(f"Could not remove partial download {partial_path}: {cleanup_error}", 'debug')
+
+    # ------------------------------------------------------------------
+    # Soundgasm scraping
+    # ------------------------------------------------------------------
+
+    async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
+        """Scrape a Soundgasm profile page and list the audio links found on it.
+
+        Args:
+            username: Soundgasm account name; link matching is case-insensitive.
+
+        Returns:
+            A list of ``{'slug', 'title'}`` dicts in page order, one per distinct
+            slug, with HTML entities in the title decoded. ``None`` when the
+            page does not return HTTP 200.
+        """
+        # Imported under a specific name so it cannot be shadowed by a local.
+        from html import unescape
+
+        url = f'{self.SOUNDGASM_BASE}/u/{username}'
+        timeout = aiohttp.ClientTimeout(total=30)
+
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(url, headers=self.HEADERS) as resp:
+                if resp.status == 404:
+                    return None
+                if resp.status != 200:
+                    self.log(f"Soundgasm profile returned {resp.status}", 'warning')
+                    return None
+                page = await resp.text()
+
+        # Pattern: <a href="https://soundgasm.net/u/{username}/{slug}">title</a>
+        # (profile page uses absolute URLs; the host part is optional here)
+        link_pattern = re.compile(
+            r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
+            re.IGNORECASE,
+        )
+
+        entries: List[Dict] = []
+        listed: Set[str] = set()
+        for link in link_pattern.finditer(page):
+            slug = link.group(1).strip()
+            if slug in listed:
+                # The same audio linked twice must not be counted or fetched twice.
+                continue
+            listed.add(slug)
+            entries.append({'slug': slug, 'title': unescape(link.group(2)).strip()})
+
+        return entries
+
+    async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
+        """Fetch full post details from Soundgasm for posts not seen before.
+
+        Args:
+            username: Soundgasm account name.
+            seen_ids: Post ids to skip. This set is only read: ``get_posts``
+                checks the returned posts against the same set afterwards, so
+                adding to it here would make every new post look like a duplicate.
+
+        Returns:
+            One ``Post`` with a single audio ``Attachment`` per new slug that has
+            a playable audio URL. Slugs whose detail page fails are logged at
+            debug level and skipped.
+        """
+        profile_entries = await self._fetch_soundgasm_profile(username)
+        if not profile_entries:
+            return []
+
+        posts: List[Post] = []
+        # Slugs already handled in this call, so a slug listed twice is fetched once.
+        attempted: Set[str] = set()
+        timeout = aiohttp.ClientTimeout(total=30)
+
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            for entry in profile_entries:
+                slug = entry['slug']
+                if slug in seen_ids or slug in attempted:
+                    continue
+                attempted.add(slug)
+
+                try:
+                    detail = await self._fetch_soundgasm_detail(session, username, slug)
+                    if detail is None:
+                        continue
+
+                    audio_url = detail.get('audio_url')
+                    if not audio_url:
+                        continue
+
+                    title_raw = detail.get('title', entry.get('title', slug))
+                    clean_title, tags = parse_bracket_tags(title_raw)
+                    description = detail.get('description', '')
+
+                    # Extension comes from the last URL path segment; default .m4a
+                    ext = '.m4a'
+                    last_segment = audio_url.split('?')[0].split('/')[-1]
+                    if '.' in last_segment:
+                        ext = '.' + last_segment.rsplit('.', 1)[1]
+
+                    attachment = Attachment(
+                        name=f"{slug}{ext}",
+                        file_type='audio',
+                        extension=ext.lstrip('.'),
+                        server_path=f'/u/{username}/{slug}',
+                        download_url=audio_url,
+                    )
+
+                    posts.append(Post(
+                        post_id=slug,
+                        service_id='soundgasm',
+                        platform='soundgasm',
+                        creator_id=username,
+                        title=clean_title or None,
+                        content=description or None,
+                        published_at=None,  # Soundgasm has no dates
+                        attachments=[attachment],
+                        auto_tags=tags,
+                    ))
+
+                except Exception as e:
+                    self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')
+
+        return posts
+
+    async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
+                                       username: str, slug: str) -> Optional[Dict]:
+        """Fetch a single Soundgasm audio detail page and extract metadata.
+
+        Args:
+            session: Open aiohttp session used for the request.
+            username: Soundgasm account name.
+            slug: Audio slug as it appears in the profile link.
+
+        Returns:
+            ``{'title', 'description', 'audio_url'}`` with HTML entities decoded
+            in the title and description, or ``None`` when the page does not
+            return HTTP 200 or contains no audio URL. ``title`` falls back to
+            *slug*; ``description`` may be ``None``.
+        """
+        # Imported under a specific name so it cannot be shadowed by a local.
+        from html import unescape
+
+        url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'
+
+        async with session.get(url, headers=self.HEADERS) as resp:
+            if resp.status != 200:
+                return None
+            page = await resp.text()
+
+        # Audio URL: m4a: "https://..."  (without it the page is of no use)
+        audio_match = re.search(r'm4a:\s*"([^"]+)"', page)
+        audio_url = audio_match.group(1) if audio_match else None
+        if not audio_url:
+            return None
+
+        # Title: <div aria-label="title"...>Title Text</div>
+        # or from the page title tag
+        title = None
+        title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', page)
+        if title_match:
+            title = unescape(title_match.group(1)).strip()
+        if not title:
+            title_match = re.search(r'<title>([^<]+)</title>', page, re.IGNORECASE)
+            if title_match:
+                title = unescape(title_match.group(1)).strip()
+                # Remove " - Soundgasm" suffix if present
+                title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()
+
+        # Description: <div class="jp-description">...</div>
+        description = None
+        desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', page, re.DOTALL)
+        if desc_match:
+            text = re.sub(r'<br\s*/?>', '\n', desc_match.group(1))
+            text = re.sub(r'<[^>]+>', '', text)
+            # Decode entities only after tags are stripped, so an escaped
+            # "&lt;b&gt;" survives as literal text instead of being removed.
+            description = unescape(text).strip()
+
+        return {
+            'title': title or slug,
+            'description': description,
+            'audio_url': audio_url,
+        }
+
+    # ------------------------------------------------------------------
+    # Liltsome archive
+    # ------------------------------------------------------------------
+
+    async def _ensure_liltsome_cache(self) -> bool:
+        """Download/refresh the Liltsome library.json using ETag-based invalidation.
+
+        A HEAD request fetches the remote ETag; when it equals the stored one
+        and the cache file exists, nothing is downloaded. Otherwise the library
+        is streamed to a ``.part`` file and moved over the cache only once it is
+        complete, so a failed transfer leaves the previous cache untouched.
+
+        Returns True if cache is available (fresh or existing), False otherwise.
+        """
+        etag_file = self.LILTSOME_ETAG_PATH
+        cache_file = self.LILTSOME_CACHE_PATH
+        partial_file = cache_file.parent / (cache_file.name + '.part')
+
+        stored_etag = None
+        if etag_file.exists():
+            try:
+                stored_etag = etag_file.read_text().strip()
+            except Exception as e:
+                # An unreadable ETag only costs a re-download.
+                self.log(f"Could not read stored Liltsome ETag: {e}", 'debug')
+
+        timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
+        try:
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                # HEAD request to check ETag
+                async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
+                    if resp.status != 200:
+                        self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
+                        return cache_file.exists()
+
+                    remote_etag = resp.headers.get('ETag', '').strip()
+
+                if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
+                    self.log("Liltsome cache is fresh (ETag match)", 'debug')
+                    return True
+
+                # Download the full library
+                self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
+                async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
+                    if resp.status != 200:
+                        self.log(f"Liltsome GET returned {resp.status}", 'warning')
+                        return cache_file.exists()
+
+                    cache_file.parent.mkdir(parents=True, exist_ok=True)
+                    async with aiofiles.open(str(partial_file), 'wb') as f:
+                        async for chunk in resp.content.iter_chunked(262144):
+                            await f.write(chunk)
+
+                    new_etag = resp.headers.get('ETag', remote_etag or '').strip()
+
+                # Publish the complete file, then drop the parsed copy right away
+                # so it is re-read even if writing the ETag below fails.
+                partial_file.replace(cache_file)
+                self._liltsome_data = None  # force re-parse
+
+                if new_etag:
+                    etag_file.write_text(new_etag)
+
+                self.log("Liltsome library.json downloaded successfully", 'info')
+                return True
+
+        except Exception as e:
+            self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
+            return cache_file.exists()
+
+        finally:
+            # No-op after a successful replace; also runs if the task is cancelled.
+            try:
+                partial_file.unlink(missing_ok=True)
+            except OSError as cleanup_error:
+                self.log(f"Could not remove partial Liltsome download: {cleanup_error}", 'debug')
+
+    async def _load_liltsome_data(self) -> Optional[Dict]:
+        """Return the parsed Liltsome library, reading it from disk at most once.
+
+        The parsed result is kept on the instance. Returns ``None`` when the
+        cache file is missing or cannot be parsed (the latter is logged as an
+        error).
+        """
+        already_parsed = self._liltsome_data
+        if already_parsed is not None:
+            return already_parsed
+
+        library_path = self.LILTSOME_CACHE_PATH
+        if not library_path.exists():
+            return None
+
+        try:
+            # Parsing is blocking work, so it is handed to a worker thread.
+            parsed = await asyncio.to_thread(self._read_liltsome_json, library_path)
+        except Exception as e:
+            self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
+            return None
+
+        self._liltsome_data = parsed
+        return parsed
+
+    @staticmethod
+    def _read_liltsome_json(path: Path) -> Dict:
+        """Read the UTF-8 JSON file at *path* and return the decoded document.
+
+        Blocking; meant to be run in a worker thread.
+        """
+        with open(path, 'r', encoding='utf-8') as handle:
+            raw_text = handle.read()
+        return json.loads(raw_text)
+
+    async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
+        """Look up an artist's audio entries in the Liltsome library.
+
+        The artist is matched case-insensitively on ``id`` or ``name``.
+
+        library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}
+
+        Returns:
+            The artist's ``files.audio`` value (``[]`` when ``files`` is not a
+            dict), or ``None`` when the library is unavailable or holds no
+            matching artist.
+        """
+        await self._ensure_liltsome_cache()
+        library = await self._load_liltsome_data()
+        if not library:
+            return None
+
+        wanted = username.lower()
+
+        # Normally {"artists": [...]}; a bare top-level list is accepted too.
+        artists = library.get('artists', []) if isinstance(library, dict) else library
+
+        def is_wanted(artist) -> bool:
+            candidates = (
+                str(artist.get('id', '')).lower(),
+                str(artist.get('name', '')).lower(),
+            )
+            return wanted in candidates
+
+        match = next((artist for artist in artists if is_wanted(artist)), None)
+        if match is None:
+            return None
+
+        # Audio entries are in files.audio
+        files = match.get('files', {})
+        if not isinstance(files, dict):
+            return []
+        return files.get('audio', [])
+
+    async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
+        """Convert Liltsome archive entries to Post objects.
+
+        Args:
+            username: Artist to look up in the archive.
+            seen_ids: Post ids to skip; only read here.
+
+        Returns:
+            One ``Post`` with a single audio ``Attachment`` per archive entry
+            whose id (``liltsome-<sanitized filename>``, or ``liltsome-<path>``
+            when there is no filename) is not in ``seen_ids``. Tags are the
+            entry's own tags followed by any extra bracket tags from its title.
+        """
+        entries = await self._get_liltsome_entries(username)
+        if not entries:
+            return []
+
+        posts: List[Post] = []
+        for entry in entries:
+            # "or ..." also covers keys that are present but null in the JSON.
+            filename = entry.get('filename') or ''
+            path = entry.get('path') or ''
+            entry_tags = entry.get('tags') or []
+            title_raw = entry.get('title')
+            if title_raw is None:
+                title_raw = filename
+            file_size = entry.get('size')
+            metadata = entry.get('metadata')
+            duration = metadata.get('duration') if isinstance(metadata, dict) else None
+
+            # Build post_id: prefix with liltsome- to avoid collision.
+            # This derivation must stay stable so previously stored ids still match.
+            sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
+            post_id = f'liltsome-{sanitized_name}'
+
+            if post_id in seen_ids:
+                continue
+
+            # Parse bracket tags from title for clean_title
+            clean_title, title_tags = parse_bracket_tags(title_raw)
+
+            # Merge: Liltsome's pre-parsed tags first, then extras from the title.
+            # dict keys keep insertion order and drop duplicates.
+            merged_tags = dict.fromkeys(
+                t.strip().lower() for t in entry_tags if isinstance(t, str)
+            )
+            merged_tags.pop('', None)
+            for t in title_tags:
+                merged_tags.setdefault(t, None)
+            all_tags = list(merged_tags)
+
+            # Build download URL
+            download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None
+
+            # Determine extension from the file name itself, never from a
+            # directory component that happens to contain a dot.
+            path_basename = path.rsplit('/', 1)[-1]
+            ext = 'm4a'
+            if filename and '.' in filename:
+                ext = filename.rsplit('.', 1)[1].lower()
+            elif '.' in path_basename:
+                ext = path_basename.rsplit('.', 1)[1].lower()
+
+            # Keep the archive's own name when it already carries the extension
+            # (compared case-insensitively, so "Foo.M4A" is not turned into
+            # "Foo.M4A.m4a"); otherwise append the extension.
+            display_name = filename or path_basename
+            if display_name.lower().endswith(f'.{ext}'):
+                attachment_name = display_name
+            else:
+                attachment_name = f"{sanitized_name}.{ext}"
+
+            attachment = Attachment(
+                name=attachment_name,
+                file_type='audio',
+                extension=ext,
+                server_path=path or filename,
+                download_url=download_url,
+                file_size=file_size,
+                duration=duration,
+            )
+
+            posts.append(Post(
+                post_id=post_id,
+                service_id='soundgasm',
+                platform='soundgasm',
+                creator_id=username,
+                title=clean_title or None,
+                content=None,
+                published_at=None,
+                attachments=[attachment],
+                auto_tags=all_tags,
+            ))
+
+        return posts