""" Soundgasm + Liltsome Archive Client for Paid Content Handles: - Soundgasm profile scraping (no auth/Cloudflare needed) - Liltsome archive (liltsome.yerf.org) as supplementary source - Bracket tag parsing from audio titles: [F4M] [Whisper] etc. - Direct HTTP audio downloads (.m4a) """ import asyncio import json import os import re from pathlib import Path from typing import Dict, List, Optional, Set, Tuple from urllib.parse import quote import aiohttp import aiofiles from modules.base_module import LoggingMixin from .models import Creator, Post, Attachment # --------------------------------------------------------------------------- # Bracket tag helpers # --------------------------------------------------------------------------- def parse_bracket_tags(title: str) -> Tuple[str, List[str]]: """Extract [bracket] tags from a title, normalize, return (clean_title, tags).""" tags = re.findall(r'\[([^\]]+)\]', title) clean_title = re.sub(r'\s*\[[^\]]+\]\s*', ' ', title).strip() normalized: List[str] = [] seen: Set[str] = set() for tag in tags: tag_lower = tag.strip().lower() if tag_lower and tag_lower not in seen: seen.add(tag_lower) normalized.append(tag_lower) return clean_title, normalized def format_tag_display(tag_lower: str) -> str: """Format a normalized lowercase tag for display. Gender tags (f4m, m4f, f4a …) → uppercase. Everything else → title case. """ if re.match(r'^[a-z]+\d[a-z]+$', tag_lower): return tag_lower.upper() return tag_lower.title() # --------------------------------------------------------------------------- # SoundgasmClient # --------------------------------------------------------------------------- class SoundgasmClient(LoggingMixin): """Client for fetching audio from Soundgasm and the Liltsome archive.""" SERVICE_ID = 'soundgasm' PLATFORM = 'soundgasm' SOUNDGASM_BASE = 'https://soundgasm.net' LILTSOME_BASE = 'https://liltsome.yerf.org' LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json' LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json') LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag') HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', } def __init__(self, log_callback=None): self._init_logger('PaidContent', log_callback, default_module='Soundgasm') self._liltsome_data: Optional[Dict] = None # cached in-memory per sync run # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ async def get_profile_info(self, username: str) -> Optional[Dict]: """Return basic profile info (post count) from Soundgasm and/or Liltsome.""" post_count = 0 source = None # Try Soundgasm profile page first try: sg_posts = await self._fetch_soundgasm_profile(username) if sg_posts is not None: post_count = len(sg_posts) source = 'soundgasm' except Exception as e: self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug') # Also check Liltsome for additional posts try: lt_entries = await self._get_liltsome_entries(username) if lt_entries: post_count = max(post_count, len(lt_entries)) if source is None: source = 'liltsome' except Exception as e: self.log(f"Liltsome lookup failed for {username}: {e}", 'debug') if post_count == 0 and source is None: return None return { 'username': username, 'post_count': post_count, 'source': source, } async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None, progress_callback=None) -> List[Post]: """Fetch posts from both Soundgasm and Liltsome, deduplicating by post_id.""" known = known_post_ids or set() posts: List[Post] = [] seen_ids: Set[str] = set(known) # 1. Soundgasm (may fail if account deleted — that's OK) try: sg_posts = await self._fetch_soundgasm_posts(username, seen_ids) for p in sg_posts: if p.post_id not in seen_ids: seen_ids.add(p.post_id) posts.append(p) self.log(f"Soundgasm: {len(sg_posts)} new posts for {username}", 'info') except Exception as e: self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning') if progress_callback: progress_callback(len(posts)) # 2. Liltsome archive (always) try: lt_posts = await self._fetch_liltsome_posts(username, seen_ids) for p in lt_posts: if p.post_id not in seen_ids: seen_ids.add(p.post_id) posts.append(p) self.log(f"Liltsome: {len(lt_posts)} new posts for {username}", 'info') except Exception as e: self.log(f"Liltsome fetch failed for {username}: {e}", 'warning') if progress_callback: progress_callback(len(posts)) return posts async def download_audio(self, download_url: str, output_path: Path) -> Dict: """Download an audio file via direct HTTP GET.""" try: output_path.parent.mkdir(parents=True, exist_ok=True) timeout = aiohttp.ClientTimeout(total=300) async with aiohttp.ClientSession(timeout=timeout) as session: async with session.get(download_url, headers=self.HEADERS) as resp: if resp.status != 200: return {'success': False, 'error': f'HTTP {resp.status}'} async with aiofiles.open(str(output_path), 'wb') as f: total = 0 async for chunk in resp.content.iter_chunked(65536): await f.write(chunk) total += len(chunk) return { 'success': True, 'file_path': str(output_path), 'file_size': total, } except Exception as e: self.log(f"Download failed for {download_url}: {e}", 'error') return {'success': False, 'error': str(e)} # ------------------------------------------------------------------ # Soundgasm scraping # ------------------------------------------------------------------ async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]: """Scrape the Soundgasm profile page, return list of {slug, title, plays}.""" url = f'{self.SOUNDGASM_BASE}/u/{username}' timeout = aiohttp.ClientTimeout(total=30) async with aiohttp.ClientSession(timeout=timeout) as session: async with session.get(url, headers=self.HEADERS) as resp: if resp.status == 404: return None if resp.status != 200: self.log(f"Soundgasm profile returned {resp.status}", 'warning') return None html = await resp.text() # Parse .sound-details divs for links entries: List[Dict] = [] # Pattern: title # (profile page uses absolute URLs) for m in re.finditer( r']*>\s*([^<]+)', html, re.IGNORECASE ): slug = m.group(1).strip() title = m.group(2).strip() entries.append({'slug': slug, 'title': title}) return entries async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]: """Fetch full post details from Soundgasm for new posts.""" profile_entries = await self._fetch_soundgasm_profile(username) if not profile_entries: return [] posts: List[Post] = [] timeout = aiohttp.ClientTimeout(total=30) async with aiohttp.ClientSession(timeout=timeout) as session: for entry in profile_entries: slug = entry['slug'] if slug in seen_ids: continue try: detail = await self._fetch_soundgasm_detail(session, username, slug) if detail is None: continue title_raw = detail.get('title', entry.get('title', slug)) clean_title, tags = parse_bracket_tags(title_raw) description = detail.get('description', '') audio_url = detail.get('audio_url') if not audio_url: continue # Determine extension from URL ext = '.m4a' if audio_url: url_path = audio_url.split('?')[0] if '.' in url_path.split('/')[-1]: ext = '.' + url_path.split('/')[-1].rsplit('.', 1)[1] filename = f"{slug}{ext}" attachment = Attachment( name=filename, file_type='audio', extension=ext.lstrip('.'), server_path=f'/u/{username}/{slug}', download_url=audio_url, ) post = Post( post_id=slug, service_id='soundgasm', platform='soundgasm', creator_id=username, title=clean_title or None, content=description or None, published_at=None, # Soundgasm has no dates attachments=[attachment], auto_tags=tags, ) posts.append(post) except Exception as e: self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug') return posts async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession, username: str, slug: str) -> Optional[Dict]: """Fetch a single Soundgasm audio detail page and extract metadata.""" url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}' async with session.get(url, headers=self.HEADERS) as resp: if resp.status != 200: return None html = await resp.text() # Title:
Title Text
# or from the page title tag title = None title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', html) if title_match: title = title_match.group(1).strip() if not title: title_match = re.search(r'([^<]+)', html, re.IGNORECASE) if title_match: title = title_match.group(1).strip() # Remove " - Soundgasm" suffix if present title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip() # Description:
...
description = None desc_match = re.search(r'class="jp-description"[^>]*>(.*?)', html, re.DOTALL) if desc_match: desc_html = desc_match.group(1) # Strip HTML tags description = re.sub(r'', '\n', desc_html) description = re.sub(r'<[^>]+>', '', description).strip() # Audio URL: m4a: "https://..." audio_url = None audio_match = re.search(r'm4a:\s*"([^"]+)"', html) if audio_match: audio_url = audio_match.group(1) if not audio_url: return None return { 'title': title or slug, 'description': description, 'audio_url': audio_url, } # ------------------------------------------------------------------ # Liltsome archive # ------------------------------------------------------------------ async def _ensure_liltsome_cache(self) -> bool: """Download/refresh the Liltsome library.json using ETag-based invalidation. Returns True if cache is available (fresh or existing), False otherwise. """ etag_file = self.LILTSOME_ETAG_PATH cache_file = self.LILTSOME_CACHE_PATH stored_etag = None if etag_file.exists(): try: stored_etag = etag_file.read_text().strip() except Exception: pass timeout = aiohttp.ClientTimeout(total=600) # 131MB can take a while try: async with aiohttp.ClientSession(timeout=timeout) as session: # HEAD request to check ETag async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp: if resp.status != 200: self.log(f"Liltsome HEAD returned {resp.status}", 'warning') return cache_file.exists() remote_etag = resp.headers.get('ETag', '').strip() if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists(): self.log("Liltsome cache is fresh (ETag match)", 'debug') return True # Download the full library self.log("Downloading Liltsome library.json (this may take a while)...", 'info') async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp: if resp.status != 200: self.log(f"Liltsome GET returned {resp.status}", 'warning') return cache_file.exists() cache_file.parent.mkdir(parents=True, exist_ok=True) async with aiofiles.open(str(cache_file), 'wb') as f: async for chunk in resp.content.iter_chunked(262144): await f.write(chunk) new_etag = resp.headers.get('ETag', remote_etag or '').strip() if new_etag: etag_file.write_text(new_etag) self.log("Liltsome library.json downloaded successfully", 'info') self._liltsome_data = None # force re-parse return True except Exception as e: self.log(f"Failed to refresh Liltsome cache: {e}", 'warning') return cache_file.exists() async def _load_liltsome_data(self) -> Optional[Dict]: """Load and cache the Liltsome library data in memory.""" if self._liltsome_data is not None: return self._liltsome_data cache_file = self.LILTSOME_CACHE_PATH if not cache_file.exists(): return None try: data = await asyncio.to_thread(self._read_liltsome_json, cache_file) self._liltsome_data = data return data except Exception as e: self.log(f"Failed to parse Liltsome library.json: {e}", 'error') return None @staticmethod def _read_liltsome_json(path: Path) -> Dict: """Read and parse the Liltsome JSON file (blocking, run in thread).""" with open(path, 'r', encoding='utf-8') as f: return json.load(f) async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]: """Find artist entries in Liltsome data by username (case-insensitive). library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]} """ await self._ensure_liltsome_cache() data = await self._load_liltsome_data() if not data: return None username_lower = username.lower() # Top-level is {"artists": [...]} artists = data.get('artists', []) if isinstance(data, dict) else data for artist in artists: artist_id = str(artist.get('id', '')).lower() artist_name = str(artist.get('name', '')).lower() if artist_id == username_lower or artist_name == username_lower: # Audio entries are in files.audio files = artist.get('files', {}) if isinstance(files, dict): return files.get('audio', []) return [] return None async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]: """Convert Liltsome archive entries to Post objects.""" entries = await self._get_liltsome_entries(username) if not entries: return [] posts: List[Post] = [] for entry in entries: filename = entry.get('filename', '') path = entry.get('path', '') title_raw = entry.get('title', filename) entry_tags = entry.get('tags', []) # already lowercase in Liltsome duration = None file_size = entry.get('size') if isinstance(entry.get('metadata'), dict): duration = entry['metadata'].get('duration') # Build post_id: prefix with liltsome- to avoid collision sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path post_id = f'liltsome-{sanitized_name}' if post_id in seen_ids: continue # Parse bracket tags from title for clean_title clean_title, title_tags = parse_bracket_tags(title_raw) # Merge: use Liltsome's pre-parsed tags + any extra from title all_tags_set: Set[str] = set() all_tags: List[str] = [] for t in entry_tags: t_lower = t.strip().lower() if t_lower and t_lower not in all_tags_set: all_tags_set.add(t_lower) all_tags.append(t_lower) for t in title_tags: if t not in all_tags_set: all_tags_set.add(t) all_tags.append(t) # Build download URL download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None # Determine extension ext = 'm4a' if filename and '.' in filename: ext = filename.rsplit('.', 1)[1].lower() elif path and '.' in path: ext = path.rsplit('.', 1)[1].lower() attachment = Attachment( name=f"{sanitized_name}.{ext}" if not filename.endswith(f'.{ext}') else filename, file_type='audio', extension=ext, server_path=path or filename, download_url=download_url, file_size=file_size, duration=duration, ) post = Post( post_id=post_id, service_id='soundgasm', platform='soundgasm', creator_id=username, title=clean_title or None, content=None, published_at=None, attachments=[attachment], auto_tags=all_tags, ) posts.append(post) return posts