"""
Soundgasm + Liltsome Archive Client for Paid Content
Handles:
- Soundgasm profile scraping (no auth/Cloudflare needed)
- Liltsome archive (liltsome.yerf.org) as supplementary source
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
- Direct HTTP audio downloads (.m4a)
"""
import asyncio
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import quote
import aiohttp
import aiofiles
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
# ---------------------------------------------------------------------------
# Bracket tag helpers
# ---------------------------------------------------------------------------
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Split a title into its plain text and its [bracket] tags.

    Tags are lowercased and de-duplicated while preserving first-seen
    order; the returned title has every bracketed group (and the
    whitespace around it) collapsed away.
    """
    raw_tags = re.findall(r'\[([^\]]+)\]', title)
    stripped_title = re.sub(r'\s*\[[^\]]+\]\s*', ' ', title).strip()
    ordered: List[str] = []
    for raw in raw_tags:
        candidate = raw.strip().lower()
        # Linear membership check is fine: titles carry a handful of tags.
        if candidate and candidate not in ordered:
            ordered.append(candidate)
    return stripped_title, ordered
def format_tag_display(tag_lower: str) -> str:
    """Render a normalized lowercase tag for display.

    Tags shaped letters-digit-letters (f4m, m4f, f4a, ...) are audience
    codes and are shown uppercase; everything else gets title case.
    """
    looks_like_audience_code = re.match(r'^[a-z]+\d[a-z]+$', tag_lower) is not None
    return tag_lower.upper() if looks_like_audience_code else tag_lower.title()
# ---------------------------------------------------------------------------
# SoundgasmClient
# ---------------------------------------------------------------------------
class SoundgasmClient(LoggingMixin):
    """Client for fetching audio from Soundgasm and the Liltsome archive.

    Soundgasm serves plain HTML (no auth or Cloudflare), so posts are
    scraped with regular expressions.  The Liltsome archive publishes a
    single large ``library.json`` that is cached on disk and refreshed
    via ETag comparison.
    """

    SERVICE_ID = 'soundgasm'
    PLATFORM = 'soundgasm'
    SOUNDGASM_BASE = 'https://soundgasm.net'
    LILTSOME_BASE = 'https://liltsome.yerf.org'
    LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')
    # Browser-like headers so requests look like an ordinary visitor.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
        # Parsed library.json, cached in memory for the duration of a sync run.
        self._liltsome_data: Optional[Dict] = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def get_profile_info(self, username: str) -> Optional[Dict]:
        """Return basic profile info (post count) from Soundgasm and/or Liltsome.

        Returns a dict with ``username``, ``post_count`` and ``source``
        ('soundgasm' or 'liltsome'), or None when the creator exists on
        neither source.  Failures on one source are logged and ignored
        so the other can still answer.
        """
        post_count = 0
        source = None
        # Try the Soundgasm profile page first — it is the primary source.
        try:
            sg_posts = await self._fetch_soundgasm_profile(username)
            if sg_posts is not None:
                post_count = len(sg_posts)
                source = 'soundgasm'
        except Exception as e:
            self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')
        # Also check Liltsome: it may hold posts deleted from Soundgasm.
        try:
            lt_entries = await self._get_liltsome_entries(username)
            if lt_entries:
                post_count = max(post_count, len(lt_entries))
                if source is None:
                    source = 'liltsome'
        except Exception as e:
            self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')
        if post_count == 0 and source is None:
            return None
        return {
            'username': username,
            'post_count': post_count,
            'source': source,
        }

    async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Fetch posts from both Soundgasm and Liltsome, deduplicating by post_id.

        ``known_post_ids`` are skipped entirely; ``progress_callback`` (if
        given) receives the cumulative new-post count after each source.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        seen_ids: Set[str] = set(known)
        # 1. Soundgasm (may fail if account deleted — that's OK)
        try:
            sg_posts = await self._fetch_soundgasm_posts(username, seen_ids)
            for p in sg_posts:
                if p.post_id not in seen_ids:
                    seen_ids.add(p.post_id)
                    posts.append(p)
            self.log(f"Soundgasm: {len(sg_posts)} new posts for {username}", 'info')
        except Exception as e:
            self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')
        if progress_callback:
            progress_callback(len(posts))
        # 2. Liltsome archive (always consulted, even when Soundgasm succeeded)
        try:
            lt_posts = await self._fetch_liltsome_posts(username, seen_ids)
            for p in lt_posts:
                if p.post_id not in seen_ids:
                    seen_ids.add(p.post_id)
                    posts.append(p)
            self.log(f"Liltsome: {len(lt_posts)} new posts for {username}", 'info')
        except Exception as e:
            self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')
        if progress_callback:
            progress_callback(len(posts))
        return posts

    async def download_audio(self, download_url: str, output_path: Path) -> Dict:
        """Download an audio file via direct HTTP GET.

        Returns ``{'success': True, 'file_path': ..., 'file_size': ...}`` on
        success, or ``{'success': False, 'error': ...}`` on any failure.
        Never raises.
        """
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            timeout = aiohttp.ClientTimeout(total=300)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(download_url, headers=self.HEADERS) as resp:
                    if resp.status != 200:
                        return {'success': False, 'error': f'HTTP {resp.status}'}
                    # Stream to disk in 64 KiB chunks to bound memory use.
                    async with aiofiles.open(str(output_path), 'wb') as f:
                        total = 0
                        async for chunk in resp.content.iter_chunked(65536):
                            await f.write(chunk)
                            total += len(chunk)
            return {
                'success': True,
                'file_path': str(output_path),
                'file_size': total,
            }
        except Exception as e:
            self.log(f"Download failed for {download_url}: {e}", 'error')
            return {'success': False, 'error': str(e)}

    # ------------------------------------------------------------------
    # Soundgasm scraping
    # ------------------------------------------------------------------
    async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
        """Scrape the Soundgasm profile page; return a list of {slug, title}.

        Returns None when the profile is missing (404) or unreachable; an
        empty list means the profile exists but lists no audios.
        """
        url = f'{self.SOUNDGASM_BASE}/u/{username}'
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 404:
                    return None
                if resp.status != 200:
                    self.log(f"Soundgasm profile returned {resp.status}", 'warning')
                    return None
                html = await resp.text()
        # Each audio is listed as an anchor inside a .sound-details div:
        #   <a href="https://soundgasm.net/u/<user>/<slug>">Title</a>
        # (the profile page uses absolute URLs)
        entries: List[Dict] = []
        for m in re.finditer(
            r'<a\s+href="https?://soundgasm\.net/u/[^/"]+/([^/"]+)"[^>]*>\s*([^<]+)',
            html, re.IGNORECASE
        ):
            slug = m.group(1).strip()
            title = m.group(2).strip()
            entries.append({'slug': slug, 'title': title})
        return entries

    async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
        """Fetch full post details from Soundgasm for posts not in ``seen_ids``."""
        profile_entries = await self._fetch_soundgasm_profile(username)
        if not profile_entries:
            return []
        posts: List[Post] = []
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for entry in profile_entries:
                slug = entry['slug']
                if slug in seen_ids:
                    continue
                try:
                    detail = await self._fetch_soundgasm_detail(session, username, slug)
                    if detail is None:
                        continue
                    title_raw = detail.get('title', entry.get('title', slug))
                    clean_title, tags = parse_bracket_tags(title_raw)
                    description = detail.get('description', '')
                    audio_url = detail.get('audio_url')
                    if not audio_url:
                        continue
                    # Determine extension from the URL path (default .m4a).
                    ext = '.m4a'
                    url_path = audio_url.split('?')[0]
                    if '.' in url_path.split('/')[-1]:
                        ext = '.' + url_path.split('/')[-1].rsplit('.', 1)[1]
                    filename = f"{slug}{ext}"
                    attachment = Attachment(
                        name=filename,
                        file_type='audio',
                        extension=ext.lstrip('.'),
                        server_path=f'/u/{username}/{slug}',
                        download_url=audio_url,
                    )
                    post = Post(
                        post_id=slug,
                        service_id='soundgasm',
                        platform='soundgasm',
                        creator_id=username,
                        title=clean_title or None,
                        content=description or None,
                        published_at=None,  # Soundgasm pages expose no dates
                        attachments=[attachment],
                        auto_tags=tags,
                    )
                    posts.append(post)
                except Exception as e:
                    self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')
        return posts

    async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
                                      username: str, slug: str) -> Optional[Dict]:
        """Fetch a single Soundgasm audio detail page and extract metadata.

        Returns {'title', 'description', 'audio_url'} or None when the page
        is unavailable or no audio URL could be found.
        """
        url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'
        async with session.get(url, headers=self.HEADERS) as resp:
            if resp.status != 200:
                return None
            html = await resp.text()
        # Title: <div aria-label="title">Title Text</div>,
        # falling back to the page <title> tag.
        title = None
        title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', html)
        if title_match:
            title = title_match.group(1).strip()
        if not title:
            title_match = re.search(r'<title>([^<]+)</title>', html, re.IGNORECASE)
            if title_match:
                title = title_match.group(1).strip()
                # Remove " - Soundgasm" suffix if present
                title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()
        # Description: <div class="jp-description">...</div>
        description = None
        desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', html, re.DOTALL)
        if desc_match:
            desc_html = desc_match.group(1)
            # Convert <br> to newlines, then strip any remaining HTML tags.
            description = re.sub(r'<br\s*/?>', '\n', desc_html)
            description = re.sub(r'<[^>]+>', '', description).strip()
        # Audio URL appears in the inline player setup: m4a: "https://..."
        audio_url = None
        audio_match = re.search(r'm4a:\s*"([^"]+)"', html)
        if audio_match:
            audio_url = audio_match.group(1)
        if not audio_url:
            return None
        return {
            'title': title or slug,
            'description': description,
            'audio_url': audio_url,
        }

    # ------------------------------------------------------------------
    # Liltsome archive
    # ------------------------------------------------------------------
    async def _ensure_liltsome_cache(self) -> bool:
        """Download/refresh the Liltsome library.json using ETag-based invalidation.

        Returns True if cache is available (fresh or existing), False otherwise.
        """
        etag_file = self.LILTSOME_ETAG_PATH
        cache_file = self.LILTSOME_CACHE_PATH
        stored_etag = None
        if etag_file.exists():
            try:
                stored_etag = etag_file.read_text().strip()
            except Exception:
                pass  # unreadable ETag just forces a re-download
        timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # HEAD request to check the remote ETag before downloading.
                async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                    if resp.status != 200:
                        self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
                        return cache_file.exists()
                    remote_etag = resp.headers.get('ETag', '').strip()
                if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
                    self.log("Liltsome cache is fresh (ETag match)", 'debug')
                    return True
                # Download the full library
                self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
                async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                    if resp.status != 200:
                        self.log(f"Liltsome GET returned {resp.status}", 'warning')
                        return cache_file.exists()
                    cache_file.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(str(cache_file), 'wb') as f:
                        async for chunk in resp.content.iter_chunked(262144):
                            await f.write(chunk)
                    # Prefer the GET response's ETag; fall back to the HEAD one.
                    new_etag = resp.headers.get('ETag', remote_etag or '').strip()
                    if new_etag:
                        etag_file.write_text(new_etag)
                self.log("Liltsome library.json downloaded successfully", 'info')
                self._liltsome_data = None  # force re-parse of the new file
                return True
        except Exception as e:
            self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
            return cache_file.exists()

    async def _load_liltsome_data(self) -> Optional[Dict]:
        """Load and cache the Liltsome library data in memory."""
        if self._liltsome_data is not None:
            return self._liltsome_data
        cache_file = self.LILTSOME_CACHE_PATH
        if not cache_file.exists():
            return None
        try:
            # The file is large; parse it off the event loop.
            data = await asyncio.to_thread(self._read_liltsome_json, cache_file)
            self._liltsome_data = data
            return data
        except Exception as e:
            self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
            return None

    @staticmethod
    def _read_liltsome_json(path: Path) -> Dict:
        """Read and parse the Liltsome JSON file (blocking, run in thread)."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
        """Find artist entries in Liltsome data by username (case-insensitive).

        library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}
        Returns the artist's audio entry list, [] when the artist exists but
        has no usable file map, or None when the artist is not present.
        """
        await self._ensure_liltsome_cache()
        data = await self._load_liltsome_data()
        if not data:
            return None
        username_lower = username.lower()
        # Top-level is {"artists": [...]}; tolerate a bare list too.
        artists = data.get('artists', []) if isinstance(data, dict) else data
        for artist in artists:
            artist_id = str(artist.get('id', '')).lower()
            artist_name = str(artist.get('name', '')).lower()
            if artist_id == username_lower or artist_name == username_lower:
                # Audio entries live under files.audio
                files = artist.get('files', {})
                if isinstance(files, dict):
                    return files.get('audio', [])
                return []
        return None

    async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
        """Convert Liltsome archive entries to Post objects, skipping seen_ids."""
        entries = await self._get_liltsome_entries(username)
        if not entries:
            return []
        posts: List[Post] = []
        for entry in entries:
            filename = entry.get('filename', '')
            path = entry.get('path', '')
            if not filename and not path:
                # No file reference at all: nothing to identify or download.
                continue
            title_raw = entry.get('title', filename)
            entry_tags = entry.get('tags', [])  # already lowercase in Liltsome
            duration = None
            file_size = entry.get('size')
            if isinstance(entry.get('metadata'), dict):
                duration = entry['metadata'].get('duration')
            # Build post_id: prefix with liltsome- to avoid collision with
            # Soundgasm slugs.
            sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
            post_id = f'liltsome-{sanitized_name}'
            if post_id in seen_ids:
                continue
            # Parse bracket tags from title for clean_title
            clean_title, title_tags = parse_bracket_tags(title_raw)
            # Merge: use Liltsome's pre-parsed tags + any extra from title,
            # preserving order and de-duplicating.
            all_tags_set: Set[str] = set()
            all_tags: List[str] = []
            for t in entry_tags:
                t_lower = t.strip().lower()
                if t_lower and t_lower not in all_tags_set:
                    all_tags_set.add(t_lower)
                    all_tags.append(t_lower)
            for t in title_tags:
                if t not in all_tags_set:
                    all_tags_set.add(t)
                    all_tags.append(t)
            # Build download URL (quote the path but keep '/' separators).
            download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None
            # Determine extension from filename, then path, defaulting to m4a.
            ext = 'm4a'
            if filename and '.' in filename:
                ext = filename.rsplit('.', 1)[1].lower()
            elif path and '.' in path:
                ext = path.rsplit('.', 1)[1].lower()
            attachment = Attachment(
                name=f"{sanitized_name}.{ext}" if not filename.endswith(f'.{ext}') else filename,
                file_type='audio',
                extension=ext,
                server_path=path or filename,
                download_url=download_url,
                file_size=file_size,
                duration=duration,
            )
            post = Post(
                post_id=post_id,
                service_id='soundgasm',
                platform='soundgasm',
                creator_id=username,
                title=clean_title or None,
                content=None,
                published_at=None,
                attachments=[attachment],
                auto_tags=all_tags,
            )
            posts.append(post)
        return posts