508
modules/paid_content/soundgasm_client.py
Normal file
508
modules/paid_content/soundgasm_client.py
Normal file
@@ -0,0 +1,508 @@
|
||||
"""
|
||||
Soundgasm + Liltsome Archive Client for Paid Content
|
||||
|
||||
Handles:
|
||||
- Soundgasm profile scraping (no auth/Cloudflare needed)
|
||||
- Liltsome archive (liltsome.yerf.org) as supplementary source
|
||||
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
|
||||
- Direct HTTP audio downloads (.m4a)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
import aiohttp
|
||||
import aiofiles
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bracket tag helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Split a raw audio title into its plain text and its ``[bracket]`` tags.

    Args:
        title: Raw title, e.g. ``"[F4M] [Whisper] Good night"``.

    Returns:
        A ``(clean_title, tags)`` tuple. ``clean_title`` is the title with
        every non-empty ``[...]`` group removed and whitespace runs collapsed
        to single spaces. ``tags`` holds the stripped, lowercased tag texts in
        first-seen order with duplicates removed.
    """
    raw_tags = re.findall(r'\[([^\]]+)\]', title)

    # Swap every tag for a space, then collapse whitespace runs so that
    # adjacent tags ("a [x] [y] b") do not leave double spaces behind.
    without_tags = re.sub(r'\[[^\]]+\]', ' ', title)
    clean_title = ' '.join(without_tags.split())

    lowered = (tag.strip().lower() for tag in raw_tags)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    normalized = list(dict.fromkeys(tag for tag in lowered if tag))
    return clean_title, normalized
|
||||
|
||||
|
||||
def format_tag_display(tag_lower: str) -> str:
    """Format a normalized lowercase tag for display.

    Gender tags (f4m, m4f, f4a ...) are upper-cased. Everything else is
    title-cased word by word.

    Args:
        tag_lower: Tag text that has already been stripped and lowercased.

    Returns:
        The display form of the tag.
    """
    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower):
        return tag_lower.upper()

    # str.title() treats an apostrophe as a word boundary and would turn
    # "girlfriend's" into "Girlfriend'S". Capitalize each run of letters
    # instead, keeping an apostrophe suffix attached to its word.
    return re.sub(
        r"[^\W\d_]+(?:['’][^\W\d_]+)?",
        lambda word: word.group(0).capitalize(),
        tag_lower,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SoundgasmClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class SoundgasmClient(LoggingMixin):
    """Fetch audio posts from Soundgasm and from the Liltsome archive."""

    # Service / platform identifiers for this source.
    SERVICE_ID = 'soundgasm'
    PLATFORM = 'soundgasm'

    # Remote endpoints.
    SOUNDGASM_BASE = 'https://soundgasm.net'
    LILTSOME_BASE = 'https://liltsome.yerf.org'
    LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'

    # On-disk copy of library.json and the ETag it was downloaded with.
    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')

    # Browser-like headers passed to the HTTP requests made by this client.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
|
||||
|
||||
def __init__(self, log_callback=None) -> None:
    """Create a client with an empty in-memory Liltsome cache.

    Args:
        log_callback: Optional callback handed through to ``_init_logger``.
    """
    # Parsed library.json, cached in memory per sync run.
    self._liltsome_data: Optional[Dict] = None
    self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_profile_info(self, username: str) -> Optional[Dict]:
    """Summarise what is known about ``username`` on Soundgasm and Liltsome.

    Both sources are queried; a failure in either one is logged at debug
    level and otherwise ignored.

    Args:
        username: Profile name to look up.

    Returns:
        ``{'username', 'post_count', 'source'}`` where ``post_count`` is the
        larger of the two source counts and ``source`` names the first source
        that answered, or ``None`` when neither source knows the user.
    """
    count = 0
    origin = None

    # Live Soundgasm profile page comes first.
    try:
        profile_entries = await self._fetch_soundgasm_profile(username)
        if profile_entries is not None:
            origin, count = 'soundgasm', len(profile_entries)
    except Exception as e:
        self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')

    # The archive may hold more posts than the live profile.
    try:
        archived = await self._get_liltsome_entries(username)
        if archived:
            count = max(count, len(archived))
            origin = origin or 'liltsome'
    except Exception as e:
        self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')

    if origin is None and count == 0:
        return None

    return dict(username=username, post_count=count, source=origin)
|
||||
|
||||
async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> "List[Post]":
    """Fetch posts from both Soundgasm and Liltsome, deduplicating by post_id.

    A failure in one source is logged as a warning and does not prevent the
    other source from being processed.

    Args:
        username: Creator whose posts should be fetched.
        known_post_ids: Post ids that are already stored and must be skipped.
            The caller's set is copied, never mutated.
        progress_callback: Optional callable invoked with the running number
            of collected posts after each source has been processed.

    Returns:
        New posts in discovery order (Soundgasm first, then Liltsome).
    """
    seen_ids: Set[str] = set(known_post_ids or ())
    posts = []

    def absorb(batch) -> int:
        """Append unseen posts from ``batch``; return how many were added."""
        added = 0
        for post in batch:
            if post.post_id in seen_ids:
                continue
            seen_ids.add(post.post_id)
            posts.append(post)
            added += 1
        return added

    # 1. Soundgasm (may fail if the account was deleted - that's OK)
    try:
        fetched = await self._fetch_soundgasm_posts(username, seen_ids)
        # Log what was actually kept, not what was fetched: duplicates in the
        # fetched batch must not inflate the "new posts" figure.
        added = absorb(fetched)
        self.log(f"Soundgasm: {added} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')

    if progress_callback:
        progress_callback(len(posts))

    # 2. Liltsome archive (always)
    try:
        fetched = await self._fetch_liltsome_posts(username, seen_ids)
        added = absorb(fetched)
        self.log(f"Liltsome: {added} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')

    if progress_callback:
        progress_callback(len(posts))

    return posts
|
||||
|
||||
async def download_audio(self, download_url: str, output_path: Path) -> Dict:
    """Download an audio file via a direct HTTP GET.

    The response is streamed into a sibling ``<name>.part`` file that is
    renamed onto ``output_path`` only after the transfer has completed, so a
    failed or interrupted download never leaves a truncated file at the final
    location.

    Args:
        download_url: Direct URL of the audio file.
        output_path: Destination file; missing parent directories are created.

    Returns:
        ``{'success': True, 'file_path': str, 'file_size': int}`` on success,
        otherwise ``{'success': False, 'error': str}``. Errors are logged and
        reported through the dict rather than raised.
    """
    output_path = Path(output_path)
    part_path = None
    finished = False
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        part_path = output_path.with_name(output_path.name + '.part')

        timeout = aiohttp.ClientTimeout(total=300)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(download_url, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'HTTP {resp.status}'}

                total = 0
                async with aiofiles.open(str(part_path), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):
                        await f.write(chunk)
                        total += len(chunk)

        # Publish the file only once every byte has been written.
        os.replace(part_path, output_path)
        finished = True
        return {
            'success': True,
            'file_path': str(output_path),
            'file_size': total,
        }

    except Exception as e:
        self.log(f"Download failed for {download_url}: {e}", 'error')
        return {'success': False, 'error': str(e)}

    finally:
        # Runs on cancellation as well; drop whatever partial data was written.
        if part_path is not None and not finished:
            try:
                part_path.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup, the download result is already decided
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Soundgasm scraping
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
    """Scrape the Soundgasm profile page for the user's audio links.

    Args:
        username: Soundgasm profile name.

    Returns:
        A list of ``{'slug', 'title'}`` dicts, one per distinct slug in page
        order, with HTML entities in the title decoded. ``None`` when the
        profile page does not answer with HTTP 200 (404 included).
    """
    # Local import: also keeps the stdlib module name free of local shadowing.
    from html import unescape

    url = f'{self.SOUNDGASM_BASE}/u/{username}'
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url, headers=self.HEADERS) as resp:
            if resp.status == 404:
                return None
            if resp.status != 200:
                self.log(f"Soundgasm profile returned {resp.status}", 'warning')
                return None
            page = await resp.text()

    # Pattern: <a href="https://soundgasm.net/u/{username}/{slug}">title</a>
    # (the profile page uses absolute URLs; relative ones are accepted too)
    link_re = re.compile(
        r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
        re.IGNORECASE,
    )

    entries: List[Dict] = []
    seen_slugs = set()
    for m in link_re.finditer(page):
        slug = m.group(1).strip()
        # The same audio can be linked more than once on the page; keep the
        # first occurrence so callers neither double-count nor double-fetch.
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        entries.append({'slug': slug, 'title': unescape(m.group(2)).strip()})

    return entries
|
||||
|
||||
async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Fetch full post details from Soundgasm for posts not seen before.

    Args:
        username: Soundgasm profile name.
        seen_ids: Post ids (slugs) to skip. Read only: the caller filters the
            returned posts against this set, so it must not be mutated here.

    Returns:
        One ``Post`` with a single audio ``Attachment`` per new slug whose
        detail page yields an audio URL. Slugs whose detail fetch fails are
        logged at debug level and skipped.
    """
    profile_entries = await self._fetch_soundgasm_profile(username)
    if not profile_entries:
        return []

    posts: List[Post] = []
    # Slugs handled in this call, so a slug listed twice is fetched only once.
    handled: Set[str] = set()
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        for entry in profile_entries:
            slug = entry['slug']
            if slug in seen_ids or slug in handled:
                continue
            handled.add(slug)

            try:
                detail = await self._fetch_soundgasm_detail(session, username, slug)
                if detail is None:
                    continue

                audio_url = detail.get('audio_url')
                if not audio_url:
                    continue

                title_raw = detail.get('title') or entry.get('title') or slug
                clean_title, tags = parse_bracket_tags(title_raw)
                description = detail.get('description', '')

                # Extension comes from the last URL path segment (query
                # string ignored); default to .m4a when there is none.
                basename = audio_url.split('?')[0].split('/')[-1]
                _, dot, suffix = basename.rpartition('.')
                ext = f'.{suffix}' if dot and suffix else '.m4a'

                attachment = Attachment(
                    name=f"{slug}{ext}",
                    file_type='audio',
                    extension=ext.lstrip('.'),
                    server_path=f'/u/{username}/{slug}',
                    download_url=audio_url,
                )

                posts.append(Post(
                    post_id=slug,
                    service_id=self.SERVICE_ID,
                    platform=self.PLATFORM,
                    creator_id=username,
                    title=clean_title or None,
                    content=description or None,
                    published_at=None,  # Soundgasm has no dates
                    attachments=[attachment],
                    auto_tags=tags,
                ))

            except Exception as e:
                self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')

    return posts
|
||||
|
||||
async def _fetch_soundgasm_detail(self, session: "aiohttp.ClientSession",
                                  username: str, slug: str) -> Optional[Dict]:
    """Fetch a single Soundgasm audio detail page and extract its metadata.

    Args:
        session: Open HTTP session used for the request.
        username: Soundgasm profile name.
        slug: Audio slug below the profile.

    Returns:
        ``{'title', 'description', 'audio_url'}`` with HTML entities decoded
        in the text fields, or ``None`` when the page does not answer with
        HTTP 200 or contains no ``m4a: "..."`` audio URL. ``title`` falls back
        to ``slug``; ``description`` is ``None`` when the page has none.
    """
    # Local import: also keeps the stdlib module name free of local shadowing.
    from html import unescape

    url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'

    async with session.get(url, headers=self.HEADERS) as resp:
        if resp.status != 200:
            return None
        page = await resp.text()

    # Audio URL: m4a: "https://..." - without it the post is useless.
    audio_match = re.search(r'm4a:\s*"([^"]+)"', page)
    if not audio_match:
        return None
    audio_url = audio_match.group(1)

    # Title: <div aria-label="title"...>Title Text</div>, else the <title> tag.
    title = None
    title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', page)
    if title_match:
        title = unescape(title_match.group(1)).strip()
    if not title:
        title_match = re.search(r'<title>([^<]+)</title>', page, re.IGNORECASE)
        if title_match:
            title = unescape(title_match.group(1)).strip()
            # Remove " - Soundgasm" suffix if present
            title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()

    # Description: <div class="jp-description">...</div>
    description = None
    desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', page, re.DOTALL)
    if desc_match:
        text = re.sub(r'<br\s*/?>', '\n', desc_match.group(1), flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', '', text)
        # Decode entities only after tag stripping so an escaped "&lt;b&gt;"
        # survives as literal text instead of being removed as markup.
        description = unescape(text).strip()

    return {
        'title': title or slug,
        'description': description,
        'audio_url': audio_url,
    }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Liltsome archive
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _ensure_liltsome_cache(self) -> bool:
    """Download/refresh the Liltsome library.json using ETag-based invalidation.

    A HEAD request compares the remote ETag with the stored one; on a match
    the existing cache file is kept. Otherwise the library is streamed into a
    temporary sibling file and renamed onto the cache path only once the
    download has completed, so an interrupted transfer can never leave a
    truncated cache behind an unchanged ETag.

    Returns:
        True if a cache file is available (fresh or pre-existing), False
        otherwise. Network and I/O errors are logged, not raised.
    """
    etag_file = self.LILTSOME_ETAG_PATH
    cache_file = self.LILTSOME_CACHE_PATH
    tmp_file = cache_file.with_name(cache_file.name + '.tmp')

    stored_etag = None
    if etag_file.exists():
        try:
            stored_etag = etag_file.read_text().strip()
        except Exception as e:
            # An unreadable ETag only costs a re-download.
            self.log(f"Could not read stored Liltsome ETag: {e}", 'debug')

    timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # HEAD request to check ETag
            async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
                    return cache_file.exists()
                remote_etag = resp.headers.get('ETag', '').strip()

            if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
                self.log("Liltsome cache is fresh (ETag match)", 'debug')
                return True

            # Download the full library
            self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
            async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome GET returned {resp.status}", 'warning')
                    return cache_file.exists()

                cache_file.parent.mkdir(parents=True, exist_ok=True)
                async with aiofiles.open(str(tmp_file), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(262144):
                        await f.write(chunk)

                new_etag = resp.headers.get('ETag', remote_etag or '').strip()

        # Publish the complete file, then drop the stale in-memory copy
        # before anything else can fail.
        os.replace(tmp_file, cache_file)
        self._liltsome_data = None  # force re-parse

        if new_etag:
            etag_file.write_text(new_etag)

        self.log("Liltsome library.json downloaded successfully", 'info')
        return True

    except Exception as e:
        self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
        return cache_file.exists()

    finally:
        # No-op after a successful rename; removes partial data otherwise.
        try:
            tmp_file.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup
|
||||
|
||||
async def _load_liltsome_data(self) -> Optional[Dict]:
    """Return the parsed Liltsome library, reading it from disk at most once.

    Returns:
        The in-memory copy when one exists, otherwise the freshly parsed
        cache file (which is then kept in memory). ``None`` when the cache
        file is missing or cannot be parsed; parse failures are logged.
    """
    cached = self._liltsome_data
    if cached is not None:
        return cached

    library_path = self.LILTSOME_CACHE_PATH
    if not library_path.exists():
        return None

    try:
        # Parsing is blocking work, so it runs in a worker thread.
        self._liltsome_data = await asyncio.to_thread(self._read_liltsome_json, library_path)
    except Exception as e:
        self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
        return None
    return self._liltsome_data
|
||||
|
||||
@staticmethod
def _read_liltsome_json(path: Path) -> Dict:
    """Parse the UTF-8 JSON file at ``path`` (blocking; meant for a worker thread)."""
    with open(path, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)
|
||||
|
||||
async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
    """Find an artist's audio entries in the Liltsome data (case-insensitive).

    library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}
    A bare top-level list of artists is accepted as well. Malformed parts of
    the archive (non-dict artists, a missing or null artist list, a null
    audio list) are skipped instead of aborting the lookup.

    Args:
        username: Name compared against each artist's ``id`` and ``name``.

    Returns:
        The first matching artist's ``files.audio`` list (``[]`` when it has
        none), or ``None`` when no library data is available or no artist
        matches.
    """
    await self._ensure_liltsome_cache()
    data = await self._load_liltsome_data()
    if not data:
        return None

    wanted = username.lower()

    # Top-level is {"artists": [...]}
    artists = data.get('artists') if isinstance(data, dict) else data
    if not isinstance(artists, list):
        return None

    for artist in artists:
        if not isinstance(artist, dict):
            continue
        identifiers = (
            str(artist.get('id', '')).lower(),
            str(artist.get('name', '')).lower(),
        )
        if wanted not in identifiers:
            continue

        # Audio entries are in files.audio
        files = artist.get('files', {})
        if isinstance(files, dict):
            audio = files.get('audio')
            return audio if isinstance(audio, list) else []
        return []

    return None
|
||||
|
||||
async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Convert Liltsome archive entries to Post objects.

    Args:
        username: Artist to look up in the archive.
        seen_ids: Post ids to skip. Read only; never mutated here.

    Returns:
        One ``Post`` with a single audio ``Attachment`` per archive entry
        whose ``liltsome-`` prefixed id is not in ``seen_ids``. Entries that
        are not dicts, or that carry neither a filename nor a path, are
        skipped.
    """
    entries = await self._get_liltsome_entries(username)
    if not entries:
        return []

    posts: List[Post] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        filename = entry.get('filename') or ''
        path = entry.get('path') or ''
        if not filename and not path:
            # Nothing to identify or download the entry by.
            continue

        # A missing or null title falls back to the filename.
        title_raw = entry.get('title')
        if not isinstance(title_raw, str):
            title_raw = filename

        entry_tags = entry.get('tags') or []  # already lowercase in Liltsome
        file_size = entry.get('size')

        duration = None
        if isinstance(entry.get('metadata'), dict):
            duration = entry['metadata'].get('duration')

        # Build post_id: prefix with liltsome- to avoid collision.
        # This derivation is kept exactly as before so ids stay stable.
        sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
        post_id = f'liltsome-{sanitized_name}'

        if post_id in seen_ids:
            continue

        # Parse bracket tags from title for clean_title
        clean_title, title_tags = parse_bracket_tags(title_raw)

        # Merge: Liltsome's pre-parsed tags first, then any extras from the
        # title; dict.fromkeys drops duplicates in first-seen order.
        archive_tags = (t.strip().lower() for t in entry_tags if isinstance(t, str))
        all_tags = list(dict.fromkeys([t for t in archive_tags if t] + title_tags))

        # Build download URL
        download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None

        # Determine extension from the filename, else from the last path
        # segment only, so dots in directory names cannot leak into it.
        path_leaf = path.rsplit('/', 1)[-1]
        ext = 'm4a'
        for candidate in (filename, path_leaf):
            _, dot, suffix = candidate.rpartition('.')
            if dot and suffix:
                ext = suffix.lower()
                break

        # Attachment name: reuse the file's own name when it already carries
        # the extension (compared case-insensitively, so "Song.M4A" does not
        # become "Song.M4A.m4a"); otherwise append the extension.
        leaf = filename or path_leaf
        if leaf.lower().endswith(f'.{ext}'):
            attachment_name = leaf
        else:
            attachment_name = f"{re.sub(r'[^a-zA-Z0-9_.-]', '_', leaf)}.{ext}"

        attachment = Attachment(
            name=attachment_name,
            file_type='audio',
            extension=ext,
            server_path=path or filename,
            download_url=download_url,
            file_size=file_size,
            duration=duration,
        )

        posts.append(Post(
            post_id=post_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=username,
            title=clean_title or None,
            content=None,
            published_at=None,
            attachments=[attachment],
            auto_tags=all_tags,
        ))

    return posts
|
||||
Reference in New Issue
Block a user