"""
Soundgasm + Liltsome Archive Client for Paid Content

Handles:
- Soundgasm profile scraping (no auth/Cloudflare needed)
- Liltsome archive (liltsome.yerf.org) as supplementary source
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
- Direct HTTP audio downloads (.m4a)
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
from urllib.parse import quote
|
|
|
|
import aiohttp
|
|
import aiofiles
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Creator, Post, Attachment
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Bracket tag helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Extract [bracket] tags from a title, normalize, return (clean_title, tags)."""
    raw_tags = re.findall(r'\[([^\]]+)\]', title)
    stripped_title = re.sub(r'\s*\[[^\]]+\]\s*', ' ', title).strip()

    # Normalize to lowercase, preserving first-seen order, dropping duplicates.
    ordered: List[str] = []
    already: Set[str] = set()
    for raw in raw_tags:
        candidate = raw.strip().lower()
        if not candidate or candidate in already:
            continue
        already.add(candidate)
        ordered.append(candidate)
    return stripped_title, ordered
|
|
|
|
|
|
def format_tag_display(tag_lower: str) -> str:
    """Format a normalized lowercase tag for display.

    Gender tags (f4m, m4f, f4a …) → uppercase.
    Everything else → title case.
    """
    # Gender codes look like letters-digit-letters (e.g. "f4m").
    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower) is None:
        return tag_lower.title()
    return tag_lower.upper()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SoundgasmClient
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class SoundgasmClient(LoggingMixin):
    """Client for fetching audio from Soundgasm and the Liltsome archive."""

    # Identifiers stamped onto every Post/Attachment this client produces.
    SERVICE_ID = 'soundgasm'
    PLATFORM = 'soundgasm'

    SOUNDGASM_BASE = 'https://soundgasm.net'
    LILTSOME_BASE = 'https://liltsome.yerf.org'
    # Full archive dump; large (~131MB per the download timeout note below).
    LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
    # On-disk cache of the library and the ETag used for cache invalidation.
    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')

    # Browser-like headers; Soundgasm serves plain HTML without auth.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
        self._liltsome_data: Optional[Dict] = None  # cached in-memory per sync run
|
|
# ------------------------------------------------------------------
|
|
# Public API
|
|
# ------------------------------------------------------------------
|
|
|
|
async def get_profile_info(self, username: str) -> Optional[Dict]:
|
|
"""Return basic profile info (post count) from Soundgasm and/or Liltsome."""
|
|
post_count = 0
|
|
source = None
|
|
|
|
# Try Soundgasm profile page first
|
|
try:
|
|
sg_posts = await self._fetch_soundgasm_profile(username)
|
|
if sg_posts is not None:
|
|
post_count = len(sg_posts)
|
|
source = 'soundgasm'
|
|
except Exception as e:
|
|
self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')
|
|
|
|
# Also check Liltsome for additional posts
|
|
try:
|
|
lt_entries = await self._get_liltsome_entries(username)
|
|
if lt_entries:
|
|
post_count = max(post_count, len(lt_entries))
|
|
if source is None:
|
|
source = 'liltsome'
|
|
except Exception as e:
|
|
self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')
|
|
|
|
if post_count == 0 and source is None:
|
|
return None
|
|
|
|
return {
|
|
'username': username,
|
|
'post_count': post_count,
|
|
'source': source,
|
|
}
|
|
|
|
async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
|
|
progress_callback=None) -> List[Post]:
|
|
"""Fetch posts from both Soundgasm and Liltsome, deduplicating by post_id."""
|
|
known = known_post_ids or set()
|
|
posts: List[Post] = []
|
|
seen_ids: Set[str] = set(known)
|
|
|
|
# 1. Soundgasm (may fail if account deleted — that's OK)
|
|
try:
|
|
sg_posts = await self._fetch_soundgasm_posts(username, seen_ids)
|
|
for p in sg_posts:
|
|
if p.post_id not in seen_ids:
|
|
seen_ids.add(p.post_id)
|
|
posts.append(p)
|
|
self.log(f"Soundgasm: {len(sg_posts)} new posts for {username}", 'info')
|
|
except Exception as e:
|
|
self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')
|
|
|
|
if progress_callback:
|
|
progress_callback(len(posts))
|
|
|
|
# 2. Liltsome archive (always)
|
|
try:
|
|
lt_posts = await self._fetch_liltsome_posts(username, seen_ids)
|
|
for p in lt_posts:
|
|
if p.post_id not in seen_ids:
|
|
seen_ids.add(p.post_id)
|
|
posts.append(p)
|
|
self.log(f"Liltsome: {len(lt_posts)} new posts for {username}", 'info')
|
|
except Exception as e:
|
|
self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')
|
|
|
|
if progress_callback:
|
|
progress_callback(len(posts))
|
|
|
|
return posts
|
|
|
|
async def download_audio(self, download_url: str, output_path: Path) -> Dict:
|
|
"""Download an audio file via direct HTTP GET."""
|
|
try:
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
timeout = aiohttp.ClientTimeout(total=300)
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
async with session.get(download_url, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
return {'success': False, 'error': f'HTTP {resp.status}'}
|
|
|
|
async with aiofiles.open(str(output_path), 'wb') as f:
|
|
total = 0
|
|
async for chunk in resp.content.iter_chunked(65536):
|
|
await f.write(chunk)
|
|
total += len(chunk)
|
|
|
|
return {
|
|
'success': True,
|
|
'file_path': str(output_path),
|
|
'file_size': total,
|
|
}
|
|
|
|
except Exception as e:
|
|
self.log(f"Download failed for {download_url}: {e}", 'error')
|
|
return {'success': False, 'error': str(e)}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Soundgasm scraping
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
|
|
"""Scrape the Soundgasm profile page, return list of {slug, title, plays}."""
|
|
url = f'{self.SOUNDGASM_BASE}/u/{username}'
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
async with session.get(url, headers=self.HEADERS) as resp:
|
|
if resp.status == 404:
|
|
return None
|
|
if resp.status != 200:
|
|
self.log(f"Soundgasm profile returned {resp.status}", 'warning')
|
|
return None
|
|
html = await resp.text()
|
|
|
|
# Parse .sound-details divs for links
|
|
entries: List[Dict] = []
|
|
# Pattern: <a href="https://soundgasm.net/u/{username}/{slug}">title</a>
|
|
# (profile page uses absolute URLs)
|
|
for m in re.finditer(
|
|
r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
|
|
html, re.IGNORECASE
|
|
):
|
|
slug = m.group(1).strip()
|
|
title = m.group(2).strip()
|
|
entries.append({'slug': slug, 'title': title})
|
|
|
|
return entries
|
|
|
|
async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
|
|
"""Fetch full post details from Soundgasm for new posts."""
|
|
profile_entries = await self._fetch_soundgasm_profile(username)
|
|
if not profile_entries:
|
|
return []
|
|
|
|
posts: List[Post] = []
|
|
timeout = aiohttp.ClientTimeout(total=30)
|
|
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
for entry in profile_entries:
|
|
slug = entry['slug']
|
|
if slug in seen_ids:
|
|
continue
|
|
|
|
try:
|
|
detail = await self._fetch_soundgasm_detail(session, username, slug)
|
|
if detail is None:
|
|
continue
|
|
|
|
title_raw = detail.get('title', entry.get('title', slug))
|
|
clean_title, tags = parse_bracket_tags(title_raw)
|
|
description = detail.get('description', '')
|
|
audio_url = detail.get('audio_url')
|
|
|
|
if not audio_url:
|
|
continue
|
|
|
|
# Determine extension from URL
|
|
ext = '.m4a'
|
|
if audio_url:
|
|
url_path = audio_url.split('?')[0]
|
|
if '.' in url_path.split('/')[-1]:
|
|
ext = '.' + url_path.split('/')[-1].rsplit('.', 1)[1]
|
|
|
|
filename = f"{slug}{ext}"
|
|
|
|
attachment = Attachment(
|
|
name=filename,
|
|
file_type='audio',
|
|
extension=ext.lstrip('.'),
|
|
server_path=f'/u/{username}/{slug}',
|
|
download_url=audio_url,
|
|
)
|
|
|
|
post = Post(
|
|
post_id=slug,
|
|
service_id='soundgasm',
|
|
platform='soundgasm',
|
|
creator_id=username,
|
|
title=clean_title or None,
|
|
content=description or None,
|
|
published_at=None, # Soundgasm has no dates
|
|
attachments=[attachment],
|
|
auto_tags=tags,
|
|
)
|
|
posts.append(post)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')
|
|
|
|
return posts
|
|
|
|
async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
|
|
username: str, slug: str) -> Optional[Dict]:
|
|
"""Fetch a single Soundgasm audio detail page and extract metadata."""
|
|
url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'
|
|
|
|
async with session.get(url, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
return None
|
|
html = await resp.text()
|
|
|
|
# Title: <div aria-label="title"...>Title Text</div>
|
|
# or from the page title tag
|
|
title = None
|
|
title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', html)
|
|
if title_match:
|
|
title = title_match.group(1).strip()
|
|
if not title:
|
|
title_match = re.search(r'<title>([^<]+)</title>', html, re.IGNORECASE)
|
|
if title_match:
|
|
title = title_match.group(1).strip()
|
|
# Remove " - Soundgasm" suffix if present
|
|
title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()
|
|
|
|
# Description: <div class="jp-description">...</div>
|
|
description = None
|
|
desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', html, re.DOTALL)
|
|
if desc_match:
|
|
desc_html = desc_match.group(1)
|
|
# Strip HTML tags
|
|
description = re.sub(r'<br\s*/?>', '\n', desc_html)
|
|
description = re.sub(r'<[^>]+>', '', description).strip()
|
|
|
|
# Audio URL: m4a: "https://..."
|
|
audio_url = None
|
|
audio_match = re.search(r'm4a:\s*"([^"]+)"', html)
|
|
if audio_match:
|
|
audio_url = audio_match.group(1)
|
|
|
|
if not audio_url:
|
|
return None
|
|
|
|
return {
|
|
'title': title or slug,
|
|
'description': description,
|
|
'audio_url': audio_url,
|
|
}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Liltsome archive
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _ensure_liltsome_cache(self) -> bool:
|
|
"""Download/refresh the Liltsome library.json using ETag-based invalidation.
|
|
|
|
Returns True if cache is available (fresh or existing), False otherwise.
|
|
"""
|
|
etag_file = self.LILTSOME_ETAG_PATH
|
|
cache_file = self.LILTSOME_CACHE_PATH
|
|
|
|
stored_etag = None
|
|
if etag_file.exists():
|
|
try:
|
|
stored_etag = etag_file.read_text().strip()
|
|
except Exception:
|
|
pass
|
|
|
|
timeout = aiohttp.ClientTimeout(total=600) # 131MB can take a while
|
|
try:
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
# HEAD request to check ETag
|
|
async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
|
|
return cache_file.exists()
|
|
|
|
remote_etag = resp.headers.get('ETag', '').strip()
|
|
|
|
if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
|
|
self.log("Liltsome cache is fresh (ETag match)", 'debug')
|
|
return True
|
|
|
|
# Download the full library
|
|
self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
|
|
async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
|
|
if resp.status != 200:
|
|
self.log(f"Liltsome GET returned {resp.status}", 'warning')
|
|
return cache_file.exists()
|
|
|
|
cache_file.parent.mkdir(parents=True, exist_ok=True)
|
|
async with aiofiles.open(str(cache_file), 'wb') as f:
|
|
async for chunk in resp.content.iter_chunked(262144):
|
|
await f.write(chunk)
|
|
|
|
new_etag = resp.headers.get('ETag', remote_etag or '').strip()
|
|
|
|
if new_etag:
|
|
etag_file.write_text(new_etag)
|
|
|
|
self.log("Liltsome library.json downloaded successfully", 'info')
|
|
self._liltsome_data = None # force re-parse
|
|
return True
|
|
|
|
except Exception as e:
|
|
self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
|
|
return cache_file.exists()
|
|
|
|
async def _load_liltsome_data(self) -> Optional[Dict]:
|
|
"""Load and cache the Liltsome library data in memory."""
|
|
if self._liltsome_data is not None:
|
|
return self._liltsome_data
|
|
|
|
cache_file = self.LILTSOME_CACHE_PATH
|
|
if not cache_file.exists():
|
|
return None
|
|
|
|
try:
|
|
data = await asyncio.to_thread(self._read_liltsome_json, cache_file)
|
|
self._liltsome_data = data
|
|
return data
|
|
except Exception as e:
|
|
self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
|
|
return None
|
|
|
|
@staticmethod
|
|
def _read_liltsome_json(path: Path) -> Dict:
|
|
"""Read and parse the Liltsome JSON file (blocking, run in thread)."""
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
|
|
"""Find artist entries in Liltsome data by username (case-insensitive).
|
|
|
|
library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}
|
|
"""
|
|
await self._ensure_liltsome_cache()
|
|
data = await self._load_liltsome_data()
|
|
if not data:
|
|
return None
|
|
|
|
username_lower = username.lower()
|
|
|
|
# Top-level is {"artists": [...]}
|
|
artists = data.get('artists', []) if isinstance(data, dict) else data
|
|
|
|
for artist in artists:
|
|
artist_id = str(artist.get('id', '')).lower()
|
|
artist_name = str(artist.get('name', '')).lower()
|
|
if artist_id == username_lower or artist_name == username_lower:
|
|
# Audio entries are in files.audio
|
|
files = artist.get('files', {})
|
|
if isinstance(files, dict):
|
|
return files.get('audio', [])
|
|
return []
|
|
|
|
return None
|
|
|
|
async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
|
|
"""Convert Liltsome archive entries to Post objects."""
|
|
entries = await self._get_liltsome_entries(username)
|
|
if not entries:
|
|
return []
|
|
|
|
posts: List[Post] = []
|
|
for entry in entries:
|
|
filename = entry.get('filename', '')
|
|
path = entry.get('path', '')
|
|
title_raw = entry.get('title', filename)
|
|
entry_tags = entry.get('tags', []) # already lowercase in Liltsome
|
|
duration = None
|
|
file_size = entry.get('size')
|
|
|
|
if isinstance(entry.get('metadata'), dict):
|
|
duration = entry['metadata'].get('duration')
|
|
|
|
# Build post_id: prefix with liltsome- to avoid collision
|
|
sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
|
|
post_id = f'liltsome-{sanitized_name}'
|
|
|
|
if post_id in seen_ids:
|
|
continue
|
|
|
|
# Parse bracket tags from title for clean_title
|
|
clean_title, title_tags = parse_bracket_tags(title_raw)
|
|
|
|
# Merge: use Liltsome's pre-parsed tags + any extra from title
|
|
all_tags_set: Set[str] = set()
|
|
all_tags: List[str] = []
|
|
for t in entry_tags:
|
|
t_lower = t.strip().lower()
|
|
if t_lower and t_lower not in all_tags_set:
|
|
all_tags_set.add(t_lower)
|
|
all_tags.append(t_lower)
|
|
for t in title_tags:
|
|
if t not in all_tags_set:
|
|
all_tags_set.add(t)
|
|
all_tags.append(t)
|
|
|
|
# Build download URL
|
|
download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None
|
|
|
|
# Determine extension
|
|
ext = 'm4a'
|
|
if filename and '.' in filename:
|
|
ext = filename.rsplit('.', 1)[1].lower()
|
|
elif path and '.' in path:
|
|
ext = path.rsplit('.', 1)[1].lower()
|
|
|
|
attachment = Attachment(
|
|
name=f"{sanitized_name}.{ext}" if not filename.endswith(f'.{ext}') else filename,
|
|
file_type='audio',
|
|
extension=ext,
|
|
server_path=path or filename,
|
|
download_url=download_url,
|
|
file_size=file_size,
|
|
duration=duration,
|
|
)
|
|
|
|
post = Post(
|
|
post_id=post_id,
|
|
service_id='soundgasm',
|
|
platform='soundgasm',
|
|
creator_id=username,
|
|
title=clean_title or None,
|
|
content=None,
|
|
published_at=None,
|
|
attachments=[attachment],
|
|
auto_tags=all_tags,
|
|
)
|
|
posts.append(post)
|
|
|
|
return posts
|