Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
"""
Paid Content Module
Downloads and organizes content from subscription-based creator platforms
(OnlyFans, Fansly, Patreon, Fanbox, etc.) via the Coomer.party and Kemono.party archival APIs.
Also supports YouTube channels and Twitch clips via yt-dlp.
"""
from .scraper import PaidContentScraper
from .api_client import PaidContentAPIClient
from .db_adapter import PaidContentDBAdapter
from .file_host_downloader import FileHostDownloader
from .embed_downloader import EmbedDownloader
from .youtube_client import YouTubeClient
from .twitch_client import TwitchClient, TwitchThumbnailCache
from .fansly_direct_client import FanslyDirectClient
from .onlyfans_client import OnlyFansClient
from .xhamster_client import XHamsterClient
from .tiktok_client import TikTokClient
from .instagram_adapter import InstagramAdapter
# Public API of the package: the names exported by a star-import.
# Every entry is one of the classes imported from the submodules above.
__all__ = [
'PaidContentScraper',
'PaidContentAPIClient',
'PaidContentDBAdapter',
'FileHostDownloader',
'EmbedDownloader',
'YouTubeClient',
'TwitchClient',
'TwitchThumbnailCache',
'FanslyDirectClient',
'OnlyFansClient',
'XHamsterClient',
'TikTokClient',
'InstagramAdapter',
]

View File

@@ -0,0 +1,311 @@
"""
Unified API client for Coomer.party and Kemono.party
Both services share the same API structure (Kemono fork)
"""
import aiohttp
import asyncio
from typing import List, Optional, Dict, Any
from modules.base_module import LoggingMixin, RateLimitMixin
from .models import Creator, Post, Attachment
class PaidContentAPIClient(LoggingMixin, RateLimitMixin):
    """
    Async HTTP client for the Coomer and Kemono archival services.

    Endpoints requested by this client (relative to ``{base_url}/api/v1``):
    - GET /creators - List all creators
    - GET /{platform}/user/{creator_id}/profile - Get creator info
    - GET /{platform}/user/{creator_id}/posts - Get creator's posts (paginated with ?o=offset)
    - GET /{platform}/user/{creator_id}/post/{post_id} - Get single post
    """

    # Fallback URLs if database doesn't have them configured
    DEFAULT_SERVICE_URLS = {
        'coomer': 'https://coomer.party',
        'kemono': 'https://kemono.party',
    }

    # Source platforms archived by each service, keyed by service id
    SUPPORTED_PLATFORMS = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord'],
    }
def __init__(self, service_id: str, session_cookie: str = None, base_url: str = None, log_callback=None):
self._init_logger('PaidContent', log_callback, default_module='API')
self._init_rate_limiter(min_delay=0.5, max_delay=2.0, batch_delay_min=1, batch_delay_max=3)
self.service_id = service_id
# Use provided base_url, or fall back to defaults
if base_url:
# If base_url includes /api/v1, extract just the base
if '/api/v1' in base_url:
self.base_url = base_url.replace('/api/v1', '').rstrip('/')
else:
self.base_url = base_url.rstrip('/')
else:
self.base_url = self.DEFAULT_SERVICE_URLS.get(service_id)
self.api_url = f"{self.base_url}/api/v1"
self.session_cookie = session_cookie
self._session: Optional[aiohttp.ClientSession] = None
async def _get_session(self) -> aiohttp.ClientSession:
    """Return the shared aiohttp session, creating it when needed.

    A new session is built when none exists yet or the previous one has
    been closed; otherwise the existing session is reused.
    """
    current = self._session
    if current is not None and not current.closed:
        return current

    # Note: Coomer/Kemono require 'Accept: text/css' header as anti-scraping measure
    # Despite this, they still return JSON responses
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/css',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': self.base_url,
    }
    session_cookies = {'session': self.session_cookie} if self.session_cookie else {}

    self._session = aiohttp.ClientSession(
        headers=request_headers,
        cookies=session_cookies,
        timeout=aiohttp.ClientTimeout(total=30),
    )
    return self._session
async def close(self):
"""Close the aiohttp session"""
if self._session and not self._session.closed:
await self._session.close()
self._session = None
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
async def check_health(self) -> Dict[str, Any]:
    """Probe the ``/creators`` endpoint and summarise the API's health.

    Returns:
        A dict whose ``status`` is one of 'healthy' (with ``response_time``
        in seconds), 'rate_limited' or 'degraded' (with ``response_code``),
        'timeout' or 'down' (with ``error``).
    """
    import time

    try:
        session = await self._get_session()
        started = time.time()
        probe_timeout = aiohttp.ClientTimeout(total=10)
        async with session.get(f"{self.api_url}/creators", timeout=probe_timeout) as resp:
            elapsed = time.time() - started
            code = resp.status
            if code == 429:
                return {'status': 'rate_limited', 'response_code': 429}
            if code != 200:
                return {'status': 'degraded', 'response_code': code}
            # content_type=None allows parsing JSON regardless of response content-type
            await resp.json(content_type=None)
            return {'status': 'healthy', 'response_time': round(elapsed, 3)}
    except asyncio.TimeoutError:
        return {'status': 'timeout', 'error': 'Request timed out'}
    except Exception as e:
        return {'status': 'down', 'error': str(e)}
async def get_all_creators(self) -> List[Dict]:
    """Fetch the full creator listing (used for local search).

    Returns:
        The decoded JSON body on HTTP 200; an empty list on any other
        status or on error (both are logged).
    """
    self._delay_between_items()
    try:
        session = await self._get_session()
        endpoint = f"{self.api_url}/creators"
        async with session.get(endpoint) as resp:
            if resp.status != 200:
                self.log(f"Failed to get creators list: HTTP {resp.status}", 'warning')
                return []
            return await resp.json(content_type=None)
    except Exception as e:
        self.log(f"Error getting creators list: {e}", 'error')
        return []
async def get_creator(self, platform: str, creator_id: str) -> Optional[Creator]:
    """Look up a creator, preferring the profile endpoint.

    If the profile request does not return HTTP 200, the creator's posts
    are requested instead and a Creator is assembled from the first post.

    Returns:
        A Creator, or None when neither request yields data or an error
        occurs (logged).
    """
    from urllib.parse import urlparse

    self._delay_between_items()
    try:
        session = await self._get_session()

        # First try to get creator profile
        profile_url = f"{self.api_url}/{platform}/user/{creator_id}/profile"
        async with session.get(profile_url) as resp:
            if resp.status == 200:
                payload = await resp.json(content_type=None)
                return Creator.from_api(payload, self.service_id, platform, self.base_url)

        # Fallback: get first post to extract creator info
        posts_url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
        async with session.get(posts_url) as resp:
            posts = await resp.json(content_type=None) if resp.status == 200 else None
            if posts and len(posts) > 0:
                display = posts[0].get('user', creator_id)
                # Convert .party to .st for image URLs (coomer.party/kemono.party images are at .st)
                host = urlparse(self.base_url).netloc.replace('.party', '.st')
                image_root = f"https://img.{host}"
                return Creator(
                    creator_id=creator_id,
                    service_id=self.service_id,
                    platform=platform,
                    username=display,
                    display_name=display,
                    profile_image_url=f"{image_root}/icons/{platform}/{creator_id}",
                    banner_image_url=f"{image_root}/banners/{platform}/{creator_id}",
                )

        self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
        return None
    except Exception as e:
        self.log(f"Error getting creator {platform}/{creator_id}: {e}", 'error')
        return None
async def get_creator_posts(self, platform: str, creator_id: str, offset: int = 0) -> List[Post]:
    """Fetch one page of a creator's posts.

    Args:
        platform: Source platform segment of the URL.
        creator_id: Creator identifier segment of the URL.
        offset: Pagination offset, sent as ``?o=`` only when positive.

    Returns:
        Parsed posts on HTTP 200; an empty list on any other status or
        on error (both are logged).
    """
    self._delay_between_items()
    try:
        session = await self._get_session()
        url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
        query = {}
        if offset > 0:
            query['o'] = offset

        async with session.get(url, params=query) as resp:
            status = resp.status
            if status == 200:
                payload = await resp.json(content_type=None)
                parsed = []
                for entry in payload:
                    parsed.append(
                        Post.from_api(entry, self.service_id, platform, creator_id, self.base_url)
                    )
                return parsed
            if status == 404:
                self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
            else:
                self.log(f"Failed to get posts: HTTP {status}", 'warning')
            return []
    except Exception as e:
        self.log(f"Error getting posts for {platform}/{creator_id}: {e}", 'error')
        return []
async def get_all_creator_posts(self, platform: str, creator_id: str,
since_date: str = None, max_posts: int = None,
progress_callback=None) -> List[Post]:
"""Fetch all posts with pagination"""
all_posts = []
offset = 0
page = 0
self.log(f"Fetching posts for {platform}/{creator_id}", 'info')
while True:
posts = await self.get_creator_posts(platform, creator_id, offset)
if not posts:
break
for post in posts:
# Stop if we've reached posts we've already seen
if since_date and post.published_at and post.published_at <= since_date:
self.log(f"Reached already-seen post date: {post.published_at}", 'debug')
return all_posts
all_posts.append(post)
if max_posts and len(all_posts) >= max_posts:
self.log(f"Reached max posts limit: {max_posts}", 'debug')
return all_posts
page += 1
offset += 50
if progress_callback:
progress_callback(page, len(all_posts))
self._delay_between_batches()
self.log(f"Fetched {len(all_posts)} posts for {platform}/{creator_id}", 'info')
return all_posts
async def get_post(self, platform: str, creator_id: str, post_id: str) -> Optional[Post]:
    """Fetch a single post by ID.

    Returns:
        The parsed Post on HTTP 200; None on any other status or on
        error (errors are logged).
    """
    self._delay_between_items()
    try:
        session = await self._get_session()
        url = f"{self.api_url}/{platform}/user/{creator_id}/post/{post_id}"
        async with session.get(url) as resp:
            if resp.status != 200:
                return None
            payload = await resp.json(content_type=None)
            # Single post endpoint wraps response in {"post": {...}}
            wrapped = isinstance(payload, dict) and 'post' in payload
            body = payload['post'] if wrapped else payload
            return Post.from_api(body, self.service_id, platform, creator_id, self.base_url)
    except Exception as e:
        self.log(f"Error getting post {post_id}: {e}", 'error')
        return None
async def search_creators(self, query: str, platform: str = None) -> List[Dict]:
"""Search for creators by name"""
self._delay_between_items()
try:
# Get all creators and filter locally (API doesn't have search endpoint)
all_creators = await self.get_all_creators()
query_lower = query.lower()
results = []
for creator in all_creators:
if platform and creator.get('service') != platform:
continue
name = (creator.get('name') or '').lower()
if query_lower in name:
results.append({
'id': creator.get('id'),
'name': creator.get('name'),
'service': creator.get('service'),
'indexed': creator.get('indexed'),
'updated': creator.get('updated'),
'favorited': creator.get('favorited', 0)
})
# Sort by favorited count (popularity)
results.sort(key=lambda x: x.get('favorited', 0), reverse=True)
return results[:50] # Limit results
except Exception as e:
self.log(f"Error searching creators: {e}", 'error')
return []
def get_attachment_url(self, server_path: str) -> str:
"""Convert server path to full download URL"""
if not server_path:
return ''
if server_path.startswith('http'):
return server_path
return f"{self.base_url}/data{server_path}"
def get_thumbnail_url(self, server_path: str) -> str:
"""Get thumbnail URL for an attachment"""
if not server_path:
return ''
if server_path.startswith('http'):
return server_path
return f"{self.base_url}/thumbnail/data{server_path}"
@classmethod
def get_supported_platforms(cls, service_id: str) -> List[str]:
"""Get list of supported platforms for a service"""
return cls.SUPPORTED_PLATFORMS.get(service_id, [])
@classmethod
def is_valid_service(cls, service_id: str) -> bool:
"""Check if service ID is valid"""
return service_id in cls.SERVICE_URLS
@classmethod
def get_service_ids(cls) -> List[str]:
"""Get list of all service IDs"""
return list(cls.SERVICE_URLS.keys())

View File

@@ -0,0 +1,389 @@
"""
Bellazon Forum Thread Client for Paid Content
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
as a "creator" and each reply with media as a post.
Only bellazon-hosted uploads are captured (external image host links are
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads."""

    # Identifiers stamped on every Post this client produces
    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'

    BASE_URL = 'https://www.bellazon.com/main'

    # Browser-like headers sent with every page request
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
def __init__(self, log_callback=None):
self._init_logger('PaidContent', log_callback, default_module='Bellazon')
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
    """Fetch the first page of a thread and describe it like a profile.

    Returns:
        A dict with ``username`` (the thread slug), ``display_name``,
        ``post_count`` (comments on this page, or 20 if none were found,
        multiplied by the page count), ``page_count`` and ``topic_url``;
        or None when the request fails or does not return HTTP 200.
    """
    # Bellazon requires a slug in the URL but redirects to the correct one
    request_url = f'{self.BASE_URL}/topic/{topic_id}-x/'
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(request_url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                    return None
                final_url = str(resp.url)
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
        return None

    # Slug comes from the post-redirect URL: /topic/{id}-{slug}/
    slug = self._extract_slug(final_url, topic_id)
    title = self._extract_title(page_html)
    pages = self._extract_page_count(page_html)

    # Count comments on this page to estimate total
    comments_on_page = len(re.findall(r'data-commentid="(\d+)"', page_html)) or 20

    info = {
        'username': slug,
        'display_name': title or slug,
        'post_count': comments_on_page * pages,
        'page_count': pages,
        'topic_url': final_url.split('?')[0].rstrip('/'),
    }
    return info
async def get_posts(self, topic_id: str, topic_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape all pages of a thread and return posts with media.

    Args:
        topic_id: Thread ID, stored as the ``creator_id`` of each post.
        topic_url: Thread URL without the ``/page/N/`` suffix; a trailing
            slash is tolerated.
        known_post_ids: Post IDs to skip. A non-empty set passed here is
            extended in place with the IDs found during the scrape.
        progress_callback: Optional callable invoked after each parsed
            page with the running number of collected posts.

    Returns:
        The posts collected so far; errors are logged, not raised.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    # Bound before the try block so the summary log below cannot hit an
    # unbound name when an error occurs before the page count is known.
    page_count = 0
    # Avoid '//page/N/' when the caller passes a URL ending in '/'
    thread_root = topic_url.rstrip('/')
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Fetch page 1 to get page count
            page_html = await self._fetch_page(session, f'{thread_root}/page/1/')
            if page_html is None:
                return posts

            page_count = self._extract_page_count(page_html)
            self.log(f"Thread has {page_count} pages", 'info')

            # Parse page 1
            posts.extend(self._parse_page(page_html, topic_id, known))
            if progress_callback:
                progress_callback(len(posts))

            # Parse remaining pages
            for page_num in range(2, page_count + 1):
                page_url = f'{thread_root}/page/{page_num}/'
                await asyncio.sleep(1)  # Rate limit

                page_html = await self._fetch_page(session, page_url)
                if page_html is None:
                    self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                    break

                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)
                if progress_callback:
                    progress_callback(len(posts))
                self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')
    except Exception as e:
        self.log(f"Error scraping Bellazon thread: {e}", 'error')

    self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
    return posts
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
    """Parse one page of thread HTML into Posts for comments with media.

    Args:
        page_html: Raw HTML of the page.
        topic_id: Thread ID, stored as the ``creator_id`` of each post.
        known: Post IDs to skip; IDs of returned posts are added to it.

    Returns:
        One Post per comment that is not in ``known`` and has at least
        one extracted attachment.
    """
    posts: List[Post] = []

    # Each comment starts with data-commentid="..." and contains a content block
    comment_pattern = re.compile(
        r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
        re.DOTALL
    )
    matches = list(comment_pattern.finditer(page_html))

    for i, match in enumerate(matches):
        post_id = f"comment_{match.group(1)}"
        if post_id in known:
            continue

        # Parse quote data for username and timestamp
        username, timestamp = self._parse_quotedata(match.group(2))

        # A comment's HTML runs up to the next comment marker (or page end)
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
        content_block = page_html[start:end]

        # Find the actual content within data-role="commentContent"
        # The closing pattern is </div> followed by blank lines then </div>
        content_match = re.search(
            r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
            content_block, re.DOTALL
        )
        if not content_match:
            # Fallback: grab everything from commentContent to ipsEntry__foot
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                content_block, re.DOTALL
            )
        if not content_match:
            continue

        attachments = self._extract_media(content_match.group(1))
        if not attachments:
            continue  # Skip text-only replies

        # Build published_at from timestamp
        published_at = None
        if timestamp:
            try:
                published_at = datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat()
            except (TypeError, ValueError, OverflowError, OSError):
                # A non-numeric or out-of-range timestamp must not abort
                # the whole page; the post is kept without a date.
                pass

        posts.append(Post(
            post_id=post_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=topic_id,
            title='',
            content=f"Posted by {username}" if username else '',
            published_at=published_at,
            attachments=attachments,
        ))
        known.add(post_id)

    return posts
def _extract_media(self, content_html: str) -> List[Attachment]:
    """Collect image and video attachments from one comment's HTML.

    Four passes run in order; a URL already taken by an earlier pass is
    not added again:
    1. image attachment links (``ipsAttachLink_image``),
    2. other links into ``bellazon.com/main/uploads/`` with a known
       image/video extension,
    3. ``<source>`` tags with a ``video/`` type,
    4. ``attachment.php?id=`` links whose link text has a known extension.
    """
    found: List[Attachment] = []
    seen: set = set()

    def is_thumbnail(candidate: str) -> bool:
        return '_thumb.' in candidate or '.thumb.' in candidate

    # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
    image_links = re.finditer(
        r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
        content_html
    )
    for hit in image_links:
        link = self._normalize_url(hit.group(1))
        # Skip duplicates, and thumbnails used as the full URL
        if link in seen or is_thumbnail(link):
            continue
        seen.add(link)
        found.append(self._make_attachment(link, 'image'))

    # 2. Direct image/video links from bellazon uploads not caught by pattern 1
    upload_links = re.finditer(
        r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
        content_html
    )
    for hit in upload_links:
        link = self._normalize_url(hit.group(1))
        if link in seen or is_thumbnail(link):
            continue
        ext = self._get_extension(link)
        if ext in self.IMAGE_EXTS:
            kind = 'image'
        elif ext in self.VIDEO_EXTS:
            kind = 'video'
        else:
            continue
        seen.add(link)
        found.append(self._make_attachment(link, kind))

    # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
    video_sources = re.finditer(
        r'<source\s+src="([^"]+)"[^>]*type="video/',
        content_html
    )
    for hit in video_sources:
        link = self._normalize_url(hit.group(1))
        if link in seen:
            continue
        seen.add(link)
        found.append(self._make_attachment(link, 'video', name=self._filename_from_url(link)))

    # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
    # These are protocol-relative URLs like //www.bellazon.com/main/applications/...
    attachment_links = re.finditer(
        r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
        content_html
    )
    for hit in attachment_links:
        link = self._normalize_url(hit.group(1))
        label = hit.group(2).strip()
        if link in seen:
            continue
        # The URL has no usable extension; classify by the link text
        ext = self._get_extension(label)
        if ext in self.VIDEO_EXTS:
            kind = 'video'
        elif ext in self.IMAGE_EXTS:
            kind = 'image'
        else:
            continue
        seen.add(link)
        found.append(self._make_attachment(link, kind, name=label))

    return found
def _make_attachment(self, url: str, file_type: str, name: str = None) -> Attachment:
    """Build an Attachment for ``url``.

    The name defaults to the last path segment of the URL; the extension
    is taken from the name and is None when the name has none.
    """
    filename = self._filename_from_url(url) if name is None else name
    return Attachment(
        name=filename,
        file_type=file_type,
        extension=self._get_extension(filename) or None,
        server_path=url,  # Used as dedup key
        download_url=url,
    )
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch one page and return its HTML.

    Returns None, after logging a warning, when the status is not 200 or
    the request raises.
    """
    try:
        async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return None
@staticmethod
def _extract_slug(url: str, topic_id: str) -> str:
"""Extract slug from URL like /topic/39089-india-reynolds/"""
m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
if m:
return m.group(1).strip('/')
return topic_id
@staticmethod
def _extract_title(page_html: str) -> Optional[str]:
"""Extract thread title from <h1>."""
m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
if m:
return html.unescape(m.group(1).strip())
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
# Remove site suffix
title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
return title
return None
@staticmethod
def _extract_page_count(page_html: str) -> int:
"""Extract total page count from 'Page X of Y'."""
m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
if m:
return int(m.group(1))
return 1
@staticmethod
def _parse_quotedata(raw: str) -> tuple:
"""Parse HTML-encoded JSON quotedata, return (username, unix_timestamp)."""
try:
decoded = html.unescape(raw)
data = json.loads(decoded)
return data.get('username', ''), data.get('timestamp')
except (json.JSONDecodeError, ValueError):
return '', None
@staticmethod
def _normalize_url(url: str) -> str:
"""Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
url = html.unescape(url) # &amp; → &
if url.startswith('//'):
url = 'https:' + url
elif url.startswith('/'):
url = 'https://www.bellazon.com' + url
elif not url.startswith('http'):
url = 'https://www.bellazon.com/main/' + url
return url
@staticmethod
def _get_extension(filename_or_url: str) -> str:
"""Get lowercase file extension from a filename or URL."""
# Strip query params
clean = filename_or_url.split('?')[0].split('#')[0]
if '.' in clean.split('/')[-1]:
return clean.rsplit('.', 1)[-1].lower()
return ''
@staticmethod
def _filename_from_url(url: str) -> str:
"""Extract filename from URL path."""
path = urlparse(url).path
name = path.rstrip('/').split('/')[-1]
return name if name else 'unnamed'

View File

@@ -0,0 +1,468 @@
"""
BestEyeCandy.com Client for Paid Content
Scrapes celebrity photo galleries from BestEyeCandy.com.
Each celeb has a unique CID and paginated photo listings.
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
detail page to determine the pattern (server hostname + name format),
then construct all remaining URLs from photo IDs found on listing pages.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class BestEyeCandyClient(LoggingMixin):
    """Client for scraping BestEyeCandy.com celebrity photo galleries."""

    # Identifiers stamped on every Post this client produces; SERVICE_ID
    # is also the key used to look up stored cookies.
    SERVICE_ID = 'besteyecandy'
    PLATFORM = 'besteyecandy'

    BASE_URL = 'https://besteyecandy.com'

    # Browser-like headers sent with every page request
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
def __init__(self, unified_db=None, log_callback=None):
self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
self.unified_db = unified_db
# ------------------------------------------------------------------
# Cookie support
# ------------------------------------------------------------------
def _get_cookies(self) -> Optional[list]:
"""Load cookies from the scrapers table for besteyecandy."""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
(self.SERVICE_ID,))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
if isinstance(data, dict) and 'cookies' in data:
return data['cookies']
elif isinstance(data, list):
return data
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
    """Build an aiohttp CookieJar from a list of cookie dicts.

    Each dict may carry 'name', 'value', 'domain', 'path' and 'secure'.
    Entries without a name, or with a name that is not a legal cookie
    key, are skipped. A missing domain defaults to the host of BASE_URL
    and a missing path to '/'.
    """
    from http.cookies import CookieError, SimpleCookie

    default_domain = urlparse(self.BASE_URL).hostname or ''
    jar = aiohttp.CookieJar(unsafe=True)

    for cookie in cookies_list:
        name = cookie.get('name')
        if not name:
            continue
        value = cookie.get('value')

        morsels = SimpleCookie()
        try:
            morsels[name] = '' if value is None else value
        except CookieError as e:
            # One malformed entry must not discard the whole jar
            self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
            continue
        morsels[name]['domain'] = cookie.get('domain') or default_domain
        morsels[name]['path'] = cookie.get('path') or '/'
        if cookie.get('secure'):
            morsels[name]['secure'] = True

        # No response_url is passed: aiohttp expects a yarl.URL there, and
        # the urllib ParseResult used previously has no ``raw_host``
        # attribute, so the call raised AttributeError. Each morsel has a
        # non-empty domain (as long as BASE_URL has a host), which is what
        # scopes the cookie.
        jar.update_cookies(morsels)
    return jar
def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
    """Create an aiohttp session, with cookies from the DB when available.

    Args:
        timeout: Session timeout; defaults to a 60-second total timeout.
    """
    effective_timeout = timeout if timeout is not None else aiohttp.ClientTimeout(total=60)

    stored = self._get_cookies()
    if not stored:
        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=effective_timeout)

    jar = self._build_cookie_jar(stored)
    self.log(f"Loaded {len(stored)} cookies for session", 'debug')
    return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=jar)
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
    """Fetch page 1 of a celeb's listing and return profile-like info.

    Returns:
        A dict with ``username`` (the slug), ``display_name``,
        ``post_count`` (total photos), ``page_count`` and ``celeb_url``
        (the page-1 listing URL); or None when the request fails or does
        not return HTTP 200.
    """
    listing_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                   f'sortedby-age/page-1/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            async with session.get(listing_url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                             'warning')
                    return None
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
        return None

    # Name from the page, else a readable form of the slug
    display_name = self._extract_celeb_name(page_html) or celeb_slug.replace('-', ' ')

    # Extract total photos and pages
    photo_total = self._extract_total_photos(page_html)
    per_page = len(self._extract_photo_ids(page_html)) or 48
    pages = self._extract_page_count(page_html, photos_per_page=per_page)

    profile = {
        'username': celeb_slug,
        'display_name': display_name,
        'post_count': photo_total,
        'page_count': pages,
        'celeb_url': listing_url,
    }
    return profile
async def get_posts(self, cid: str, celeb_slug: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape all listing pages and return posts with full-res image URLs.

    Each listing page becomes one Post with one Attachment per photo.
    Post IDs are "page_N" (e.g. "page_1", "page_2", ...).

    Phase 1: Fetch page 1, get first photo ID, visit detail page to learn
    the full-res URL pattern.
    Phase 2: Paginate all listing pages, build one Post per page.

    Args:
        cid: Celeb ID used in the listing URL and as ``creator_id``.
        celeb_slug: Slug used in the listing URL.
        known_post_ids: Page-post IDs to skip (still counted in totals).
        progress_callback: Optional callable invoked once per page with a
            human-readable progress string.

    Returns:
        The new page-posts. An empty list when page 1 cannot be fetched
        or the URL pattern cannot be determined; errors are logged.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    total_photos = 0
    url_pattern = None

    def listing_url(number: int) -> str:
        return (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                f'sortedby-age/page-{number}/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            # -- Phase 1: Fetch page 1 and determine full-res URL pattern --
            page_html = await self._fetch_page(session, listing_url(1))
            if page_html is None:
                return []

            # Estimate page count for progress display
            first_page_ids = self._extract_photo_ids(page_html)
            photos_per_page = len(first_page_ids) or 48
            estimated_pages = self._extract_page_count(
                page_html, photos_per_page=photos_per_page)
            self.log(f"Estimated {estimated_pages} pages of photos "
                     f"({photos_per_page}/page)", 'info')

            # Discover full-res URL pattern from first photo
            if first_page_ids:
                url_pattern = await self._discover_url_pattern(
                    session, first_page_ids[0], cid, celeb_slug)
            if not url_pattern:
                self.log("Could not determine full-res URL pattern", 'error')
                return []
            self.log(f"URL pattern: server={url_pattern['server']}, "
                     f"name_format={url_pattern['name_format']}, "
                     f"ext={url_pattern['ext']}", 'info')

            # -- Phase 2: Paginate all pages, one Post per page --
            page_num = 0
            has_next = True  # start with page 1
            while has_next:
                page_num += 1
                if page_num > 1:
                    # Page 1 is already in page_html from phase 1
                    await asyncio.sleep(2)  # Rate limit
                    page_html = await self._fetch_page(session, listing_url(page_num))
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping",
                                 'warning')
                        break

                page_ids = self._extract_photo_ids(page_html)
                if not page_ids:
                    self.log(f"Page {page_num}: no photos, stopping", 'info')
                    break

                total_photos += len(page_ids)
                has_next = self._has_next_page(page_html)

                # Check if this page-post is already known
                post_id = f"page_{page_num}"
                if post_id in known:
                    self.log(f"Page {page_num}: already known, skipping",
                             'debug')
                    if progress_callback:
                        # ": " separates the page fraction from the photo
                        # count; without it the two numbers ran together.
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages}: "
                            f"{total_photos} photos (skipped known)")
                    continue

                # Build attachments for all photos on this page
                attachments = []
                for photo_id in page_ids:
                    dl_url = self._construct_full_res_url(url_pattern, photo_id)
                    attachments.append(Attachment(
                        name=dl_url.rsplit('/', 1)[-1],
                        file_type='image',
                        extension=url_pattern.get('ext', 'jpg'),
                        server_path=dl_url,
                        download_url=dl_url,
                    ))

                posts.append(Post(
                    post_id=post_id,
                    service_id=self.SERVICE_ID,
                    platform=self.PLATFORM,
                    creator_id=cid,
                    title=f"Page {page_num}",
                    content=f"{len(page_ids)} photos",
                    published_at=datetime.now(tz=timezone.utc).isoformat(),
                    attachments=attachments,
                ))

                if progress_callback:
                    progress_callback(
                        f"Page {page_num}/~{estimated_pages}: "
                        f"{total_photos} photos")
                self.log(f"Page {page_num}/~{estimated_pages}: "
                         f"{len(page_ids)} photos", 'debug')
    except Exception as e:
        self.log(f"Error scraping BestEyeCandy: {e}", 'error')

    self.log(f"Total: {len(posts)} new page-posts with "
             f"{total_photos} photos across all pages", 'info')
    return posts
# ------------------------------------------------------------------
# URL pattern discovery
# ------------------------------------------------------------------
async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                photo_id: str, cid: str,
                                celeb_slug: str) -> Optional[Dict]:
    """Learn how full-resolution image URLs are built for a gallery.

    Fetches the detail page of one sample photo, locates the full-res image
    URL in its HTML and converts that URL into a template in which every
    occurrence of the photo ID is replaced by the ``{ID}`` placeholder.

    Args:
        session: Open aiohttp session used for the request.
        photo_id: ID of the sample photo whose detail page is visited.
        cid: ID placed in the ``cid-...`` segment of the detail URL.
        celeb_slug: Slug placed in the detail URL.

    Returns:
        Dict with keys server, dir_pattern, name_format, ext and
        example_url, or None when the page cannot be fetched or no
        full-res URL is found in it.
    """
    detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                  f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')
    await asyncio.sleep(2)  # Rate limit
    detail_html = await self._fetch_page(session, detail_url)
    if detail_html is None:
        return None

    # Candidate expressions, tried in order; the first hit wins.
    # Expected shape of the target:
    #   https://euX.besteyecandy.com/section/large-photos/area-female/
    #       besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg
    candidate_res = (
        r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
        r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
    )
    hits = (re.search(expr, detail_html) for expr in candidate_res)
    found = next((hit for hit in hits if hit), None)
    if found is None:
        self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                 'error')
        return None

    full_res_url = found.group(1)
    self.log(f"Found full-res URL: {full_res_url}", 'debug')

    url_parts = urlparse(full_res_url)

    # Last path segment, e.g. Myleene_Klass_7727820_BestEyeCandyCOM.jpg
    filename = url_parts.path.rpartition('/')[2]
    stem, dot, suffix = filename.rpartition('.')
    if dot:
        ext, name_without_ext = suffix, stem
    else:
        # No extension in the filename: assume jpg and keep the whole name.
        ext, name_without_ext = 'jpg', filename

    # Directory part, e.g. /section/large-photos/area-female/besteyecandy-7727820
    path_dir = url_parts.path.rsplit('/', 1)[0]

    return {
        'server': url_parts.netloc,  # e.g. eu4.besteyecandy.com
        'dir_pattern': path_dir.replace(photo_id, '{ID}'),
        'name_format': name_without_ext.replace(photo_id, '{ID}'),
        'ext': ext,
        'example_url': full_res_url,
    }
def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
"""Construct the full-res URL for a photo ID using the discovered pattern."""
dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
return f"https://{url_pattern['server']}{dir_path}/{filename}"
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _extract_photo_ids(self, page_html: str) -> List[str]:
"""Extract photo IDs from a listing page.
Photo links look like: href="...photo-12345.html"
"""
ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
# Deduplicate while preserving order
seen = set()
unique_ids = []
for pid in ids:
if pid not in seen:
seen.add(pid)
unique_ids.append(pid)
return unique_ids
@staticmethod
def _extract_celeb_name(page_html: str) -> Optional[str]:
"""Extract celebrity name from the page."""
# Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
# Remove everything from "Photo Collection" or "@" onwards
title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
flags=re.IGNORECASE).strip()
title = re.sub(r'\s*@.*$', '', title).strip()
# Fallback: remove BestEyeCandy suffix
title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
title, flags=re.IGNORECASE).strip()
if title:
return title
# Try <h1> or <h2>
m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
if m:
return html.unescape(m.group(1).strip())
return None
@staticmethod
def _extract_total_photos(page_html: str) -> int:
"""Extract total photo count from the page.
Handles European format (15.660) and US format (15,660).
"""
# Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
# Require leading digit to avoid matching ", photo" from keywords
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
if m:
num_str = m.group(1)
# European format uses dots as thousands separators: 15.660
# US format uses commas: 15,660
# Remove both dots and commas (they're thousands separators)
num_str = num_str.replace('.', '').replace(',', '')
try:
return int(num_str)
except ValueError:
pass
return 0
@staticmethod
def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
"""Extract total page count from the listing page.
Uses total photo count divided by photos per page, or falls back
to finding the maximum page number in pagination links.
"""
# Method 1: Calculate from total photos
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
if m:
num_str = m.group(1).replace('.', '').replace(',', '')
try:
total = int(num_str)
if total > 0:
return (total + photos_per_page - 1) // photos_per_page
except ValueError:
pass
# Method 2: Find max page-N in pagination links for same celeb
page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
if page_nums:
return max(page_nums)
return 1
@staticmethod
def _has_next_page(page_html: str) -> bool:
"""Check if there's a 'Next Page' link on the current page."""
return 'alt="Next Page"' in page_html
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession,
                      url: str) -> Optional[str]:
    """Download one page and hand back its HTML.

    Redirects are followed. Any status other than 200 and any exception
    raised during the request are logged as warnings.

    Returns:
        The response text, or None on a non-200 status or an error.
    """
    try:
        async with session.get(url, headers=self.HEADERS,
                               allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
    return None

View File

@@ -0,0 +1,622 @@
"""
Coppermine Gallery scraper client.
Coppermine is a PHP photo gallery with a nested structure:
categories > sub-categories > albums > photos
One album maps to one Post with N Attachments.
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
"""
import asyncio
import re
from datetime import datetime
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
# Scraper client for Coppermine photo galleries: crawls category and album
# pages over HTTP and builds one Post per album.
class CoppermineClient(LoggingMixin):
# Identifiers written into every Post this client builds (see _fetch_album).
SERVICE_ID = 'coppermine'
PLATFORM = 'coppermine'
# Request headers sent with every page fetch (see _fetch_page).
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
}
# Lowercase extensions classified as file_type 'image' in _extract_attachments;
# anything else is labelled 'unknown'.
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
# log_callback is forwarded unchanged to the logger set-up below.
def __init__(self, log_callback=None):
# NOTE(review): _init_logger is presumably provided by LoggingMixin - confirm.
self._init_logger('PaidContent', log_callback, default_module='Coppermine')
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
    """Read the gallery front page and summarise it as profile metadata.

    Args:
        gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)

    Returns:
        Dict with username (the domain without "www."), display_name (the
        cleaned page title), post_count (album count from the statistics
        line, 0 if absent) and gallery_url; None when the page cannot be
        fetched or an error occurs.
    """
    root_url = self._build_url(gallery_url, 'index.php')
    timeout = aiohttp.ClientTimeout(total=30)
    # Entity clean-up for the title, applied in this order: three common
    # entities are decoded, every other entity is removed.
    entity_rules = (
        (r'&amp;', '&'),
        (r'&lt;', '<'),
        (r'&gt;', '>'),
        (r'&#\d+;', ''),
        (r'&\w+;', ''),
    )
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            html = await self._fetch_page(session, root_url)
            if not html:
                return None

            # Site title comes from the <title> tag when there is one.
            title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
            if title_match:
                site_title = title_match.group(1).strip()
            else:
                site_title = 'Coppermine Gallery'
            for pattern, replacement in entity_rules:
                site_title = re.sub(pattern, replacement, site_title)

            # Statistics line: "N files in M albums"; only M is reported.
            album_count = 0
            stats_match = re.search(
                r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
                html, re.IGNORECASE
            )
            if stats_match:
                album_count = int(stats_match.group(2).replace(',', ''))

            return {
                'username': urlparse(gallery_url).netloc.replace('www.', ''),
                'display_name': site_title,
                'post_count': album_count,
                'gallery_url': gallery_url,
            }
    except Exception as e:
        self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
        return None
async def get_posts(self, gallery_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None,
                    post_callback=None):
    """Crawl the gallery and deliver every new album as a Post.

    The crawl runs in three phases:
      1. fetch the root page and read its category links,
      2. walk those categories recursively to collect album IDs,
      3. fetch each album not yet known and deliver its Post.

    Args:
        gallery_url: Base gallery URL
        known_post_ids: Post IDs already stored (format ``album_NNN``);
            matching albums are not fetched.
        progress_callback: Called with status message strings
        post_callback: async callable(post), awaited for each album as
            soon as it is fetched. When given, nothing is collected.

    Returns:
        List of Post objects when post_callback is None (empty on
        failure); None when post_callback is given.
    """
    known = known_post_ids or set()
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
    collecting = post_callback is None
    posts_collected = [] if collecting else None

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Phase 1: category links on the root page
            root_html = await self._fetch_page(
                session, self._build_url(gallery_url, 'index.php'))
            if not root_html:
                self.log("Failed to fetch gallery root", 'error')
                return [] if collecting else None

            category_ids = self._extract_category_ids(root_html)
            self.log(f"Found {len(category_ids)} top-level categories", 'info')
            if progress_callback:
                progress_callback(f'Found {len(category_ids)} categories, crawling...')

            # Phase 2: recursive category walk; visited_cats is shared
            # across the walk so no category is crawled twice.
            album_ids = set()
            visited_cats = set()
            for cat_id in category_ids:
                album_ids.update(await self._crawl_category(
                    session, gallery_url, cat_id, visited_cats, known, progress_callback
                ))

            # Keep only albums whose post ID is not already known
            new_album_ids = {aid for aid in album_ids
                             if f"album_{aid}" not in known}
            total_count = len(album_ids)
            new_count = len(new_album_ids)
            self.log(f"Found {new_count} new albums "
                     f"({total_count} total, {total_count - new_count} known)",
                     'info')
            if progress_callback:
                progress_callback(f'Found {new_count} new albums, fetching photos...')

            # Phase 3: fetch and deliver each new album
            domain = urlparse(gallery_url).netloc.replace('www.', '')
            fetched = 0
            for position, album_id in enumerate(sorted(new_album_ids), start=1):
                # Progress is reported on every fifth album only.
                if progress_callback and position % 5 == 0:
                    progress_callback(
                        f'Fetching album {position}/{new_count}...'
                    )
                post = await self._fetch_album(session, gallery_url, album_id, domain)
                if post and post.attachments:
                    fetched += 1
                    if collecting:
                        posts_collected.append(post)
                    else:
                        await post_callback(post)
                # Rate limit: pause 2s after every album
                await asyncio.sleep(2)

            self.log(f"Fetched {fetched} albums with attachments", 'info')
            return posts_collected
    except Exception as e:
        self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
        return [] if collecting else None
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _build_url(self, gallery_url: str, page: str) -> str:
"""Build a full URL from the gallery base and a page name."""
base = gallery_url.rstrip('/')
return f"{base}/{page}"
async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                      max_retries: int = 3) -> Optional[str]:
    """Fetch a page and return its HTML text, or None on failure.

    Up to ``max_retries`` attempts are made. A 429 response or a dropped
    connection triggers another attempt after a pause that grows linearly
    with the attempt number (5s steps for 429, 3s steps for connection
    errors). Any other non-200 status or unexpected exception ends the
    fetch immediately.
    """
    transient_errors = (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                        aiohttp.ClientPayloadError, ConnectionResetError)
    for attempt_no in range(1, max_retries + 1):
        try:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 200:
                    return await resp.text()
                if resp.status != 429:
                    self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                    return None
                # 429: back off, then fall through to the next attempt.
                wait = 5 * attempt_no
                self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
                await asyncio.sleep(wait)
        except transient_errors as e:
            if attempt_no >= max_retries:
                self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                return None
            wait = 3 * attempt_no
            self.log(f"Connection error on {url}, retry {attempt_no} in {wait}s: {e}",
                     'warning')
            await asyncio.sleep(wait)
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
    # Every attempt was rate limited (or max_retries was not positive).
    return None
def _extract_category_ids(self, html: str) -> List[str]:
"""Extract category IDs from index.php page.
Looks for links like: index.php?cat=N
"""
cat_ids = []
seen = set()
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
cat_id = match.group(1)
if cat_id not in seen:
seen.add(cat_id)
cat_ids.append(cat_id)
return cat_ids
def _extract_album_ids(self, html: str) -> List[str]:
"""Extract album IDs from a category page.
Looks for links like: thumbnails.php?album=N
"""
album_ids = []
seen = set()
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
album_id = match.group(1)
if album_id not in seen:
seen.add(album_id)
album_ids.append(album_id)
return album_ids
def _extract_page_count(self, html: str) -> int:
"""Extract total page count from Coppermine pagination text.
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
"""
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
if match:
return int(match.group(1))
return 1
async def _crawl_category(self, session: aiohttp.ClientSession,
                          gallery_url: str, cat_id: str,
                          visited: Set[str], known: Set[str],
                          progress_callback=None,
                          depth: int = 0) -> Set[str]:
    """Collect every album ID reachable from one category.

    A category page may list albums (thumbnails.php?album=N),
    sub-categories (index.php?cat=N), or both. Albums are gathered from
    all pages of the category (index.php?cat=N&page=M) and the walk then
    descends into each sub-category not yet visited.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        cat_id: Category ID to crawl
        visited: Category IDs already crawled; updated in place so loops
            and repeat visits are avoided
        known: Known post IDs; only handed on to recursive calls here
        progress_callback: Status callback
        depth: Current nesting level; levels beyond 10 are not crawled

    Returns:
        Set of album ID strings (empty when the category is skipped or its
        first page cannot be fetched)
    """
    if depth > 10 or cat_id in visited:
        return set()
    visited.add(cat_id)

    first_html = await self._fetch_page(
        session, self._build_url(gallery_url, f'index.php?cat={cat_id}'))
    if not first_html:
        return set()
    await asyncio.sleep(2)

    album_ids = set(self._extract_album_ids(first_html))

    # Remaining pages contribute albums only; the sub-category list is
    # taken from the first page.
    total_pages = self._extract_page_count(first_html)
    for page_num in range(2, total_pages + 1):
        page_url = self._build_url(
            gallery_url, f'index.php?cat={cat_id}&page={page_num}'
        )
        page_html = await self._fetch_page(session, page_url)
        if page_html:
            album_ids.update(self._extract_album_ids(page_html))
        await asyncio.sleep(2)

    # Drop self-links and categories that were already crawled
    sub_cat_ids = [c for c in self._extract_category_ids(first_html)
                   if not (c == cat_id or c in visited)]

    if progress_callback:
        pages_note = f' ({total_pages} pages)' if total_pages > 1 else ''
        progress_callback(
            f'Category {cat_id}: {len(album_ids)} albums, '
            f'{len(sub_cat_ids)} sub-categories'
            + pages_note
        )

    for sub_id in sub_cat_ids:
        album_ids.update(await self._crawl_category(
            session, gallery_url, sub_id, visited, known,
            progress_callback, depth + 1
        ))
    return album_ids
async def _fetch_album(self, session: aiohttp.ClientSession,
                       gallery_url: str, album_id: str,
                       domain: str) -> Optional[Post]:
    """Download every page of an album and wrap it in a Post.

    Pages beyond the first are requested as
    thumbnails.php?album=N&page=M, with a 2s pause after each.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        album_id: Album ID to fetch
        domain: Domain name, stored as the Post's creator_id

    Returns:
        Post whose ``content`` holds the album title and whose post_id is
        ``album_<id>``; None when the first page cannot be fetched or no
        attachments are found.
    """
    first_html = await self._fetch_page(
        session, self._build_url(gallery_url, f'thumbnails.php?album={album_id}'))
    if not first_html:
        return None

    # Title falls back to a generic label when the page yields none
    title = self._extract_album_title(first_html) or f"Album {album_id}"

    attachments = self._extract_attachments(first_html, gallery_url)
    last_page = self._extract_page_count(first_html)
    for page_num in range(2, last_page + 1):
        page_url = self._build_url(
            gallery_url, f'thumbnails.php?album={album_id}&page={page_num}'
        )
        page_html = await self._fetch_page(session, page_url)
        if page_html:
            attachments.extend(self._extract_attachments(page_html, gallery_url))
        await asyncio.sleep(2)

    if not attachments:
        return None

    # Date is derived from the first page plus the title
    return Post(
        post_id=f"album_{album_id}",
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=domain,
        title=None,
        content=title,
        published_at=self._extract_album_date(first_html, title),
        attachments=attachments,
    )
def _extract_album_title(self, html: str) -> Optional[str]:
"""Extract album title from page HTML.
Priority: breadcrumb last item > <h1>/<h2> heading > <title> last segment
"""
# Try breadcrumb: last text segment after the last ">"
# Coppermine breadcrumbs: "Home > Category > Sub > Album Title"
bc_match = re.search(
r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
html, re.DOTALL | re.IGNORECASE
)
if bc_match:
bc_text = bc_match.group(1)
# Strip HTML tags, split on ">", take last segment
bc_text = re.sub(r'<[^>]+>', ' ', bc_text)
parts = [p.strip() for p in bc_text.split('>') if p.strip()]
if parts:
title = self._clean_text(parts[-1])
if title and title.lower() not in ('home', 'index', 'gallery'):
return title
# Try headings
for tag in ('h1', 'h2', 'h3'):
h_match = re.search(
rf'<{tag}[^>]*>(.*?)</{tag}>', html, re.DOTALL | re.IGNORECASE
)
if h_match:
title = self._clean_text(h_match.group(1))
if title and len(title) > 2:
return title
# Fallback: <title> tag — take the last segment before the site name
title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
if title_match:
title = title_match.group(1).strip()
# Usually "Site Name - Album Title" or "Album Title - Site Name"
# The album-specific part is typically not the site name;
# use the longest segment as a heuristic
if ' - ' in title:
parts = [p.strip() for p in title.split(' - ')]
# Pick the longest part (album names tend to be longer than site names)
title = max(parts, key=len)
if title:
return self._clean_text(title)
return None
def _extract_album_date(self, html: str, title: str) -> str:
"""Extract album date from breadcrumb year + title month/day.
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
Title: "January 11 - Leaving Golden Globes afterparty..."
Returns ISO date string, or current datetime as fallback.
"""
MONTHS = {
'january': 1, 'february': 2, 'march': 3, 'april': 4,
'may': 5, 'june': 6, 'july': 7, 'august': 8,
'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
# Extract year from breadcrumb path (look for 4-digit year in links)
year = None
# Breadcrumb links: index.php?cat=155">2026</a>
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
year = int(m.group(1))
# Also try path segments in albums/ URLs for year
if not year:
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
if path_match:
year = int(path_match.group(1))
# Extract month and day from album title
month, day = None, None
if title:
# "January 11 - ..." or "March 3 - ..."
date_match = re.match(
r'(\w+)\s+(\d{1,2})\b', title
)
if date_match:
month_name = date_match.group(1).lower()
if month_name in MONTHS:
month = MONTHS[month_name]
day = int(date_match.group(2))
# Build date from breadcrumb year + title month/day
if year and month and day:
try:
return datetime(year, month, day).isoformat()
except ValueError:
pass
if year and month:
try:
return datetime(year, month, 1).isoformat()
except ValueError:
pass
if year:
return datetime(year, 1, 1).isoformat()
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
MONTH_ABBR = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
added_match = re.search(
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if added_match:
m_abbr = added_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(added_match.group(3)),
MONTH_ABBR[m_abbr],
int(added_match.group(2))
).isoformat()
except ValueError:
pass
# Also try "last one added on Jan 13, 2026" from album_stat
stat_match = re.search(
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if stat_match:
m_abbr = stat_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(stat_match.group(3)),
MONTH_ABBR[m_abbr],
int(stat_match.group(2))
).isoformat()
except ValueError:
pass
return datetime.now().isoformat()
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
    """Build the attachment list for one album page.

    Thumbnail images are located in the HTML and each is mapped to its
    full-resolution URL via ``_thumb_to_fullres``. Two lookups are used:

      1. ``<img>`` tags whose src lies under ``albums/`` and contains
         ``thumb_`` or ``normal_``;
      2. only if the first finds nothing: any ``<img>`` that directly
         follows a link to ``displayimage.php`` (some themes wrap
         thumbnails this way).

    Each full-res URL is reported once.
    """
    lookups = (
        (r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
         re.IGNORECASE),
        (r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
         r'\s*<img[^>]+src=["\']([^"\']+)["\']',
         re.IGNORECASE | re.DOTALL),
    )
    attachments = []
    seen_urls = set()
    for pattern, flags in lookups:
        for found in re.finditer(pattern, html, flags):
            full_url = self._thumb_to_fullres(found.group(1), gallery_url)
            if not full_url or full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            filename = full_url.rpartition('/')[2]
            _, dot, suffix = filename.rpartition('.')
            ext = suffix.lower() if dot else ''
            attachments.append(Attachment(
                name=filename,
                server_path=full_url,  # use as dedup key
                file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
                extension=ext or None,
                download_url=full_url,
            ))
        if attachments:
            # The second lookup is only a fallback
            break
    return attachments
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
"""Convert a thumbnail URL to a full-resolution URL.
Strips `thumb_` or `normal_` prefix from the filename and
prepends the gallery base URL if needed.
Args:
thumb_src: Thumbnail src attribute value
gallery_url: Base gallery URL
Returns:
Full-resolution image URL, or None if conversion fails
"""
if not thumb_src:
return None
# Strip thumb_ or normal_ prefix from filename
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
# If the path is already absolute (starts with http), return as-is
if fullres_path.startswith(('http://', 'https://')):
return fullres_path
# Otherwise, make it absolute relative to gallery URL
base = gallery_url.rstrip('/')
fullres_path = fullres_path.lstrip('./')
return f"{base}/{fullres_path}"
def _clean_text(self, text: str) -> str:
"""Clean HTML entities and whitespace from text."""
text = re.sub(r'&amp;', '&', text)
text = re.sub(r'&lt;', '<', text)
text = re.sub(r'&gt;', '>', text)
text = re.sub(r'&quot;', '"', text)
text = re.sub(r'&#\d+;', '', text)
text = re.sub(r'&\w+;', '', text)
text = re.sub(r'<[^>]+>', '', text)
return text.strip()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,297 @@
"""
Embed Downloader - Downloads embedded videos from posts using yt-dlp
Supports: YouTube, Vimeo, Dailymotion, Twitch, and many other platforms
"""
import asyncio
import json
import os
import subprocess
from pathlib import Path
from typing import Dict, Optional
from modules.base_module import LoggingMixin
class EmbedDownloader(LoggingMixin):
"""
Download embedded videos from posts using yt-dlp
Wrapper around yt-dlp for downloading videos from various platforms
embedded in creator posts.
"""
# Quality presets for yt-dlp
# Maps a preset name to the format selector passed to yt-dlp with -f;
# download() falls back to the 'best' entry for unrecognised names.
QUALITY_PRESETS = {
'best': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
'1080p': 'bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/best[height<=1080][ext=mp4]/best',
'720p': 'bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best',
'480p': 'bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]/best',
'audio': 'bestaudio[ext=m4a]/bestaudio/best',
}
# ytdlp_path: explicit path to the yt-dlp executable; when omitted (or empty)
# the executable is searched for with _find_ytdlp().
# log_callback: forwarded unchanged to the logger set-up.
def __init__(self, ytdlp_path: str = None, log_callback=None):
# NOTE(review): _init_logger is presumably provided by LoggingMixin - confirm.
self._init_logger('PaidContent', log_callback, default_module='Embed')
# Find yt-dlp executable
self.ytdlp_path = ytdlp_path or self._find_ytdlp()
# A missing executable is not fatal: is_available() then reports False and
# download()/get_video_info() return an error dict.
if not self.ytdlp_path:
self.log("yt-dlp not found, embed downloading will be disabled", 'warning')
def _find_ytdlp(self) -> Optional[str]:
"""Find yt-dlp executable"""
# Check common locations
common_paths = [
'/usr/local/bin/yt-dlp',
'/usr/bin/yt-dlp',
'/opt/homebrew/bin/yt-dlp',
os.path.expanduser('~/.local/bin/yt-dlp'),
]
for path in common_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
# Try to find via which
try:
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def is_available(self) -> bool:
"""Check if yt-dlp is available"""
return self.ytdlp_path is not None
async def download(self, url: str, output_dir: Path, quality: str = 'best',
filename_template: str = None) -> Dict:
"""
Download video from URL
Args:
url: Video URL to download
output_dir: Directory to save the video
quality: Quality preset ('best', '1080p', '720p', '480p', 'audio')
filename_template: Optional custom filename template
Returns:
Dict with success status and file info
"""
if not self.is_available():
return {
'success': False,
'error': 'yt-dlp not available'
}
try:
# Create output directory
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Build output template
if filename_template:
output_template = str(output_dir / filename_template)
else:
output_template = str(output_dir / 'embed_%(title).50s_%(id)s.%(ext)s')
# Get format string
format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])
# Build command
cmd = [
self.ytdlp_path,
'--no-playlist',
'--no-warnings',
'-f', format_str,
'--merge-output-format', 'mp4',
'-o', output_template,
'--print-json', # Output JSON with video info
url
]
self.log(f"Downloading embed: {url}", 'debug')
# Run yt-dlp
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode != 0:
error_msg = stderr.decode('utf-8', errors='replace').strip()
# Try to extract useful error message
if 'Video unavailable' in error_msg:
error_msg = 'Video unavailable or private'
elif 'age-restricted' in error_msg.lower():
error_msg = 'Video is age-restricted'
elif 'members only' in error_msg.lower():
error_msg = 'Video is members-only'
elif len(error_msg) > 200:
error_msg = error_msg[:200] + '...'
self.log(f"yt-dlp failed: {error_msg}", 'warning')
return {
'success': False,
'error': error_msg or f'yt-dlp exited with code {result.returncode}'
}
# Parse output JSON
stdout_text = stdout.decode('utf-8', errors='replace')
video_info = None
for line in stdout_text.strip().split('\n'):
try:
video_info = json.loads(line)
break
except json.JSONDecodeError:
continue
if not video_info:
# Try to find the downloaded file
files = list(output_dir.glob('embed_*'))
if files:
file_path = files[0]
return {
'success': True,
'file_path': str(file_path),
'filename': file_path.name,
'file_size': file_path.stat().st_size if file_path.exists() else None
}
return {
'success': False,
'error': 'Could not parse yt-dlp output'
}
# Extract file info
file_path = video_info.get('_filename') or video_info.get('filename')
# Handle potential path issues
if file_path:
file_path = Path(file_path)
if not file_path.exists():
# Try to find the file
possible_files = list(output_dir.glob(f"*{video_info.get('id', '')}*"))
if possible_files:
file_path = possible_files[0]
return {
'success': True,
'file_path': str(file_path) if file_path else None,
'filename': file_path.name if file_path else None,
'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
'title': video_info.get('title'),
'duration': video_info.get('duration'),
'uploader': video_info.get('uploader'),
'upload_date': video_info.get('upload_date'),
'video_id': video_info.get('id'),
'platform': video_info.get('extractor_key', video_info.get('extractor', 'unknown')).lower()
}
except asyncio.TimeoutError:
return {
'success': False,
'error': 'Download timed out'
}
except Exception as e:
self.log(f"Error downloading embed: {e}", 'error')
return {
'success': False,
'error': str(e)
}
async def get_video_info(self, url: str) -> Dict:
"""
Get video information without downloading
Args:
url: Video URL
Returns:
Dict with video metadata
"""
if not self.is_available():
return {'success': False, 'error': 'yt-dlp not available'}
try:
cmd = [
self.ytdlp_path,
'--no-playlist',
'--no-warnings',
'-j', # Output JSON
'--no-download',
url
]
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode != 0:
error_msg = stderr.decode('utf-8', errors='replace').strip()
return {
'success': False,
'error': error_msg or f'yt-dlp exited with code {result.returncode}'
}
video_info = json.loads(stdout.decode('utf-8'))
return {
'success': True,
'title': video_info.get('title'),
'duration': video_info.get('duration'),
'uploader': video_info.get('uploader'),
'upload_date': video_info.get('upload_date'),
'view_count': video_info.get('view_count'),
'like_count': video_info.get('like_count'),
'description': video_info.get('description'),
'thumbnail': video_info.get('thumbnail'),
'video_id': video_info.get('id'),
'platform': video_info.get('extractor_key', video_info.get('extractor', 'unknown')).lower(),
'formats': len(video_info.get('formats', []))
}
except Exception as e:
self.log(f"Error getting video info: {e}", 'error')
return {
'success': False,
'error': str(e)
}
@staticmethod
def detect_platform(url: str) -> Optional[str]:
"""Detect video platform from URL"""
url_lower = url.lower()
if 'youtube.com' in url_lower or 'youtu.be' in url_lower:
return 'youtube'
elif 'vimeo.com' in url_lower:
return 'vimeo'
elif 'dailymotion.com' in url_lower:
return 'dailymotion'
elif 'twitch.tv' in url_lower:
return 'twitch'
elif 'twitter.com' in url_lower or 'x.com' in url_lower:
return 'twitter'
elif 'tiktok.com' in url_lower:
return 'tiktok'
elif 'instagram.com' in url_lower:
return 'instagram'
elif 'reddit.com' in url_lower:
return 'reddit'
return None
@staticmethod
def is_supported_url(url: str) -> bool:
    """Return True when ``detect_platform`` recognises the URL's platform."""
    platform = EmbedDownloader.detect_platform(url)
    return platform is not None

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,529 @@
"""
Download files from external file hosting services
Supports: Bunkr, Pixeldrain, Gofile, Cyberdrop, FileDitch
"""
import asyncio
import re
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, parse_qs
import aiohttp
from modules.base_module import LoggingMixin, RateLimitMixin
class FileHostDownloader(LoggingMixin, RateLimitMixin):
    """
    Download files from various file hosting services
    Used for manual import of PPV content

    Each key of ``SUPPORTED_HOSTS`` doubles as the suffix of the handler
    method that ``download_url`` dispatches to (``_download_<key>``).
    """

    # Host key -> exact domains (without a leading "www.") that belong to it.
    SUPPORTED_HOSTS = {
        'bunkr': ['bunkr.sk', 'bunkr.si', 'bunkr.la', 'bunkrr.ru', 'bunkr.ph', 'bunkr.is', 'bunkr.ac', 'bunkr.cr'],
        'pixeldrain': ['pixeldrain.com'],
        'gofile': ['gofile.io'],
        'cyberdrop': ['cyberdrop.me', 'cyberdrop.to', 'cyberdrop.cc'],
        'fileditch': ['fileditchfiles.me', 'fileditch.me'],
    }

    # Bunkr CDN servers (food-themed) - try in order
    # Used both to locate a file by UUID and as download fallbacks.
    BUNKR_CDNS = [
        'i-soup.bunkr.ru',
        'i-burger.bunkr.ru',
        'i-pizza.bunkr.ru',
        'i-taco.bunkr.ru',
        'i-fries.bunkr.ru',
        'i-hotdog.bunkr.ru',
        'i-nachos.bunkr.ru',
        'i-sushi.bunkr.ru',
        'i-ramen.bunkr.ru',
        'i-curry.bunkr.ru',
        'i-kebab.bunkr.ru',
        'i-pasta.bunkr.ru',
        'i-steak.bunkr.ru',
        'i-salad.bunkr.ru',
        'i-sandwich.bunkr.ru',
        'i-waffle.bunkr.ru',
        'i-pancake.bunkr.ru',
        'i-donut.bunkr.ru',
        'i-cookie.bunkr.ru',
        'i-cake.bunkr.ru',
        'i-bacon.bunkr.ru',
        'i-cheese.bunkr.ru',
        'i-chicken.bunkr.ru',
        'i-fish.bunkr.ru',
        'i-noodle.bunkr.ru',
        'i-rice.bunkr.ru',
        'i-bread.bunkr.ru',
        'burger.bunkr.ru',
        'pizza.bunkr.ru',
        'milkshake.bunkr.ru',
    ]
def __init__(self, log_callback=None, progress_callback=None):
    """
    Set up logging, rate limiting and the optional progress hook.

    Args:
        log_callback: Passed through to ``_init_logger``.
        progress_callback: Optional callable invoked while a file is being
            downloaded, as ``progress_callback(downloaded_bytes,
            total_bytes, filename)``.
    """
    self._init_logger('PaidContent', log_callback, default_module='FileHost')
    self._init_rate_limiter(min_delay=1, max_delay=3)
    self.progress_callback = progress_callback  # Called with (downloaded_bytes, total_bytes, filename)
def detect_host(self, url: str) -> Optional[str]:
    """
    Detect which file host a URL belongs to.

    Args:
        url: URL to classify.

    Returns:
        The matching key of ``SUPPORTED_HOSTS`` (e.g. 'bunkr'), or None
        when the URL cannot be parsed or its host is not listed.
    """
    try:
        # .hostname is lower-cased and excludes port and userinfo, so
        # "https://gofile.io:443/d/x" still resolves to "gofile.io".
        host_name = urlparse(url).hostname or ''
    except (ValueError, TypeError, AttributeError):
        return None
    if not isinstance(host_name, str):
        # bytes input yields bytes components; treat as unsupported
        return None

    # Only a leading "www." is noise; never strip it from the middle.
    if host_name.startswith('www.'):
        host_name = host_name[len('www.'):]

    for host, domains in self.SUPPORTED_HOSTS.items():
        if host_name in domains:
            return host
    return None
def is_supported_url(self, url: str) -> bool:
    """Return True when ``detect_host`` maps the URL to a known file host."""
    matched_host = self.detect_host(url)
    return matched_host is not None
async def download_url(self, url: str, save_dir: Path) -> Dict:
    """
    Download the file(s) behind ``url`` into ``save_dir``.

    The host is resolved with ``detect_host`` and the work is delegated to
    the matching ``_download_<host>`` coroutine. ``save_dir`` is created
    (including parents) before the handler runs.

    Returns: {'success': bool, 'files': [paths], 'error': str}
    Any exception raised by the handler is logged and reported through
    the ``error`` field instead of propagating.
    """
    host_name = self.detect_host(url)
    if not host_name:
        return {'success': False, 'files': [], 'error': 'Unsupported host'}

    download_fn = getattr(self, f'_download_{host_name}', None)
    if not download_fn:
        return {'success': False, 'files': [], 'error': f'No handler for {host_name}'}

    try:
        target_dir = Path(save_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        outcome = await download_fn(url, target_dir)
    except Exception as exc:
        self.log(f"Error downloading from {host_name}: {exc}", 'error')
        return {'success': False, 'files': [], 'error': str(exc)}
    return outcome
async def _download_pixeldrain(self, url: str, save_dir: Path) -> Dict:
    """
    Download a single file or a list (album) from Pixeldrain.

    Supported URL shapes:
        https://pixeldrain.com/u/FILEID  - single file
        https://pixeldrain.com/l/LISTID  - list of files

    Args:
        url: Pixeldrain page URL.
        save_dir: Existing directory that receives the file(s).

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}.
        For a list, individual failures are logged and skipped. For a
        single file, a download failure propagates as an exception.
    """
    path_parts = urlparse(url).path.strip('/').split('/')
    if len(path_parts) < 2:
        return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}

    url_type, file_id = path_parts[0], path_parts[1]
    # An unknown URL kind must be rejected, not reported as a successful
    # download of zero files.
    if url_type not in ('u', 'l'):
        return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}

    files = []
    timeout = aiohttp.ClientTimeout(total=300)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        if url_type == 'u':
            # Single file
            api_url = f"https://pixeldrain.com/api/file/{file_id}/info"
            async with session.get(api_url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                info = await resp.json()

            download_url = f"https://pixeldrain.com/api/file/{file_id}"
            filename = info.get('name', f'{file_id}.bin')
            save_path = save_dir / self._sanitize_filename(filename)
            await self._download_file(session, download_url, save_path)
            files.append(str(save_path))
        else:
            # List (album)
            api_url = f"https://pixeldrain.com/api/list/{file_id}"
            async with session.get(api_url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                data = await resp.json()

            for i, item in enumerate(data.get('files', [])):
                # NOTE(review): called without await; if the mixin sleeps
                # synchronously this stalls the event loop - confirm.
                self._delay_between_items()
                item_id = item['id']
                filename = item.get('name', f'{i:03d}_{item_id}.bin')
                download_url = f"https://pixeldrain.com/api/file/{item_id}"
                save_path = save_dir / self._sanitize_filename(filename)
                try:
                    await self._download_file(session, download_url, save_path)
                    files.append(str(save_path))
                except Exception as e:
                    # Best effort: one broken item must not abort the album
                    self.log(f"Failed to download {filename}: {e}", 'warning')

    return {'success': True, 'files': files, 'error': None}
async def _download_gofile(self, url: str, save_dir: Path) -> Dict:
    """
    Download every file of a Gofile content page.

    Expected URL shape: https://gofile.io/d/CONTENTID

    Steps: create a guest account to obtain a token, list the content's
    children with that token, then download each child of type 'file'.
    Per-file failures are logged and skipped.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}
    """
    def failure(message):
        # Uniform error result
        return {'success': False, 'files': [], 'error': message}

    segments = urlparse(url).path.strip('/').split('/')
    if len(segments) < 2 or segments[0] != 'd':
        return failure('Invalid Gofile URL')
    content_id = segments[1]

    saved = []
    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
        # Guest account token (the API requires a POST for this)
        async with session.post('https://api.gofile.io/accounts') as token_resp:
            if token_resp.status != 200:
                return failure('Failed to get Gofile token')
            account = await token_resp.json()

        if account.get('status') != 'ok':
            return failure(f"Gofile API error: {account.get('status')}")

        token = account.get('data', {}).get('token')
        if not token:
            return failure('No Gofile token received')

        # The website token travels as a header (formerly a query param)
        auth_headers = {
            'Authorization': f'Bearer {token}',
            'x-website-token': '4fd6sg89d7s6',
        }
        listing_url = f"https://api.gofile.io/contents/{content_id}"
        async with session.get(listing_url, headers=auth_headers) as content_resp:
            if content_resp.status == 401:
                return failure('Gofile authentication failed - websiteToken may have changed')
            if content_resp.status != 200:
                return failure(f'Failed to get content: {content_resp.status}')
            listing = await content_resp.json()

        if listing.get('status') == 'error-notPremium':
            return failure('Gofile requires premium account for API access - try direct download')
        if listing.get('status') != 'ok':
            return failure(
                listing.get('data', {}).get('message', listing.get('status', 'Unknown error'))
            )

        children = listing.get('data', {}).get('children', {})
        for child_id, child in children.items():
            if child.get('type') != 'file':
                continue  # skip anything that is not a plain file
            self._delay_between_items()
            link = child.get('link')
            filename = child.get('name', f'{child_id}.bin')
            target = save_dir / self._sanitize_filename(filename)
            try:
                await self._download_file(session, link, target, headers=auth_headers)
            except Exception as exc:
                self.log(f"Failed to download {filename}: {exc}", 'warning')
            else:
                saved.append(str(target))

    return {'success': True, 'files': saved, 'error': None}
async def _download_cyberdrop(self, url: str, save_dir: Path) -> Dict:
    """
    Download a Cyberdrop album or a single file.

    URL shapes:
        https://cyberdrop.me/a/ALBUMID  - album page, scraped for file links
        anything else                   - fetched directly as one file

    In album mode individual failures are logged and skipped; in
    single-file mode a download failure propagates as an exception.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}
    """
    saved = []
    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
        parsed = urlparse(url)
        segments = parsed.path.strip('/').split('/')
        is_album = len(segments) >= 2 and segments[0] == 'a'

        if not is_album:
            # Single file or direct CDN link
            filename = parsed.path.split('/')[-1] or 'download.bin'
            target = save_dir / self._sanitize_filename(filename)
            await self._download_file(session, url, target)
            saved.append(str(target))
        else:
            async with session.get(url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                html = await resp.text()

            # File links look like href="https://fs-XXX.cyberdrop.to/FILE"
            cdn_pattern = r'href="(https://[a-z0-9-]+\.cyberdrop\.[a-z]+/[^"]+)"'
            for index, file_url in enumerate(re.findall(cdn_pattern, html)):
                self._delay_between_items()
                # Last path segment without query string; index-based fallback
                filename = file_url.split('/')[-1].split('?')[0] or f'{index:03d}.bin'
                target = save_dir / self._sanitize_filename(filename)
                try:
                    await self._download_file(session, file_url, target)
                except Exception as exc:
                    self.log(f"Failed to download {filename}: {exc}", 'warning')
                else:
                    saved.append(str(target))

    return {'success': True, 'files': saved, 'error': None}
async def _download_bunkr(self, url: str, save_dir: Path) -> Dict:
    """
    Download a Bunkr album or single file, with CDN fallback.

    URL shapes:
        https://bunkr.sk/a/ALBUMID                - album page
        https://bunkr.sk/f/FILEID or /v/VIDEOID   - single file page

    Each file page is resolved to a direct URL plus file UUID via
    ``_get_bunkr_direct_url_with_uuid``; the UUID lets ``_download_file``
    retry on the other hosts in ``BUNKR_CDNS``.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}, plus a
        'failed' list when album items could not be fetched. ``success``
        is True only if at least one file was saved.
    """
    saved = []
    failures = []
    session_timeout = aiohttp.ClientTimeout(total=600)  # Increased for large files
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    async with aiohttp.ClientSession(timeout=session_timeout, headers=browser_headers) as session:
        parsed = urlparse(url)
        segments = parsed.path.strip('/').split('/')
        is_album = len(segments) >= 2 and segments[0] == 'a'

        if not is_album:
            # Single file page
            direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, url)
            if not direct_url:
                return {'success': False, 'files': [], 'error': 'Could not get direct download URL'}

            filename = direct_url.split('/')[-1].split('?')[0] or 'download.bin'
            target = save_dir / self._sanitize_filename(filename)
            await self._download_file(session, direct_url, target,
                                      try_cdn_fallback=True, file_uuid=file_uuid)
            saved.append(str(target))
        else:
            # Album page
            async with session.get(url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                html = await resp.text()

            # Album entries are relative /f/ links
            relative_links = re.findall(r'href="(/f/[^"]+)"', html)
            self.log(f"Found {len(relative_links)} files in Bunkr album", 'info')

            for index, relative in enumerate(relative_links):
                self._delay_between_items()
                file_url = f"https://{parsed.netloc}{relative}"

                direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, file_url)
                if not direct_url:
                    self.log(f"Could not get direct URL for {file_url}", 'warning')
                    failures.append(file_url)
                    continue

                filename = direct_url.split('/')[-1].split('?')[0] or f'{index:03d}.bin'
                target = save_dir / self._sanitize_filename(filename)
                try:
                    await self._download_file(session, direct_url, target,
                                              try_cdn_fallback=True, file_uuid=file_uuid)
                    saved.append(str(target))
                    self.log(f"Downloaded: {filename}", 'info')
                except Exception as exc:
                    self.log(f"Failed to download {filename}: {exc}", 'warning')
                    failures.append(filename)

    outcome = {'success': len(saved) > 0, 'files': saved, 'error': None}
    if failures:
        outcome['failed'] = failures
        outcome['error'] = f'{len(failures)} files failed to download'
    return outcome
async def _get_bunkr_direct_url_with_uuid(self, session: aiohttp.ClientSession, page_url: str) -> tuple:
    """
    Resolve a Bunkr file page to ``(direct_url, file_uuid)``.

    The page HTML is searched for the file's ``<uuid>.<ext>`` name and for
    an embedded ``*.bunkr.ru`` link. The first embedded link that answers
    a HEAD probe wins; otherwise every host in ``BUNKR_CDNS`` is probed
    with the UUID.

    Returns:
        (url, uuid) on success, (None, uuid) when no reachable URL was
        found, (None, None) when the page could not be fetched or an
        error occurred.
    """
    try:
        async with session.get(page_url) as resp:
            if resp.status != 200:
                return None, None
            html = await resp.text()

        # File UUID: explicit data-v attribute first, then any UUID-like name
        uuid_patterns = (
            r'data-v="([a-f0-9-]{36}\.[a-z0-9]+)"',
            r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\.[a-z0-9]+)',
        )
        uuid_hits = (re.search(pattern, html) for pattern in uuid_patterns)
        file_uuid = next((hit.group(1) for hit in uuid_hits if hit), None)

        # CDN URL already present in the page
        cdn_patterns = (
            r'href="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            r'src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            r'data-src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
        )
        for pattern in cdn_patterns:
            found = re.search(pattern, html)
            if found and await self._check_url_accessible(session, found.group(1)):
                return found.group(1), file_uuid

        # Fall back to probing the known CDN hosts with the UUID
        if file_uuid:
            self.log(f"Found file UUID: {file_uuid}, trying CDNs...", 'debug')
            for cdn in self.BUNKR_CDNS:
                candidate = f"https://{cdn}/{file_uuid}"
                if await self._check_url_accessible(session, candidate):
                    self.log(f"Found working CDN: {cdn}", 'debug')
                    return candidate, file_uuid

        return None, file_uuid
    except Exception as exc:
        self.log(f"Error getting Bunkr direct URL: {exc}", 'warning')
        return None, None
async def _check_url_accessible(self, session: aiohttp.ClientSession, url: str) -> bool:
    """
    Probe ``url`` with a HEAD request (redirects followed, 10 s budget).

    Returns True only for a final status of 200; any error counts as
    "not accessible".
    """
    try:
        probe = session.head(url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10))
        async with probe as resp:
            reachable = resp.status == 200
    except Exception:
        return False
    return reachable
async def _download_fileditch(self, url: str, save_dir: Path) -> Dict:
    """
    Download from FileDitch (Cloudflare-protected).

    Expected URL shape: .../file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4
    The target filename is the last path segment of the ``f`` parameter.
    Cloudflare clearance cookies and the matching User-Agent are obtained
    through ``CloudflareHandler`` (FlareSolverr at localhost:8191) and
    then used for the actual download.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}.
        A failed download propagates as an exception.
    """
    params = parse_qs(urlparse(url).query)
    file_path = params.get('f', [''])[0]
    if not file_path:
        return {'success': False, 'files': [], 'error': 'Invalid FileDitch URL - no file parameter'}

    filename = file_path.rsplit('/', 1)[-1]
    if not filename:
        return {'success': False, 'files': [], 'error': 'Could not extract filename from URL'}

    save_path = save_dir / self._sanitize_filename(filename)

    # Imported lazily: only needed once the URL has been validated.
    from modules.cloudflare_handler import CloudflareHandler

    # Use CloudflareHandler to get cookies via FlareSolverr
    cf_handler = CloudflareHandler(
        module_name='FileDitch',
        flaresolverr_url='http://localhost:8191/v1',
        flaresolverr_enabled=True,
    )

    self.log('Bypassing Cloudflare for FileDitch via FlareSolverr...', 'info')
    # NOTE(review): this call is not awaited, so it presumably blocks the
    # event loop while FlareSolverr solves the challenge - confirm and
    # consider moving it to an executor.
    if not cf_handler.get_cookies_via_flaresolverr(url):
        return {'success': False, 'files': [], 'error': 'Failed to bypass Cloudflare for FileDitch'}

    cookies = cf_handler.get_cookies_dict()
    user_agent = cf_handler.get_user_agent()

    # Download with the obtained cookies
    timeout = aiohttp.ClientTimeout(total=3600)
    headers = {'User-Agent': user_agent or 'Mozilla/5.0'}
    # Hand the cookies to the session constructor. Registering them with
    # CookieJar.update_cookies(..., response_url=<str>) fails because
    # aiohttp expects a yarl.URL there.
    async with aiohttp.ClientSession(timeout=timeout, cookies=cookies, headers=headers) as session:
        await self._download_file(session, url, save_path, headers=headers)

    return {'success': True, 'files': [str(save_path)], 'error': None}
async def _download_file(self, session: aiohttp.ClientSession, url: str,
                         save_path: Path, headers: Dict = None,
                         try_cdn_fallback: bool = False, file_uuid: str = None) -> None:
    """
    Download a single file with streaming and optional CDN fallback.

    The body is streamed into a ``<name>.part`` sibling and renamed to
    ``save_path`` only after the transfer has completed, so a failed or
    interrupted attempt never leaves a truncated file under the final
    name.

    Args:
        session: Open aiohttp session used for the request(s).
        url: Primary download URL.
        save_path: Final destination; its parent directory is created.
        headers: Optional per-request headers.
        try_cdn_fallback: When True and ``file_uuid`` is set, every host
            in ``BUNKR_CDNS`` is tried after ``url``.
        file_uuid: ``<uuid>.<ext>`` name of the file on the Bunkr CDNs.

    Raises:
        Exception: when no candidate URL yields a complete download.
    """
    save_path.parent.mkdir(parents=True, exist_ok=True)
    part_path = save_path.with_name(save_path.name + '.part')

    urls_to_try = [url]
    # If CDN fallback enabled and we have a file UUID, add alternate CDNs
    if try_cdn_fallback and file_uuid:
        for cdn in self.BUNKR_CDNS:
            alt_url = f"https://{cdn}/{file_uuid}"
            if alt_url != url:
                urls_to_try.append(alt_url)

    last_error = None
    for try_url in urls_to_try:
        try:
            self.log(f"Downloading: {save_path.name} from {try_url[:60]}...", 'info')
            async with session.get(try_url, headers=headers) as resp:
                if resp.status != 200:
                    last_error = f"HTTP {resp.status}"
                    self.log(f"Download failed: {save_path.name} - {last_error}", 'warning')
                    continue

                total_size = int(resp.headers.get('content-length', 0))
                downloaded = 0
                last_log_pct = 0

                with open(part_path, 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):  # 64KB chunks
                        f.write(chunk)
                        downloaded += len(chunk)

                        # Log and callback progress every 2%
                        if total_size > 0:
                            pct = int(downloaded * 100 / total_size)
                            if pct >= last_log_pct + 2:
                                self.log(f"  {save_path.name}: {pct}% ({downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB)", 'info')
                                last_log_pct = pct
                                # Call progress callback if provided
                                if self.progress_callback:
                                    try:
                                        self.progress_callback(downloaded, total_size, save_path.name)
                                    except Exception:
                                        pass  # Don't fail download due to callback error

            # Transfer finished: publish the file under its final name
            part_path.replace(save_path)
            self.log(f"Downloaded: {save_path.name} ({downloaded // (1024*1024)}MB)", 'info')
            return  # Success
        except Exception as e:
            last_error = str(e)
            self.log(f"Download error: {save_path.name} - {last_error}", 'warning')
            # Try next CDN
            continue

    # Every attempt failed: drop whatever partial data was written
    try:
        part_path.unlink()
    except OSError:
        pass

    raise Exception(f"Download failed after trying {len(urls_to_try)} URLs: {last_error}")
def _sanitize_filename(self, filename: str) -> str:
    """
    Make ``filename`` safe to use as a single path component.

    Characters that are invalid on common filesystems (``<>:"/\\|?*`` and
    ASCII control characters) are dropped, then leading/trailing dots and
    spaces are trimmed. An empty input or result becomes 'download.bin'.
    """
    fallback = 'download.bin'
    if not filename:
        return fallback

    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename).strip('. ')
    return cleaned if cleaned else fallback
@classmethod
def get_supported_domains(cls) -> List[str]:
    """Return every domain listed in ``SUPPORTED_HOSTS`` as one flat list."""
    return [
        domain
        for host_domains in cls.SUPPORTED_HOSTS.values()
        for domain in host_domains
    ]

View File

@@ -0,0 +1,171 @@
"""
Filename parser for extracting dates and metadata from Fansly/paid content filenames.
Supports:
1. Fansly snowflake IDs: 871257582885416960.mp4
2. Embedded date format: 2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP.mp4
3. Date-prefixed files: 2022-07-08.mp4 or 2022-07-08_video.mp4
"""
import re
from datetime import datetime, timezone
from typing import Optional, Dict, Tuple
from pathlib import Path
# Fansly epoch (milliseconds since the Unix epoch), calibrated from known
# files; reference sample: 513099759796367360 = 2023-05-11 15:51 UTC.
# NOTE(review): with this value the reference ID decodes to about 14:37 UTC,
# so the calibration looks approximate - confirm.
FANSLY_EPOCH_MS = 1561483337101


def decode_fansly_snowflake(snowflake_id: str) -> Optional[datetime]:
    """
    Turn a Fansly snowflake ID into a UTC datetime.

    The ID is treated like a Twitter-style snowflake: dropping the low
    22 bits leaves a millisecond offset from ``FANSLY_EPOCH_MS``.

    Returns:
        Timezone-aware UTC datetime, or None when the value is not an
        integer or falls outside the representable range.
    """
    try:
        millis = (int(snowflake_id) >> 22) + FANSLY_EPOCH_MS
        decoded = datetime.fromtimestamp(millis / 1000, tz=timezone.utc)
    except (ValueError, OverflowError, OSError):
        return None
    return decoded
def parse_filename(filename: str) -> Dict:
    """
    Extract date/ID metadata from a single filename.

    The extension is ignored. Detection is attempted in this order and the
    first hit wins:
        1. embedded ``YYYY-MM-DD_at_HH-MM_id_<snowflake>`` stamp
        2. ``YYYY-MM-DD`` / ``YYYY_MM_DD`` prefix (time defaults to noon)
        3. name consisting only of a snowflake ID (optional ``_N`` suffix)
        4. snowflake ID anywhere in the name

    Returns:
        {
            'original_filename': str,
            'detected_date': datetime or None,   # always UTC
            'fansly_id': str or None,
            'date_source': str or None,  # 'snowflake', 'embedded', 'prefix', None
            'confidence': str,  # 'high', 'medium', 'low'
        }
    """
    info = {
        'original_filename': filename,
        'detected_date': None,
        'fansly_id': None,
        'date_source': None,
        'confidence': 'low',
    }

    def dated(when, source, confidence, fansly_id=None):
        # Record a successful detection and hand back the result dict
        info.update(
            detected_date=when,
            fansly_id=fansly_id,
            date_source=source,
            confidence=confidence,
        )
        return info

    def plausible(when):
        # Sanity check: date should be between 2020 and 2030
        return bool(when) and 2020 <= when.year <= 2030

    stem = Path(filename).stem

    # 1) Embedded stamp, e.g.
    #    2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP-YcNs55W9.mp4
    #    2026-01-24_at_06-22_id_871257582885416960_hash2_4547ab53....mp4
    #    separators may also be spaces: 2023 05 11_at_15 51_id_5130...
    embedded = re.search(
        r'(\d{4})[-_ ](\d{2})[-_ ](\d{2})[-_ ]?at[-_ ](\d{2})[-_ ](\d{2})[-_ ]?id[-_ ](\d{15,20})',
        stem,
        re.IGNORECASE,
    )
    if embedded:
        year, month, day, hour, minute, fansly_id = embedded.groups()
        try:
            stamp = datetime(
                int(year), int(month), int(day),
                int(hour), int(minute), 0,
                tzinfo=timezone.utc,
            )
        except ValueError:
            pass  # impossible calendar value - try the next pattern
        else:
            return dated(stamp, 'embedded', 'high', fansly_id)

    # 2) Date prefix, e.g. 2022-07-08.mp4 or 2022-07-08_video.mp4
    prefixed = re.match(r'^(\d{4})[-_](\d{2})[-_](\d{2})(?:[_\-\s]|$)', stem)
    if prefixed:
        try:
            # No time information available: default to noon
            stamp = datetime(*map(int, prefixed.groups()), 12, 0, 0, tzinfo=timezone.utc)
        except ValueError:
            pass
        else:
            return dated(stamp, 'prefix', 'high')

    # 3) Pure snowflake ID, e.g. 871257582885416960.mp4 (15-20 digits)
    whole = re.match(r'^(\d{15,20})(?:_\d+)?$', stem)
    if whole:
        candidate = whole.group(1)
        when = decode_fansly_snowflake(candidate)
        if plausible(when):
            return dated(when, 'snowflake', 'high', candidate)

    # 4) Snowflake ID somewhere in the name, e.g. video_871257582885416960_hd.mp4
    for candidate in re.findall(r'(\d{15,20})', stem):
        when = decode_fansly_snowflake(candidate)
        if plausible(when):
            return dated(when, 'snowflake', 'medium', candidate)

    return info
def parse_filenames(filenames: list) -> Dict:
    """
    Run ``parse_filename`` over a batch and summarise the detected dates.

    Returns:
        {
            'files': [parsed result for each file],
            'earliest_date': datetime or None,
            'latest_date': datetime or None,
            'suggested_date': datetime or None,  # currently the earliest date
            'has_dates': bool,
        }
    """
    parsed = [parse_filename(name) for name in filenames]
    found_dates = [entry['detected_date'] for entry in parsed if entry['detected_date']]

    if found_dates:
        earliest, latest = min(found_dates), max(found_dates)
    else:
        earliest = latest = None

    return {
        'files': parsed,
        'earliest_date': earliest,
        'latest_date': latest,
        'suggested_date': earliest,  # Use earliest as default
        'has_dates': len(found_dates) > 0,
    }
def format_date_for_display(dt: datetime) -> str:
    """
    Format datetime for display: 'May 11, 2023 at 3:51 PM'.

    Returns an empty string when ``dt`` is None.
    """
    if dt is None:
        return ''
    # The unpadded 12-hour value is computed by hand: the '%-I' directive
    # is a platform-specific strftime extension and is not portable.
    hour_12 = dt.hour % 12 or 12
    return f"{dt.strftime('%b %d, %Y')} at {hour_12}:{dt.strftime('%M %p')}"
def format_date_for_input(dt: datetime) -> Tuple[str, str]:
    """
    Split a datetime into the strings HTML date/time inputs expect.

    Returns:
        ('YYYY-MM-DD', 'HH:MM'), or ('', '') when ``dt`` is None.
    """
    if dt is None:
        return ('', '')
    date_part = dt.strftime('%Y-%m-%d')
    time_part = dt.strftime('%H:%M')
    return (date_part, time_part)

View File

@@ -0,0 +1,14 @@
"""Backwards-compatibility shim — use xenforo_forum_client instead."""
from .xenforo_forum_client import XenForoForumClient
class HQCelebCornerClient(XenForoForumClient):
    """Legacy alias for XenForoForumClient, pre-configured for HQCelebCorner."""

    def __init__(self, log_callback=None):
        """
        Build a XenForoForumClient bound to the HQCelebCorner forum.

        Args:
            log_callback: Forwarded unchanged to ``XenForoForumClient``.
        """
        # All site-specific settings are fixed here; note that the cookie
        # file is an absolute, deployment-specific path.
        super().__init__(
            service_id='hqcelebcorner',
            base_url='https://www.hqcelebcorner.net',
            cookie_path='/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
            log_callback=log_callback,
        )

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,312 @@
"""
Dataclass models for Paid Content feature
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
@dataclass
class Attachment:
    """Represents a file attachment from a post"""
    name: str
    server_path: str
    file_type: Optional[str] = None
    extension: Optional[str] = None
    download_url: Optional[str] = None
    file_size: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None
    duration: Optional[int] = None
    needs_quality_recheck: bool = False
    is_preview: bool = False

    @classmethod
    def from_api(cls, data: Dict, base_url: str = '') -> 'Attachment':
        """
        Create Attachment from API response.

        Args:
            data: Attachment object with 'name' and 'path' entries; missing
                or null values are treated as empty strings.
            base_url: Server base URL; when given together with a path the
                download URL becomes ``{base_url}/data{path}``.

        Returns:
            Attachment whose ``file_type`` is 'image', 'video', 'archive',
            'document' or 'unknown', derived from the extension of ``name``.
        """
        # `or ''` also covers keys that are present but null
        name = data.get('name') or ''
        path = data.get('path') or ''

        # Detect file type from extension
        ext = name.rsplit('.', 1)[-1].lower() if '.' in name else ''

        image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic'}
        video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
        archive_exts = {'zip', 'rar', '7z', 'tar', 'gz'}
        document_exts = {'pdf', 'doc', 'docx', 'txt'}

        if ext in image_exts:
            file_type = 'image'
        elif ext in video_exts:
            file_type = 'video'
        elif ext in archive_exts:
            file_type = 'archive'
        elif ext in document_exts:
            file_type = 'document'
        else:
            file_type = 'unknown'

        return cls(
            name=name,
            server_path=path,
            file_type=file_type,
            extension=ext if ext else None,
            download_url=f"{base_url}/data{path}" if base_url and path else None
        )

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        'needs_quality_recheck' is only included (as 1) when the flag is
        set; 'is_preview' is not part of the stored dictionary.
        """
        d = {
            'name': self.name,
            'server_path': self.server_path,
            'file_type': self.file_type,
            'extension': self.extension,
            'download_url': self.download_url,
            'file_size': self.file_size,
            'width': self.width,
            'height': self.height,
            'duration': self.duration
        }
        if self.needs_quality_recheck:
            d['needs_quality_recheck'] = 1
        return d
@dataclass
class Post:
    """Represents a post from a creator"""
    post_id: str
    service_id: str
    platform: str
    creator_id: str
    title: Optional[str] = None
    content: Optional[str] = None
    published_at: Optional[str] = None
    added_at: Optional[str] = None
    edited_at: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)
    embed_urls: List[str] = field(default_factory=list)
    is_pinned: bool = False
    pinned_at: Optional[str] = None
    auto_tags: List[str] = field(default_factory=list)  # Tag names to auto-apply on sync
    tagged_users: List[str] = field(default_factory=list)  # Instagram users tagged in the post

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, creator_id: str, base_url: str = '') -> 'Post':
        """
        Create Post from API response.

        Args:
            data: Post object as returned by the API.
            service_id: Service the post was fetched from.
            platform: Platform the creator belongs to.
            creator_id: Platform-specific creator ID.
            base_url: Server base URL, forwarded to ``Attachment.from_api``.

        Returns:
            Post with attachments collected from both the 'attachments'
            list and the 'file' field, and with HTML stripped from the
            content.
        """
        # `or []` also covers an 'attachments' key that is present but null
        attachments = [
            Attachment.from_api(att_data, base_url)
            for att_data in (data.get('attachments') or [])
        ]

        # Also check file field (some APIs use this instead of attachments)
        file_data = data.get('file')
        if file_data:
            if isinstance(file_data, dict):
                attachments.append(Attachment.from_api(file_data, base_url))
            elif isinstance(file_data, str):
                attachments.append(Attachment(
                    name=file_data.split('/')[-1] if '/' in file_data else file_data,
                    server_path=file_data
                ))

        # Dates are passed through as received
        published = data.get('published')
        added = data.get('added')
        edited = data.get('edited')

        # Content: use 'content' if available, fallback to 'substring' (list endpoint returns truncated)
        content = data.get('content') or data.get('substring') or ''
        # Single post endpoint returns HTML content (e.g. <p>text</p>), strip tags
        if content and '<' in content:
            import re
            content = re.sub(r'<br\s*/?>', '\n', content)
            content = re.sub(r'</p>\s*<p>', '\n\n', content)
            content = re.sub(r'<[^>]+>', '', content)
            content = content.strip()

        title = data.get('title')

        # OnlyFans posts on Coomer have the post text in 'title' and empty 'content'.
        # Copy title to content and clear title (OF posts don't have real titles).
        if not content and title:
            content = title
            title = None

        return cls(
            post_id=str(data.get('id', '')),
            service_id=service_id,
            platform=platform,
            creator_id=creator_id,
            title=title,
            content=content,
            published_at=published,
            added_at=added,
            edited_at=edited,
            attachments=attachments,
            embed_urls=data.get('embed', []) or []
        )

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        Attachments and embeds are represented by counts/flags only.
        """
        return {
            'post_id': self.post_id,
            'title': self.title,
            'content': self.content,
            'published_at': self.published_at,
            'added_at': self.added_at,
            'edited_at': self.edited_at,
            'has_attachments': 1 if self.attachments else 0,
            'attachment_count': len(self.attachments),
            'embed_count': len(self.embed_urls),
            'is_pinned': 1 if self.is_pinned else 0,
            'pinned_at': self.pinned_at
        }
@dataclass
class Message:
    """A single chat message exchanged with a creator"""
    message_id: str
    platform: str
    service_id: str
    creator_id: str  # Platform-specific creator ID
    text: Optional[str] = None
    sent_at: Optional[str] = None
    is_from_creator: bool = True
    is_tip: bool = False
    tip_amount: Optional[float] = None
    price: Optional[float] = None
    is_free: bool = True
    is_purchased: bool = False
    reply_to_message_id: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """
        Flatten the message into a dict for database storage.

        Booleans are stored as 0/1 and attachments are reduced to a flag
        plus a count.
        """
        def as_flag(value):
            return 1 if value else 0

        attachment_total = len(self.attachments)
        row = {
            'message_id': self.message_id,
            'text': self.text,
            'sent_at': self.sent_at,
            'is_from_creator': as_flag(self.is_from_creator),
            'is_tip': as_flag(self.is_tip),
            'tip_amount': self.tip_amount,
            'price': self.price,
            'is_free': as_flag(self.is_free),
            'is_purchased': as_flag(self.is_purchased),
            'has_attachments': as_flag(self.attachments),
            'attachment_count': attachment_total,
            'reply_to_message_id': self.reply_to_message_id,
        }
        return row
@dataclass
class Creator:
    """A creator profile as listed on Coomer/Kemono"""
    creator_id: str
    service_id: str
    platform: str
    username: str
    display_name: Optional[str] = None
    profile_image_url: Optional[str] = None
    banner_image_url: Optional[str] = None
    bio: Optional[str] = None
    post_count: int = 0

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, base_url: str = None) -> 'Creator':
        """
        Build a Creator from an API response.

        When the response carries no 'profile_image' / 'banner_image', the
        URLs are derived from ``base_url`` as
        ``https://img.<host>/icons|banners/<platform>/<creator_id>``, with
        a ``.party`` host rewritten to ``.st``.
        """
        creator_id = str(data.get('id', ''))

        # Image host: img.<netloc>, using .st instead of .party
        # (coomer.party redirects to coomer.st)
        img_domain = None
        if base_url and creator_id:
            from urllib.parse import urlparse
            img_domain = "img." + urlparse(base_url).netloc.replace('.party', '.st')

        def asset_url(kind):
            # kind is the image folder on the image host: 'icons' or 'banners'
            return f"https://{img_domain}/{kind}/{platform}/{creator_id}"

        profile_image_url = data.get('profile_image')
        if not profile_image_url and img_domain:
            profile_image_url = asset_url('icons')

        banner_image_url = data.get('banner_image')
        if not banner_image_url and img_domain:
            banner_image_url = asset_url('banners')

        return cls(
            creator_id=creator_id,
            service_id=service_id,
            platform=platform,
            username=data.get('name', ''),
            display_name=data.get('name'),
            profile_image_url=profile_image_url,
            banner_image_url=banner_image_url,
            post_count=data.get('post_count', 0),
        )

    def to_dict(self) -> Dict:
        """Flatten the creator into a dict for database storage"""
        columns = (
            'service_id',
            'platform',
            'creator_id',
            'username',
            'display_name',
            'profile_image_url',
            'banner_image_url',
            'bio',
            'post_count',
        )
        return {column: getattr(self, column) for column in columns}
@dataclass
class SyncResult:
    """Outcome of syncing one creator"""
    success: bool
    new_posts: int = 0
    new_attachments: int = 0
    downloaded_files: int = 0
    failed_files: int = 0
    skipped_files: int = 0
    error: Optional[str] = None
    downloaded_file_info: Optional[List[Dict]] = None  # List of {file_path, filename, source, content_type}

    def to_dict(self) -> Dict:
        """Return the counters and status as a dict (``downloaded_file_info`` is left out)."""
        exported = (
            'success',
            'new_posts',
            'new_attachments',
            'downloaded_files',
            'failed_files',
            'skipped_files',
            'error',
        )
        return {key: getattr(self, key) for key in exported}
@dataclass
class DownloadResult:
    """Outcome of downloading one file"""
    success: bool
    file_path: Optional[str] = None
    file_hash: Optional[str] = None
    file_size: Optional[int] = None
    error: Optional[str] = None
    is_duplicate: bool = False

    def to_dict(self) -> Dict:
        """Return all fields as a plain dict."""
        exported = (
            'success',
            'file_path',
            'file_hash',
            'file_size',
            'error',
            'is_duplicate',
        )
        return {key: getattr(self, key) for key in exported}

View File

@@ -0,0 +1,729 @@
"""
OnlyFans Direct API Client
Downloads content directly from the OnlyFans API using browser-extracted
credentials and dynamic request signing.
"""
import asyncio
import aiohttp
import re
from datetime import datetime
from typing import List, Optional, Dict, Any, Callable
from urllib.parse import urlparse, urlencode
from modules.base_module import LoggingMixin, RateLimitMixin
from .models import Post, Attachment, Message
from .onlyfans_signing import OnlyFansSigner
class OnlyFansClient(LoggingMixin, RateLimitMixin):
    """
    Client that talks to the OnlyFans web API directly.

    API notes:
    - Base URL: https://onlyfans.com/api2/v2
    - Auth: browser-extracted credentials (sess, auth_id, x-bc, User-Agent)
    - Signing: every request carries dynamic sign/time/app-token headers
    - GET /users/me - Verify auth
    - GET /users/{username} - Get user profile
    - GET /users/{user_id}/posts?limit=50&offset={offset} - Get posts (paginated)
    """

    BASE_URL = "https://onlyfans.com/api2/v2"
    SERVICE_ID = "onlyfans_direct"
    PLATFORM = "onlyfans"

    def __init__(
        self,
        auth_config: Dict[str, str],
        signing_url: Optional[str] = None,
        log_callback: Optional[Callable] = None,
    ):
        """
        Set up logging, rate limiting and request signing; no network I/O happens here.

        Args:
            auth_config: Dict with keys: sess, auth_id, auth_uid (optional), x_bc, user_agent
            signing_url: Optional custom URL for the signing rules
            log_callback: Optional logging callback
        """
        self._init_logger('PaidContent', log_callback, default_module='OnlyFansDirect')

        # More conservative rate limiting than Fansly (OF is stricter)
        rate_limits = {
            'min_delay': 1.5,
            'max_delay': 3.0,
            'batch_delay_min': 3,
            'batch_delay_max': 6,
        }
        self._init_rate_limiter(**rate_limits)

        self.auth_config = auth_config
        # The HTTP session is created lazily by _get_session()
        self._session: Optional[aiohttp.ClientSession] = None
        self._signer = OnlyFansSigner(rules_url=signing_url)
async def _get_session(self) -> aiohttp.ClientSession:
    """Return the shared aiohttp session, building one with OnlyFans headers when none is open."""
    current = self._session
    if current is not None and not current.closed:
        return current

    auth = self.auth_config
    # Cookie header: sess + auth_id, plus auth_uid_<auth_id> when configured
    cookie_parts = [f"sess={auth['sess']}", f"auth_id={auth['auth_id']}"]
    auth_uid = auth.get('auth_uid')
    if auth_uid:
        cookie_parts.append(f"auth_uid_{auth['auth_id']}={auth_uid}")

    default_headers = {
        'Accept': 'application/json, text/plain, */*',
        'User-Agent': auth.get('user_agent', ''),
        'x-bc': auth.get('x_bc', ''),
        'Cookie': '; '.join(cookie_parts),
        'Origin': 'https://onlyfans.com',
        'Referer': 'https://onlyfans.com/',
    }
    self._session = aiohttp.ClientSession(
        headers=default_headers,
        timeout=aiohttp.ClientTimeout(total=60),
    )
    return self._session
async def _sign_request(self, endpoint: str) -> Dict[str, str]:
"""
Compute signing headers for an API request.
Args:
endpoint: API path (e.g. "/users/me") - will be prefixed with /api2/v2
Returns:
Dict with sign, time, app-token, user-id headers
"""
user_id = self.auth_config.get('auth_id', '0')
# Sign with full URL path (matching OF-Scraper)
full_path = f"/api2/v2{endpoint}"
sign_headers = await self._signer.sign(full_path, user_id)
sign_headers['user-id'] = user_id
return sign_headers
async def _api_request(self, endpoint: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """
    Make a signed GET request to the OnlyFans API.

    Up to three attempts are made. HTTP 429 and request timeouts are retried;
    the request is re-signed before each retry because the signature embeds
    the current timestamp. The 429 wait is ``Retry-After * attempt`` seconds
    (linear backoff), capped at 120 seconds. A ``Retry-After`` value that is
    not an integer (the header may also be an HTTP-date) falls back to 30.

    Args:
        endpoint: API path relative to BASE_URL (e.g. "/users/me")
        params: Optional query parameters; they are included in the signed path

    Returns:
        Parsed JSON response on HTTP 200, otherwise None (401, 404, any other
        status, retries exhausted, or a request error)
    """
    default_retry_after = 30
    max_retries = 3

    session = await self._get_session()

    # Include query params in the signing path (OF-Scraper does this)
    sign_endpoint = endpoint
    if params:
        sign_endpoint = f"{endpoint}?{urlencode(params)}"
    sign_headers = await self._sign_request(sign_endpoint)
    url = f"{self.BASE_URL}{endpoint}"

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            async with session.get(url, params=params, headers=sign_headers) as resp:
                if resp.status == 200:
                    return await resp.json()
                elif resp.status == 401:
                    self.log("OnlyFans auth failed (401) - credentials may be expired", 'error')
                    return None
                elif resp.status == 429:
                    if is_last_attempt:
                        # No retry will follow, so waiting would only delay the failure
                        self.log(f"Rate limited (429), giving up after {max_retries} attempts: {endpoint}", 'warning')
                        return None
                    try:
                        retry_after = int(resp.headers.get('Retry-After', default_retry_after))
                    except (TypeError, ValueError):
                        # Non-numeric header (e.g. an HTTP-date): use the default delay
                        retry_after = default_retry_after
                    wait = min(retry_after * (attempt + 1), 120)
                    self.log(f"Rate limited (429), waiting {wait}s (attempt {attempt + 1}/{max_retries})", 'warning')
                    await asyncio.sleep(wait)
                    # Refresh signing headers for retry (timestamp changes)
                    sign_headers = await self._sign_request(sign_endpoint)
                    continue
                elif resp.status == 404:
                    self.log(f"Not found (404): {endpoint}", 'debug')
                    return None
                else:
                    text = await resp.text()
                    self.log(f"API error: HTTP {resp.status} for {endpoint}: {text[:200]}", 'warning')
                    return None
        except asyncio.TimeoutError:
            self.log(f"Request timeout for {endpoint} (attempt {attempt + 1})", 'warning')
            if not is_last_attempt:
                await asyncio.sleep(5 * (attempt + 1))
                sign_headers = await self._sign_request(sign_endpoint)
                continue
            return None
        except Exception as e:
            self.log(f"Request error for {endpoint}: {e}", 'error')
            return None
    return None
@staticmethod
def _strip_html(text: str) -> str:
"""Strip HTML tags and convert common entities to plain text"""
if not text:
return ''
text = re.sub(r'<br\s*/?>', '\n', text)
text = re.sub(r'<[^>]+>', '', text)
text = text.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&#x27;', "'").replace('&quot;', '"')
return text.strip()
async def close(self):
    """Close the aiohttp session if one is currently open, then drop the reference."""
    session = self._session
    if not session or session.closed:
        return
    await session.close()
    self._session = None
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
async def check_auth(self) -> Dict[str, Any]:
    """
    Verify the configured credentials by requesting /users/me.

    Returns:
        {'valid': True, 'user_id', 'username', 'name'} when the response
        carries an id, otherwise {'valid': False, 'error': <message>}
    """
    self._delay_between_items()
    try:
        me = await self._api_request("/users/me")
        if not (me and me.get('id')):
            return {'valid': False, 'error': 'Invalid credentials or unexpected response'}
        return {
            'valid': True,
            'user_id': str(me['id']),
            'username': me.get('username', ''),
            'name': me.get('name', ''),
        }
    except Exception as e:
        self.log(f"Error checking auth: {e}", 'error')
        return {'valid': False, 'error': str(e)}
async def get_user_info(self, username: str) -> Optional[Dict[str, Any]]:
    """
    Fetch a user profile and normalize it.

    Args:
        username: The OnlyFans username

    Returns:
        Dict with user_id, username, display_name, avatar_url, banner_url,
        bio, join_date (first 10 characters of joinDate) and posts_count,
        or None when the user is missing or the request fails
    """
    self._delay_between_items()
    try:
        profile = await self._api_request(f"/users/{username}")
        if not (profile and profile.get('id')):
            self.log(f"User not found: {username}", 'warning')
            return None

        raw_bio = profile.get('rawAbout') or profile.get('about') or ''
        joined = (profile.get('joinDate') or '')[:10]
        return {
            'user_id': str(profile['id']),
            'username': profile.get('username', username),
            'display_name': profile.get('name', ''),
            'avatar_url': profile.get('avatar'),
            'banner_url': profile.get('header'),
            'bio': self._strip_html(raw_bio),
            'join_date': joined or None,
            'posts_count': profile.get('postsCount', 0),
        }
    except Exception as e:
        self.log(f"Error getting user info for {username}: {e}", 'error')
        return None
async def get_single_post(self, post_id: str) -> Optional[Post]:
    """
    Fetch a single post by its OnlyFans post ID.

    The creator id is taken from the post's ``author.id`` and falls back to
    ``authorId``. A missing, null or non-dict ``author`` is tolerated.

    Args:
        post_id: The OnlyFans post ID

    Returns:
        Post object, or None when the post is not found or cannot be parsed
    """
    self._delay_between_items()
    data = await self._api_request(f"/posts/{post_id}")
    if not data:
        self.log(f"Post {post_id} not found", 'warning')
        return None

    # dict.get's default only applies to a missing key, so a present-but-null
    # 'author' must be handled explicitly.
    author = data.get('author')
    if not isinstance(author, dict):
        author = {}
    author_id = author.get('id') or data.get('authorId') or ''
    return self._parse_post(data, str(author_id))
async def get_posts(
    self,
    user_id: str,
    username: str,
    since_date: Optional[str] = None,
    until_date: Optional[str] = None,
    days_back: Optional[int] = None,
    max_posts: Optional[int] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[Post]:
    """
    Fetch posts from a creator's timeline using offset-based pagination.

    Posts are requested newest first, so paging stops as soon as a post older
    than the ``since`` cutoff is seen. Date filters are compared as naive UTC
    datetimes: timezone-aware values are converted to UTC first, naive values
    are used as given. After a full pass over the timeline, pinned posts are
    requested separately and appended if not already present. That pinned
    pass is skipped when the method returns early (cutoff or ``max_posts``).

    Args:
        user_id: The OnlyFans numeric user ID
        username: The username (for logging/reference)
        since_date: Only fetch posts after this date (ISO format)
        until_date: Only fetch posts before this date (ISO format)
        days_back: Fetch posts from the last N days (overrides since_date)
        max_posts: Maximum number of posts to fetch
        progress_callback: Called with (page, total_posts) during fetching

    Returns:
        List of Post objects
    """
    from datetime import timedelta, timezone

    def to_naive_utc(value: str) -> datetime:
        """Parse an ISO-8601 string; aware values are shifted to UTC, then tzinfo is dropped."""
        parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(timezone.utc)
        return parsed.replace(tzinfo=None)

    self.log(f"Fetching posts for {username} (user_id: {user_id})", 'info')

    # Resolve the date filters once, up front
    since_dt = None
    until_dt = None
    if days_back:
        # Post timestamps are compared in UTC, so the cutoff must be UTC as well
        since_dt = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=days_back)
    elif since_date:
        try:
            since_dt = to_naive_utc(since_date)
        except (ValueError, TypeError, AttributeError) as e:
            self.log(f"Ignoring unparseable since_date {since_date!r}: {e}", 'warning')
    if until_date:
        try:
            until_dt = to_naive_utc(until_date)
        except (ValueError, TypeError, AttributeError) as e:
            self.log(f"Ignoring unparseable until_date {until_date!r}: {e}", 'warning')

    if since_dt:
        self.log(f"Date filter: since_date={since_dt.isoformat()}", 'debug')

    all_posts: List[Post] = []
    offset = 0
    page_size = 50
    page = 0

    while True:
        self._delay_between_items()
        params = {
            'limit': str(page_size),
            'offset': str(offset),
            'order': 'publish_date_desc',
        }
        data = await self._api_request(f"/users/{user_id}/posts", params=params)
        if not data:
            break

        # OF returns a list of posts directly
        posts_list = data if isinstance(data, list) else data.get('list', [])
        if not posts_list:
            break

        for post_data in posts_list:
            post = self._parse_post(post_data, user_id)
            if not post:
                continue

            # Parse the post date once and apply both filters to it
            post_dt = None
            if post.published_at and (since_dt or until_dt):
                try:
                    post_dt = to_naive_utc(post.published_at)
                except (ValueError, TypeError, AttributeError) as e:
                    self.log(f"Date comparison error: {e} (post_date={post.published_at})", 'warning')
            if post_dt is not None:
                if since_dt and post_dt < since_dt:
                    self.log(f"Reached posts older than since_date ({post.published_at}), stopping", 'debug')
                    return all_posts
                if until_dt and post_dt > until_dt:
                    continue

            all_posts.append(post)
            if max_posts and len(all_posts) >= max_posts:
                self.log(f"Reached max_posts limit: {max_posts}", 'debug')
                return all_posts

        page += 1
        if progress_callback:
            progress_callback(page, len(all_posts))

        # If we got fewer results than page_size, we've reached the end
        if len(posts_list) < page_size:
            break
        offset += page_size
        self._delay_between_batches()

    # Also fetch pinned posts (they may not appear in the timeline)
    self._delay_between_items()
    pinned_data = await self._api_request(
        f"/users/{user_id}/posts",
        params={'limit': '50', 'offset': '0', 'order': 'publish_date_desc', 'pinned': '1'},
    )
    if pinned_data:
        pinned_list = pinned_data if isinstance(pinned_data, list) else pinned_data.get('list', [])
        existing_ids = {p.post_id for p in all_posts}
        for post_data in pinned_list:
            post = self._parse_post(post_data, user_id)
            if post and post.post_id not in existing_ids:
                all_posts.append(post)

    self.log(f"Fetched {len(all_posts)} posts for {username}", 'info')
    return all_posts
def _parse_post(self, post_data: Dict, user_id: str) -> Optional[Post]:
    """
    Convert one raw OnlyFans post into a Post model.

    Args:
        post_data: Raw post data from API
        user_id: Creator's user ID

    Returns:
        Post object, or None when the post has no id or parsing fails
    """
    try:
        post_id = str(post_data.get('id', ''))
        if not post_id:
            return None

        # Timestamp: ISO strings are kept as-is, numeric values are treated
        # as epoch seconds and rendered as an ISO string.
        published_at = None
        raw_date = post_data.get('postedAt') or post_data.get('createdAt')
        if raw_date:
            try:
                if isinstance(raw_date, str):
                    published_at = raw_date
                elif isinstance(raw_date, (int, float)):
                    published_at = datetime.fromtimestamp(raw_date).isoformat()
            except (ValueError, TypeError, OSError):
                pass

        # Body text with HTML removed
        raw_text = post_data.get('rawText') or post_data.get('text') or ''
        content = self._strip_html(raw_text)

        # Media attachments; items that fail to parse are dropped
        parsed_media = (
            self._parse_attachment(item)
            for item in (post_data.get('media', []) or [])
        )
        attachments = [attachment for attachment in parsed_media if attachment]

        # Embed URLs mentioned in the text
        url_pattern = r'https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|vimeo\.com/|dailymotion\.com/video/)\S+'
        embed_urls = re.findall(url_pattern, content) if content else []

        return Post(
            post_id=post_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=user_id,
            title=None,
            content=content,
            published_at=published_at,
            added_at=datetime.now().isoformat(),
            attachments=attachments,
            embed_urls=embed_urls,
            is_pinned=bool(post_data.get('isPinned')),
            pinned_at=post_data.get('pinnedAt'),
        )
    except Exception as e:
        self.log(f"Error parsing post: {e}", 'error')
        return None
def _parse_attachment(self, media_item: Dict) -> Optional[Attachment]:
    """
    Parse an OnlyFans media item into an Attachment.

    OF media structure:
        {
            id, type, source: {source: url, width, height, duration},
            full: {source: url, ...}, preview: {source: url, ...}
        }
    The variants may also be nested under a 'files' key.

    URL resolution order: 'full', then 'source', then 'preview', then a
    top-level 'src'. Width/height/duration are taken from the first variant
    that provides them. The attachment is flagged ``is_preview`` when the
    item is not viewable (canView is false) or when the only URL available
    came from the 'preview' variant.

    Args:
        media_item: Raw media dict from API

    Returns:
        Attachment object, or None when parsing fails
    """
    try:
        media_id = str(media_item.get('id', ''))
        # 'type' may be present but null
        media_type = (media_item.get('type') or '').lower()

        # Map OF media types to our file types
        type_map = {
            'photo': 'image',
            'video': 'video',
            'audio': 'audio',
            'gif': 'image',
        }
        file_type = type_map.get(media_type, 'unknown')

        # Current OF API nests media under 'files' key
        files = media_item.get('files') or media_item

        download_url = None
        url_origin = None  # which variant supplied download_url
        width = None
        height = None
        duration = None

        # Prefer 'full' quality, fall back to 'source'; keep metadata from
        # a variant even when it carries no URL.
        for variant in ('full', 'source'):
            info = files.get(variant)
            if not (info and isinstance(info, dict)):
                continue
            download_url = info.get('url') or info.get('source')
            width = width or info.get('width')
            height = height or info.get('height')
            duration = duration or info.get('duration')
            if download_url:
                url_origin = variant
                break

        # OF DRM videos use FairPlay SAMPLE-AES encryption — cannot be downloaded.
        # Keep the duration from the media item, then fall through to the preview frame.
        if not download_url and media_type == 'video' and not duration:
            duration = media_item.get('duration')

        # Fallback to 'preview' for any content type
        # For DRM videos (canView=true), downloads the preview frame image (shown with lock overlay)
        # For PPV videos (canView=false), there's no preview — marked unavailable
        if not download_url:
            preview_data = files.get('preview')
            if preview_data and isinstance(preview_data, dict):
                download_url = preview_data.get('url') or preview_data.get('source')
                width = width or preview_data.get('width')
                height = height or preview_data.get('height')
                if download_url:
                    url_origin = 'preview'

        # Some OF responses have src directly
        if not download_url:
            download_url = media_item.get('src')
            if download_url:
                url_origin = 'src'

        # Extension: from the last URL path segment, else a default per media type
        ext = ''
        if download_url:
            last_segment = urlparse(download_url).path.rsplit('/', 1)[-1]
            if '.' in last_segment:
                ext = last_segment.rsplit('.', 1)[-1].lower()
                if ext == 'jpeg':
                    ext = 'jpg'
        if not ext:
            ext = {'photo': 'jpg', 'video': 'mp4'}.get(media_type, '')

        filename = f"{media_id}.{ext}" if ext else str(media_id)

        # Override file_type based on actual extension (OF sometimes misreports type)
        video_exts = {'mp4', 'mov', 'webm', 'avi', 'mkv', 'flv', 'm4v', 'wmv', 'mpg', 'mpeg'}
        if ext in video_exts and file_type != 'video':
            file_type = 'video'

        # Duration may be in seconds (float or int)
        if duration is not None:
            try:
                duration = int(float(duration))
            except (ValueError, TypeError):
                duration = None

        # Locked content (canView=false) vs. viewable content with only a preview URL
        can_view = media_item.get('canView', True)
        if not download_url and not can_view:
            self.log(f"PPV/locked content: {filename}", 'debug')
        is_preview = (not can_view) or url_origin == 'preview'

        return Attachment(
            name=filename,
            server_path=f"/onlyfans/{media_id}",
            file_type=file_type,
            extension=ext if ext else None,
            download_url=download_url,
            file_size=None,
            width=width,
            height=height,
            duration=duration,
            is_preview=is_preview,
        )
    except Exception as e:
        self.log(f"Error parsing attachment: {e}", 'error')
        return None
# ==================== MESSAGES ====================
async def get_messages(self, user_id: str, max_messages: int = 500) -> List[Message]:
    """
    Fetch messages from a conversation with a creator.

    Uses GET /chats/{user_id}/messages with cursor-based pagination: the id of
    the last message on a page is sent as the 'id' param to get older
    messages. Paging stops on an empty or short page, when the cursor stops
    advancing, or once ``max_messages`` have been collected.

    Args:
        user_id: OnlyFans numeric user ID of the creator
        max_messages: Maximum number of messages to return

    Returns:
        List of Message objects, never longer than ``max_messages``
    """
    page_size = 50
    messages: List[Message] = []
    cursor_id = None
    page = 0

    while len(messages) < max_messages:
        page += 1
        params = {'limit': page_size, 'order': 'desc'}
        if cursor_id:
            params['id'] = cursor_id

        # Same per-request pacing as the other fetchers in this client
        self._delay_between_items()
        data = await self._api_request(f"/chats/{user_id}/messages", params=params)
        if not data:
            break

        # Response is a dict with 'list' key containing messages
        msg_list = data.get('list', []) if isinstance(data, dict) else data
        if not msg_list:
            break

        for msg_data in msg_list:
            msg = self._parse_message(msg_data, user_id)
            if msg:
                messages.append(msg)
        self.log(f"Fetched page {page}: {len(msg_list)} messages (total: {len(messages)})", 'debug')

        if len(msg_list) < page_size:
            break  # Last page

        # Use the last message's id as cursor for next page
        last_id = msg_list[-1].get('id')
        if last_id and str(last_id) != str(cursor_id):
            cursor_id = last_id
        else:
            break

    # A whole page is appended at a time, so trim any overshoot
    if len(messages) > max_messages:
        messages = messages[:max_messages]

    self.log(f"Fetched {len(messages)} messages for user {user_id}", 'info')
    return messages
def _parse_message(self, msg_data: Dict, creator_user_id: str) -> Optional[Message]:
    """
    Parse an OnlyFans message into a Message model.

    Args:
        msg_data: Raw message dict from API
        creator_user_id: Numeric user ID of the creator (to determine direction)

    Returns:
        Message object, or None when the message has no id or parsing fails
    """
    try:
        msg_id = str(msg_data.get('id', ''))
        if not msg_id:
            return None

        # Determine if message is from creator. 'fromUser' may be present but
        # null, which dict.get's default does not cover.
        from_user = msg_data.get('fromUser') or {}
        from_user_id = str(from_user.get('id', ''))
        is_from_creator = (from_user_id == str(creator_user_id))

        # Parse text
        text = self._strip_html(msg_data.get('text') or '')

        # Parse timestamp; values that cannot be normalized are kept as received
        created_at = msg_data.get('createdAt')
        sent_at = None
        if created_at:
            try:
                sent_at = datetime.fromisoformat(created_at.replace('Z', '+00:00')).isoformat()
            except (ValueError, TypeError, AttributeError):
                sent_at = created_at

        # PPV/price info
        price = msg_data.get('price')
        is_free = msg_data.get('isFree', True)
        is_purchased = msg_data.get('isOpened', False) or msg_data.get('canPurchase') is False
        is_tip = msg_data.get('isTip', False)
        tip_amount = msg_data.get('tipAmount')

        # Parse media attachments (same structure as posts)
        attachments = []
        for media_item in (msg_data.get('media', []) or []):
            att = self._parse_attachment(media_item)
            if att:
                attachments.append(att)

        return Message(
            message_id=msg_id,
            platform=self.PLATFORM,
            service_id=self.SERVICE_ID,
            creator_id=str(creator_user_id),
            text=text if text else None,
            sent_at=sent_at,
            is_from_creator=is_from_creator,
            is_tip=bool(is_tip),
            tip_amount=float(tip_amount) if tip_amount else None,
            price=float(price) if price else None,
            is_free=bool(is_free),
            is_purchased=bool(is_purchased),
            attachments=attachments,
        )
    except Exception as e:
        self.log(f"Error parsing message: {e}", 'error')
        return None

View File

@@ -0,0 +1,109 @@
"""
OnlyFans Request Signing Module
Handles the dynamic request signing required by the OnlyFans API.
Fetches signing rules from the DATAHOARDERS/dynamic-rules GitHub repo
and computes SHA-1 based signatures for each API request.
Isolated module so it's easy to update when OF changes their signing scheme.
"""
import hashlib
import time
from typing import Dict, Optional
import aiohttp
RULES_URL = "https://raw.githubusercontent.com/DATAHOARDERS/dynamic-rules/main/onlyfans.json"


class OnlyFansSigner:
    """
    Computes request signatures for the OnlyFans API.

    Uses dynamic rules fetched from a public GitHub repo (same source as OF-Scraper).
    Rules are cached in memory on the instance and refreshed once they are
    older than RULES_TTL. If a refresh fails and cached rules exist, the
    cached rules keep being used.
    """

    RULES_TTL = 6 * 3600  # 6 hours

    def __init__(self, rules_url: Optional[str] = None):
        """
        Args:
            rules_url: URL of the signing-rules JSON; defaults to RULES_URL
        """
        self.rules_url = rules_url or RULES_URL
        self._rules: Optional[Dict] = None
        self._rules_fetched_at: float = 0

    @property
    def rules_stale(self) -> bool:
        """True when no rules are cached or the cached rules are older than RULES_TTL."""
        if self._rules is None:
            return True
        return (time.time() - self._rules_fetched_at) > self.RULES_TTL

    async def get_rules(self) -> Dict:
        """
        Fetch signing rules, using the cache if it is fresh.

        Returns:
            Dict with keys: static_param, format, checksum_indexes,
            checksum_constants, checksum_constant, app_token

        Raises:
            RuntimeError: non-200 response and no cached rules to fall back on
            Exception: the underlying network/timeout/JSON error when the
                fetch fails and no cached rules exist
        """
        if not self.rules_stale:
            return self._rules

        import asyncio  # local import: only needed for the timeout exception type

        timeout = aiohttp.ClientTimeout(total=15)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.rules_url) as resp:
                    if resp.status != 200:
                        if self._rules is not None:
                            # Use stale cache rather than failing
                            return self._rules
                        raise RuntimeError(
                            f"Failed to fetch OF signing rules: HTTP {resp.status}"
                        )
                    rules = await resp.json(content_type=None)
        except (aiohttp.ClientError, asyncio.TimeoutError, ValueError):
            # Connection failure, timeout or undecodable body: a stale cache
            # is still better than failing every signed request.
            if self._rules is not None:
                return self._rules
            raise

        self._rules = rules
        self._rules_fetched_at = time.time()
        return self._rules

    async def sign(self, endpoint_path: str, user_id: str = "0") -> Dict[str, str]:
        """
        Compute signing headers for an OnlyFans API request.

        Args:
            endpoint_path: The full URL path (e.g. "/api2/v2/users/me")
            user_id: The authenticated user's ID (from auth_id cookie)

        Returns:
            Dict with 'sign', 'time', 'app-token' headers

        Raises:
            KeyError: the rules lack static_param, checksum_indexes, format
                or app_token
        """
        rules = await self.get_rules()

        # Timestamp in milliseconds (matching OF-Scraper's implementation)
        timestamp = str(round(time.time() * 1000))

        # 1. Build the message to hash
        msg = "\n".join([
            rules["static_param"],
            timestamp,
            endpoint_path,
            str(user_id),
        ])

        # 2. SHA-1 hash; the checksum works on the ASCII bytes of the hex digest
        sha1_hash = hashlib.sha1(msg.encode("utf-8")).hexdigest()
        sha1_bytes = sha1_hash.encode("ascii")

        # 3. Checksum from indexed byte positions + single constant
        #    (matching OF-Scraper's implementation)
        checksum_indexes = rules["checksum_indexes"]
        checksum_constant = rules.get("checksum_constant", 0)
        checksum = sum(sha1_bytes[i] for i in checksum_indexes) + checksum_constant

        # 4. Build the sign header using the format template
        #    Typical format: "53760:{}:{:x}:69723085"
        sign_value = rules["format"].format(sha1_hash, abs(checksum))

        return {
            "sign": sign_value,
            "time": timestamp,
            "app-token": rules["app_token"],
        }

View File

@@ -0,0 +1,755 @@
"""
Pornhub Client - Fetches creator info and videos using yt-dlp
"""
import asyncio
import html as html_module
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class PornhubClient(LoggingMixin):
    """
    yt-dlp based client for Pornhub creator information and videos.

    Supported creator pages:
    - Pornstar pages (pornhub.com/pornstar/name)
    - Channel pages (pornhub.com/channels/name)
    - User pages (pornhub.com/users/name)
    - Model pages (pornhub.com/model/name)
    """

    SERVICE_ID = 'pornhub'
    PLATFORM = 'pornhub'

    # Quality presets for yt-dlp
    # Pornhub serves single combined streams with IDs like '1080p', '720p', etc.
    # NOT separate video+audio streams like YouTube
    QUALITY_PRESETS = {
        'best': 'bestvideo+bestaudio/best',
        '1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
        '720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
        '480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
    }

    def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None):
        """
        Args:
            ytdlp_path: Path to the yt-dlp executable; searched for when omitted
            unified_db: Database object used to load stored cookies (optional)
            log_callback: Optional logging callback
        """
        self._init_logger('PaidContent', log_callback, default_module='Pornhub')

        # Locate the yt-dlp executable unless the caller supplied one
        resolved_path = ytdlp_path or self._find_ytdlp()
        self.ytdlp_path = resolved_path
        if not resolved_path:
            self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')

        # Database reference for cookie access; the cookie file is written on demand
        self.unified_db = unified_db
        self._cookies_file = None

        # Cache for profile page HTML (avoid re-fetching for avatar/banner/bio)
        self._profile_page_cache: Dict[str, Optional[str]] = {}
def _find_ytdlp(self) -> Optional[str]:
"""Find yt-dlp executable"""
common_paths = [
'/opt/media-downloader/venv/bin/yt-dlp',
'/usr/local/bin/yt-dlp',
'/usr/bin/yt-dlp',
'/opt/homebrew/bin/yt-dlp',
os.path.expanduser('~/.local/bin/yt-dlp'),
]
for path in common_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
try:
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def is_available(self) -> bool:
    """Report whether a yt-dlp executable path is set."""
    ytdlp = self.ytdlp_path
    return ytdlp is not None
def _get_cookies_file(self) -> Optional[str]:
"""Get path to cookies file, creating it from database if needed"""
if self._cookies_file and os.path.exists(self._cookies_file):
return self._cookies_file
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
# Support both {"cookies": [...]} and [...] formats
if isinstance(data, dict) and 'cookies' in data:
cookies_list = data['cookies']
elif isinstance(data, list):
cookies_list = data
else:
cookies_list = []
if cookies_list:
# Write cookies to temp file in Netscape format
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_')
with os.fdopen(fd, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
for cookie in cookies_list:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expiry = str(int(cookie.get('expirationDate', 0)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug')
return self._cookies_file
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _get_cookies_list(self) -> Optional[list]:
"""Get cookies as a list of dicts for aiohttp requests"""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
if isinstance(data, dict) and 'cookies' in data:
return data['cookies']
elif isinstance(data, list):
return data
except Exception as e:
self.log(f"Could not load cookies list: {e}", 'debug')
return None
def _get_base_cmd(self) -> List[str]:
"""Get base yt-dlp command with cookies if available"""
cmd = [self.ytdlp_path]
cookies_file = self._get_cookies_file()
if cookies_file:
cmd.extend(['--cookies', cookies_file])
return cmd
def cleanup(self):
    """Delete the temporary cookies file (best effort) and clear the profile page cache."""
    cookies_path = self._cookies_file
    if cookies_path and os.path.exists(cookies_path):
        try:
            os.unlink(cookies_path)
        except Exception:
            # Best effort: a leftover temp file is not worth failing over
            pass
        self._cookies_file = None
    self._profile_page_cache.clear()
@staticmethod
def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
"""
Extract creator type and identifier from Pornhub URL
Returns:
Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model'
or None if not a valid Pornhub creator URL
"""
patterns = [
(r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'),
(r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
(r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'),
(r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'),
]
for pattern, creator_type in patterns:
match = re.search(pattern, url)
if match:
return (creator_type, match.group(1))
return None
@staticmethod
def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str:
"""Convert creator ID to a consistent URL format
Args:
creator_id: Creator name/identifier (may be 'type/name' format)
creator_type: Default type if not embedded in creator_id
"""
# Already a full URL
if creator_id.startswith('http://') or creator_id.startswith('https://'):
return creator_id
# Handle 'type/name' format from URL parser
if '/' in creator_id:
parts = creator_id.split('/', 1)
creator_type = parts[0]
creator_id = parts[1]
return f"https://www.pornhub.com/{creator_type}/{creator_id}"
def _get_listing_url(self, url: str) -> str:
"""Get the URL to use for listing videos from a creator page.
For pornstars and models, append /videos to get the video listing.
For channels and users, the base URL already lists videos.
"""
# Parse out the type
parsed = self.extract_creator_id(url)
if parsed:
creator_type, _ = parsed
if creator_type in ('pornstar', 'model'):
# Strip any trailing slash and append /videos
url = url.rstrip('/')
if not url.endswith('/videos'):
url = f"{url}/videos"
return url
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """
    Get creator information using profile page scraping, then yt-dlp.

    The display name is looked up in three steps, stopping at the first hit:
    the profile page HTML, the first entry of the yt-dlp flat playlist, and
    finally the URL slug (dashes replaced by spaces, title-cased).

    Returns:
        Dict with creator_id, creator_name, creator_url and creator_type,
        or None when yt-dlp is unavailable or no name could be determined
    """
    if not self.is_available():
        return None

    type_and_id = self.extract_creator_id(url)
    creator_type = type_and_id[0] if type_and_id else 'pornstar'
    creator_name = None

    # Step 1: display name from the profile page
    try:
        page_html = await self.get_profile_page(url)
        if page_html:
            # Look for <h1 itemprop="name">Name</h1> inside nameSubscribe div
            heading = re.search(r'<div class="nameSubscribe">.*?<h1[^>]*>\s*(.+?)\s*</h1>', page_html, re.DOTALL)
            if heading:
                creator_name = html_module.unescape(heading.group(1).strip())
                self.log(f"Found creator name from profile page: {creator_name}", 'debug')
    except Exception as e:
        self.log(f"Could not scrape creator name: {e}", 'debug')

    # Step 2: ask yt-dlp for the first playlist entry
    if not creator_name:
        try:
            listing_url = self._get_listing_url(url)
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--playlist-items', '1',
                listing_url
            ]
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await proc.communicate()
            if proc.returncode == 0:
                output = stdout.decode('utf-8', errors='replace').strip()
                for line in output.split('\n'):
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    playlist_title = entry.get('playlist_title') or ''
                    creator_name = (entry.get('channel') or entry.get('uploader')
                                    or playlist_title.replace(' - Videos', '') or None)
                    if creator_name:
                        creator_name = html_module.unescape(creator_name)
                    # Only the first parseable entry is consulted
                    break
        except Exception as e:
            self.log(f"yt-dlp creator info failed: {e}", 'debug')

    # Step 3: derive a name from the URL slug
    if not creator_name and type_and_id:
        creator_name = type_and_id[1].replace('-', ' ').title()

    if not creator_name:
        return None
    return {
        'creator_id': type_and_id[1] if type_and_id else None,
        'creator_name': creator_name,
        'creator_url': url,
        'creator_type': creator_type,
    }
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """
    Get all videos from a creator page using --flat-playlist for speed.

    Args:
        url: Pornhub creator URL
        since_date: Accepted for interface compatibility; this method does
            not filter on it (flat-playlist entries usually carry no date)
        max_videos: Maximum number of videos to fetch
        progress_callback: Callback function(count) for progress updates

    Returns:
        List of video metadata dicts. An empty list is returned when yt-dlp
        is unavailable, exits non-zero, or an unexpected error occurs.
    """
    if not self.is_available():
        return []
    try:
        listing_url = self._get_listing_url(url)
        # Use --flat-playlist for fast listing (avoids per-video HTTP requests)
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--socket-timeout', '30',
            '--retries', '3',
            listing_url
        ]
        if max_videos:
            cmd.extend(['--playlist-items', f'1:{max_videos}'])
        self.log(f"Fetching videos from: {url}", 'info')
        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()
        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get creator videos: {error}", 'warning')
            return []
        videos = []
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Only JSON objects describe entries; a stray scalar/array line
            # must not abort the whole listing.
            if not isinstance(data, dict):
                continue
            # Skip non-video entries
            if data.get('_type') == 'playlist':
                continue
            video_id = data.get('id')
            if not video_id:
                continue
            # Flat-playlist doesn't provide upload_date for Pornhub, but check anyway
            upload_date = data.get('upload_date')
            if upload_date:
                try:
                    upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                except (TypeError, ValueError):
                    # Keep the raw value when it is not a YYYYMMDD string
                    pass
            # "title" may be present but null, so `or` is used instead of a
            # .get() default. Decode HTML entities (flat-playlist returns
            # them encoded).
            title = html_module.unescape(str(data.get('title') or f'Video {video_id}'))
            # Build video URL
            video_url = (data.get('webpage_url') or data.get('url')
                         or f"https://www.pornhub.com/view_video.php?viewkey={video_id}")
            videos.append({
                'video_id': video_id,
                'title': title,
                'description': data.get('description', ''),
                'upload_date': upload_date,
                'duration': data.get('duration'),
                'view_count': data.get('view_count'),
                'thumbnail': data.get('thumbnail'),
                'url': video_url,
            })
            if progress_callback:
                progress_callback(len(videos))
            if max_videos and len(videos) >= max_videos:
                break
        self.log(f"Found {len(videos)} videos", 'info')
        return videos
    except Exception as e:
        self.log(f"Error getting creator videos: {e}", 'error')
        return []
async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                         progress_callback=None) -> Dict:
    """
    Download a single video with yt-dlp.

    Args:
        video_url: Pornhub video URL
        output_dir: Directory to save the video (created if missing)
        quality: Key into QUALITY_PRESETS; unknown keys fall back to 'best'
        progress_callback: Accepted but not used by this method

    Returns:
        Dict with 'success' plus either file/metadata fields or an 'error'
        message.
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}
    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        name_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')
        presets = self.QUALITY_PRESETS
        chosen_format = presets.get(quality, presets['best'])
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '-f', chosen_format,
            '-o', name_template,
            '--print-json',
            '--no-playlist',
            '--user-agent', 'Mozilla/5.0',
            '--referer', 'https://www.pornhub.com/',
            '--merge-output-format', 'mp4',
            '--concurrent-fragments', '4',
            '--no-part',
            '--retries', '20',
            video_url
        ]
        self.log(f"Downloading video: {video_url}", 'debug')
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        out_bytes, err_bytes = await proc.communicate()

        if proc.returncode != 0:
            # Collapse well-known failure texts into short, stable messages
            reason = err_bytes.decode('utf-8', errors='replace').strip()
            if 'Video unavailable' in reason or 'not available' in reason:
                reason = 'Video unavailable or private'
            elif 'premium' in reason.lower():
                reason = 'Video requires premium access'
            elif len(reason) > 200:
                reason = reason[:200] + '...'
            return {'success': False, 'error': reason}

        # The first stdout line that parses as JSON is taken as the info dict
        video_info = None
        for candidate in out_bytes.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                video_info = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            break

        if not video_info:
            # No usable JSON: fall back to the most recently modified .mp4
            mp4_files = list(output_dir.glob('*.mp4'))
            if not mp4_files:
                return {'success': False, 'error': 'Could not find downloaded file'}
            newest = max(mp4_files, key=lambda f: f.stat().st_mtime)
            return {
                'success': True,
                'file_path': str(newest),
                'filename': newest.name,
                'file_size': newest.stat().st_size
            }

        reported_name = video_info.get('_filename') or video_info.get('filename')
        saved = Path(reported_name) if reported_name else None
        if saved and saved.exists():
            size = saved.stat().st_size
        else:
            size = video_info.get('filesize')
        return {
            'success': True,
            'file_path': str(saved) if saved else None,
            'filename': saved.name if saved else None,
            'file_size': size,
            'title': video_info.get('title'),
            'duration': video_info.get('duration'),
            'video_id': video_info.get('id'),
            'upload_date': video_info.get('upload_date'),
            'timestamp': video_info.get('timestamp'),
            'thumbnail': video_info.get('thumbnail'),
        }
    except Exception as e:
        self.log(f"Error downloading video: {e}", 'error')
        return {'success': False, 'error': str(e)}
async def get_profile_page(self, url: str) -> Optional[str]:
    """Return the profile page HTML for *url*, fetching it at most once.

    A trailing /videos segment is removed before the request. Both
    successful pages and failures (stored as None) are remembered in
    self._profile_page_cache, so avatar/banner/bio lookups share one fetch.
    """
    # Strip /videos suffix for profile page
    base_url = re.sub(r'/videos/?$', '', url)
    cache = self._profile_page_cache
    if base_url in cache:
        return cache[base_url]

    page_text = None
    try:
        import aiohttp
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        # Flatten the stored cookie records into a name -> value mapping
        jar = {}
        for record in (self._get_cookies_list() or []):
            cookie_name = record.get('name', '')
            cookie_value = record.get('value', '')
            if cookie_name:
                jar[cookie_name] = cookie_value
        async with aiohttp.ClientSession(cookies=jar) as session:
            async with session.get(
                base_url,
                headers=request_headers,
                timeout=aiohttp.ClientTimeout(total=15)
            ) as resp:
                if resp.status == 200:
                    page_text = await resp.text()
                    cache[base_url] = page_text
                    return page_text
    except Exception as e:
        self.log(f"Could not fetch profile page: {e}", 'debug')
    # Non-200 responses and errors are cached as None to avoid re-fetching
    cache[base_url] = None
    return None
async def get_profile_image(self, url: str) -> Optional[str]:
    """Return the avatar URL scraped from the profile page, or None.

    Tries the <img id="getAvatar"> element first, then an og:image meta tag
    in either attribute order. Any error is logged at debug level.
    """
    og_patterns = (
        r'<meta\s+property="og:image"\s+content="([^"]+)"',
        r'<meta\s+content="([^"]+)"\s+property="og:image"',
    )
    try:
        page_html = await self.get_profile_page(url)
        if page_html:
            # Look for avatar image: <img id="getAvatar" src="...">
            avatar = re.search(r'<img[^>]*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', page_html)
            if avatar:
                self.log("Found Pornhub profile avatar", 'debug')
                return avatar.group(1)
            # Try og:image meta tag
            for pattern in og_patterns:
                og_hit = re.search(pattern, page_html)
                if og_hit:
                    return og_hit.group(1)
    except Exception as e:
        self.log(f"Could not fetch profile image: {e}", 'debug')
    return None
async def get_profile_bio(self, url: str) -> Optional[str]:
    """Return the bio/about text scraped from the profile page, or None.

    Markup inside the matched section is removed and HTML entities are
    decoded. Any error is logged at debug level.
    """
    # (pattern, log message) pairs, tried in order
    # Primary structure: <section class="aboutMeSection ..."><div class="title">About Name</div><div>Bio text</div></section>
    attempts = (
        (r'<section\s+class="aboutMeSection[^"]*"[^>]*>.*?<div class="title">[^<]*</div>\s*<div>\s*(.*?)\s*</div>',
         "Found Pornhub profile bio"),
        (r'class="biographyAbout[^"]*"[^>]*>.*?<div class="content[^"]*">(.*?)</div>',
         "Found Pornhub profile bio (fallback)"),
    )
    try:
        page_html = await self.get_profile_page(url)
        if not page_html:
            return None
        for pattern, found_message in attempts:
            hit = re.search(pattern, page_html, re.DOTALL)
            if not hit:
                continue
            plain = re.sub(r'<[^>]+>', '', hit.group(1)).strip()
            if plain:
                self.log(found_message, 'debug')
                return html_module.unescape(plain)
    except Exception as e:
        self.log(f"Could not fetch profile bio: {e}", 'debug')
    return None
async def get_profile_banner(self, url: str) -> Optional[str]:
    """Return the banner/cover image URL from the profile page, or None.

    Looks for <img id="coverPictureDefault"> first, then for an <img>
    directly inside <div class="coverImage">. Errors are logged at debug
    level.
    """
    # (pattern, regex flags, log message), tried in order
    attempts = (
        (r'<img[^>]*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']',
         0, "Found Pornhub profile banner"),
        (r'<div class="coverImage">\s*<img[^>]*src=["\']([^"\']+)["\']',
         re.DOTALL, "Found Pornhub profile banner (div)"),
    )
    try:
        page_html = await self.get_profile_page(url)
        if not page_html:
            return None
        for pattern, flags, found_message in attempts:
            cover = re.search(pattern, page_html, flags)
            if cover:
                self.log(found_message, 'debug')
                return cover.group(1)
    except Exception as e:
        self.log(f"Could not fetch profile banner: {e}", 'debug')
    return None
async def get_profile_info(self, url: str) -> Optional[Dict]:
    """Collect the label/value pairs shown in the profile's infoPiece blocks.

    Returns:
        Dict keyed by the lower-cased label with spaces replaced by
        underscores (e.g. 'birth_place'), or None when the page is missing
        or contains no usable pairs.
    """
    page_html = await self.get_profile_page(url)
    if not page_html:
        return None

    def _plain(fragment: str) -> str:
        # Drop any markup and surrounding whitespace
        return re.sub(r'<[^>]+>', '', fragment).strip()

    # Extract infoPiece data (Gender, Birth Place, Height, etc.)
    piece_pattern = r'<div class="infoPiece">\s*<span>\s*(.*?)\s*</span>\s*(.*?)\s*</div>'
    collected = {}
    for piece in re.finditer(piece_pattern, page_html, re.DOTALL):
        label = _plain(piece.group(1)).rstrip(':')
        value = _plain(piece.group(2))
        if label and value:
            collected[label.lower().replace(' ', '_')] = value
    if not collected:
        return None
    return collected
async def get_joined_date(self, url: str) -> Optional[str]:
    """Return the career start year from the profile info, or None.

    Only the 'career_start_and_end' entry is consulted (e.g.
    "2011 to Present" -> "2011"). Errors are logged at debug level.
    """
    try:
        profile_info = await self.get_profile_info(url)
        if profile_info:
            # Pornstar pages have "Career Start and End: 2011 to Present"
            career = profile_info.get('career_start_and_end')
            # The value must begin with a four-digit year
            year = re.match(r'(\d{4})', career) if career else None
            if year:
                return year.group(1)
        # User/model pages might not have career info
        return None
    except Exception as e:
        self.log(f"Could not get joined date: {e}", 'debug')
        return None
async def get_creator(self, url: str) -> Optional[Creator]:
    """
    Build a Creator object for a creator URL.

    Returns None when get_creator_info() yields nothing. The creator_id is
    'type/name' when the URL can be parsed, otherwise the id reported by
    get_creator_info().
    """
    info = await self.get_creator_info(url)
    if not info:
        return None

    # Build creator_id as 'type/name' format
    parsed = self.extract_creator_id(url)
    if not parsed:
        creator_id = info.get('creator_id', '')
    else:
        creator_id = f"{parsed[0]}/{parsed[1]}"

    # get_profile_image goes through get_profile_page, which caches pages
    avatar_url = await self.get_profile_image(url)

    creator_fields = {
        'creator_id': creator_id,
        'service_id': 'pornhub',
        'platform': 'pornhub',
        'username': info.get('creator_name', 'Unknown'),
        'display_name': info.get('creator_name'),
        'profile_image_url': avatar_url,
    }
    return Creator(**creator_fields)
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """
    Return the creator's videos as Post objects.

    Each video from get_creator_videos() becomes one Post carrying a single
    video Attachment whose server_path and download_url are the video URL.
    """
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)

    # Get creator_id from URL ('type/name', or '' when the URL cannot be parsed)
    parsed = self.extract_creator_id(url)
    creator_id = f"{parsed[0]}/{parsed[1]}" if parsed else ''

    def _to_post(video: Dict) -> Post:
        # One attachment per video
        media = Attachment(
            name=f"{video['title']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=video['url'],
            download_url=video['url'],
            duration=video.get('duration'),
        )
        return Post(
            post_id=video['video_id'],
            service_id='pornhub',
            platform='pornhub',
            creator_id=creator_id,
            title=video['title'],
            content=video.get('description') or video['title'],
            published_at=video.get('upload_date'),
            attachments=[media],
        )

    return [_to_post(video) for video in videos]

View File

@@ -0,0 +1,678 @@
"""
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
to produce Post/Attachment objects for the paid content system.
"""
import asyncio
import json
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class RedditClient(LoggingMixin):
"""
Client for fetching Reddit subreddit content via gallery-dl.
gallery-dl downloads the media files while posts are being fetched, so the
Attachments built by get_posts() already point at files on disk: their
server_path is a path inside a temporary directory and download_url is None.
The sync handler is expected to move those files to their final location.
"""
# Identifiers stamped on every Post produced by this client
SERVICE_ID = 'reddit'
PLATFORM = 'reddit'
def __init__(self, unified_db=None, log_callback=None):
self._init_logger('PaidContent', log_callback, default_module='Reddit')
self.unified_db = unified_db
self.gallery_dl_path = shutil.which('gallery-dl') or '/opt/media-downloader/venv/bin/gallery-dl'
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info from Reddit's public about.json endpoint.

    Args:
        subreddit: Subreddit name (without r/)

    Returns:
        On success a dict with creator_id, creator_name, display_name, bio,
        joined_date, profile_image_url and banner_image_url. For HTTP errors
        other than 404 only creator_id and creator_name are returned so a
        later sync can verify the subreddit. None for a 404 or any other
        failure.
    """
    import urllib.request
    import urllib.error
    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())
        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')
        # Extract icon — community_icon is higher res, icon_img is fallback
        icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
        # HTML entities in URLs
        icon_url = icon_url.replace('&amp;', '&') if icon_url else None
        # Extract banner — banner_background_image is the main one
        banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
        banner_url = banner_url.split('?')[0] if banner_url else None
        if banner_url:
            banner_url = banner_url.replace('&amp;', '&')
        # Build bio from title + public description + subscriber count
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        # One part per line; joining on '' would run the title, description
        # and subscriber count together into a single unreadable string.
        bio = '\n'.join(bio_parts) if bio_parts else None
        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError):
                pass
        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'
        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url or None,
            'banner_image_url': banner_url or None,
        }
    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        elif e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        else:
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            # Return basic info and let sync verify
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
              progress_callback=None) -> tuple:
    """Fetch posts and download media from a subreddit using gallery-dl.

    Args:
        subreddit: Subreddit name (without r/)
        since_date: ISO date string; skip posts older than this
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Optional callable(dl_count, skip_count, total_seen),
            forwarded to run_gallery_dl() for live progress updates

    Returns:
        Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
        when done moving files. Returns ([], None) on failure or when
        nothing was downloaded (the temp dir is removed in that case).
    """
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
    try:
        # run_gallery_dl() returns only counters; the downloaded paths are
        # delivered through batch_callback, so collect them here.
        downloaded_files: List[Path] = []
        self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
                            progress_callback=progress_callback,
                            batch_callback=downloaded_files.extend)
        if not downloaded_files:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None
        # Group files by post using metadata sidecars
        grouped = self._group_files_by_post(downloaded_files, temp_dir, subreddit)
        if not grouped:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None
        posts = []
        for post_id, post_data in grouped.items():
            attachments = []
            for file_path in post_data['files']:
                ext = file_path.suffix.lower()
                file_type = self._detect_file_type(ext)
                attachments.append(Attachment(
                    name=file_path.name,
                    file_type=file_type,
                    extension=ext,
                    server_path=str(file_path),  # temp path, will be moved
                    download_url=None,  # Already downloaded
                    file_size=file_path.stat().st_size if file_path.exists() else None,
                ))
            if not attachments:
                continue
            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=subreddit.lower(),
                title=post_data.get('title'),
                content=post_data.get('title'),
                published_at=post_data.get('date'),
                attachments=attachments,
            )
            posts.append(post)
        self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
        return posts, temp_dir
    except Exception as e:
        self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
        shutil.rmtree(temp_dir, ignore_errors=True)
        return [], None
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: Date string whose first 10 characters are YYYY-MM-DD;
            used as a lower bound in gallery-dl's --filter. Ignored (with a
            warning) when it does not parse.
        max_posts: Passed as --range 1-N when greater than 0
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time
    # Use a separate download archive for paid content reddit
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]
    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])
    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])
    # Date filtering
    if since_date:
        cutoff = str(since_date)[:10]  # YYYY-MM-DD
        try:
            # Validate first: the value is interpolated into a filter
            # expression that gallery-dl evaluates.
            datetime.strptime(cutoff, '%Y-%m-%d')
        except ValueError:
            self.log(f"Ignoring unparseable since_date {since_date!r} for r/{subreddit}", 'warning')
        else:
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')
    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')
    dl_count = 0
    skip_count = 0
    pending_files = []
    proc = None
    try:
        # stderr is sent to a temp file instead of a pipe: nothing reads it
        # while stdout is being streamed, and an unread pipe would block the
        # child once its buffer fills.
        with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )
            start_time = time.time()
            timeout_secs = 7200  # 2 hours
            while True:
                # Checked between lines only; readline() itself blocks
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                    break
                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)
                total = dl_count + skip_count
                if progress_callback and total % 5 == 0:
                    progress_callback(dl_count, skip_count, total)
                # Flush batch for processing
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()
            proc.wait()
            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()
            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)
            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read()
                self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')
    except Exception as e:
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')
    finally:
        # Never leave the child running or its pipe open, e.g. when a
        # callback raised in the middle of the stream.
        if proc is not None:
            if proc.poll() is None:
                proc.kill()
                proc.wait()
            if proc.stdout:
                proc.stdout.close()
    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
def _group_files_by_post(self, files: List[Path], temp_dir: str,
subreddit: str) -> Dict[str, Dict]:
"""Group downloaded files by Reddit post ID using metadata JSON sidecars.
Adapted from reddit_community_monitor.py:_group_files_by_post
Returns:
Dict mapping reddit_post_id -> {
'files': [Path],
'title': str,
'date': str,
'source_url': str
}
"""
posts: Dict[str, Dict] = {}
for file_path in files:
# Look for matching metadata JSON sidecar
json_path = file_path.with_suffix(file_path.suffix + '.json')
if not json_path.exists():
json_path = file_path.with_suffix('.json')
metadata = {}
if json_path.exists():
try:
with open(json_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
except (json.JSONDecodeError, Exception) as e:
self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
# Extract Reddit post ID
reddit_post_id = None
for key in ('id', 'reddit_id', 'parent_id'):
if key in metadata:
reddit_post_id = str(metadata[key])
break
if not reddit_post_id:
# Filename-based fallback: subreddit_postid_num.ext
parts = file_path.stem.split('_')
if len(parts) >= 2:
reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
else:
reddit_post_id = file_path.stem
# Extract post date
post_date = None
if 'date' in metadata:
date_val = metadata['date']
if isinstance(date_val, str):
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
try:
utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
break
except ValueError:
continue
if not post_date:
post_date = date_val
elif isinstance(date_val, (int, float)):
try:
post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
except (ValueError, OSError):
pass
if not post_date and 'created_utc' in metadata:
try:
post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
except (ValueError, OSError):
pass
if not post_date:
post_date = datetime.now().isoformat()
title = metadata.get('title', metadata.get('description', ''))
sub = metadata.get('subreddit', subreddit)
source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''
if reddit_post_id not in posts:
posts[reddit_post_id] = {
'files': [],
'title': title,
'date': post_date,
'source_url': source_url,
}
posts[reddit_post_id]['files'].append(file_path)
return posts
def _get_cookies_file(self) -> Optional[str]:
"""Get Reddit cookies JSON from the scrapers table if configured."""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
)
row = cursor.fetchone()
if row and row[0]:
return row[0]
except Exception as e:
self.log(f"Could not load Reddit cookies: {e}", 'debug')
return None
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
"""Convert JSON cookies array to Netscape cookie file format."""
try:
cookies = json.loads(cookies_json)
if not isinstance(cookies, list):
return False
with open(output_path, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
for cookie in cookies:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
if expires is None:
expires = 0
expires = str(int(float(expires)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
return True
except Exception as e:
self.log(f"Failed to write Netscape cookie file: {e}", 'error')
return False
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the archive in ascending created_utc order, sleeping
    2 seconds between pages. HTTP 429 responses are retried after a
    5-second wait, up to a fixed number of consecutive attempts; any other
    error stops pagination and the posts collected so far are returned.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.request
    import urllib.error
    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    # Cap on back-to-back 429 retries so persistent rate limiting cannot
    # keep this call blocked forever.
    max_rate_limit_retries = 10
    rate_limit_retries = 0
    all_posts = []
    current_after = after_ts
    page = 0
    while True:
        params = (
            f'subreddit={subreddit}'
            f'&size=100'
            f'&sort=asc'
            f'&sort_type=created_utc'
            f'&after={current_after}'
        )
        if before_ts is not None:
            params += f'&before={before_ts}'
        url = f'{base_url}?{params}'
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429 and rate_limit_retries < max_rate_limit_retries:
                rate_limit_retries += 1
                self.log(f"Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break
        # A page was actually received: reset the retry budget and count it
        # (retries of the same page are not counted as new pages).
        rate_limit_retries = 0
        page += 1
        posts = data.get('data', [])
        if not posts:
            break
        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })
        last_ts = posts[-1].get('created_utc', 0)
        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")
        # Handle stuck pagination — same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts
        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break
        # Rate limit: 2s between requests
        time.sleep(2)
    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
            after every output line
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time
    # Same archive as normal Reddit paid content sync
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]
    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')
    dl_count = 0
    skip_count = 0
    pending_files = []
    proc = None
    try:
        # stderr is sent to a temp file instead of a pipe: nothing reads it
        # while stdout is being streamed, and an unread pipe would block the
        # child once its buffer fills.
        with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )
            start_time = time.time()
            timeout_secs = 14400  # 4 hours for backfill (can be large)
            while True:
                # Checked between lines only; readline() itself blocks
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log("gallery-dl backfill timed out", 'error')
                    break
                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)
                total = dl_count + skip_count
                if progress_callback:
                    progress_callback(dl_count, skip_count, total)
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()
            proc.wait()
            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()
            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)
            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read()
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')
    except Exception as e:
        self.log(f"gallery-dl backfill failed: {e}", 'error')
    finally:
        # Never leave the child running or its pipe open, e.g. when a
        # callback raised in the middle of the stream.
        if proc is not None:
            if proc.poll() is None:
                proc.kill()
                proc.wait()
            if proc.stdout:
                proc.stdout.close()
    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
@staticmethod
def _detect_file_type(ext: str) -> str:
"""Detect file type from extension."""
ext = ext.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
if ext in image_exts:
return 'image'
elif ext in video_exts:
return 'video'
return 'unknown'

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,259 @@
"""
Snapchat Client for Paid Content - Wraps SnapchatClientDownloader for paid content system.
Maps spotlights and highlights to the Post/Attachment model used by the paid content scraper.
"""
from datetime import datetime
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class SnapchatPaidContentClient(LoggingMixin):
"""
Client for fetching Snapchat creator content via the existing SnapchatClientDownloader.
Each spotlight/highlight collection maps to one Post with snaps as Attachments.
"""
SERVICE_ID = 'snapchat'
PLATFORM = 'snapchat'
# Set up logging and remember the database handle; no network or downloader
# work happens at construction time.
def __init__(self, unified_db=None, log_callback=None):
# _init_logger is not defined in this file (presumably supplied by LoggingMixin);
# it receives the 'PaidContent' name, the callback and 'Snapchat' as default module.
self._init_logger('PaidContent', log_callback, default_module='Snapchat')
# Only forwarded to the SnapchatClientDownloader built in _get_downloader().
self.unified_db = unified_db
# Filled on the first _get_downloader() call and reused afterwards.
self._downloader = None
def _get_downloader(self):
    """Return the shared SnapchatClientDownloader, creating it on first use.

    The downloader module is imported inside the function, so it is only
    loaded when a downloader is actually needed.
    """
    if self._downloader is not None:
        return self._downloader

    from modules.snapchat_client_module import SnapchatClientDownloader

    options = {
        'show_progress': False,
        'use_database': False,
        'log_callback': self.log_callback,
        'unified_db': self.unified_db,
    }
    self._downloader = SnapchatClientDownloader(**options)
    return self._downloader
def get_creator_info(self, username: str) -> Optional[Dict]:
    """Look up a creator's display name and avatar from the profile page.

    The profile page is fetched through the downloader and its
    ``__NEXT_DATA__`` payload is inspected. Every lookup tolerates missing
    keys and JSON ``null`` values, so a partially populated payload degrades
    to the username instead of raising.

    Args:
        username: Snapchat username (without ``@``).

    Returns:
        Dict with ``creator_id``, ``creator_name`` and ``profile_image_url``.
        ``creator_name`` falls back to ``username`` and ``profile_image_url``
        to ``None`` when the page or the fields are unavailable.
    """
    def as_dict(value):
        # JSON nulls and unexpected types are treated as "section missing".
        return value if isinstance(value, dict) else {}

    downloader = self._get_downloader()
    profile_url = f"https://story.snapchat.com/@{username}"
    display_name = username
    avatar_url = None

    html = downloader._fetch_page(profile_url)
    data = downloader._extract_next_data(html) if html else None

    if isinstance(data, dict) and data:
        props = as_dict(as_dict(data.get('props')).get('pageProps'))

        # userProfile uses a $case/userInfo wrapper
        user_info = as_dict(as_dict(props.get('userProfile')).get('userInfo'))
        name = user_info.get('displayName')
        if isinstance(name, str) and name.strip():
            display_name = name.strip()

        # Bitmoji 3D avatar URL is preferred when present
        bitmoji = user_info.get('bitmoji3d')
        if isinstance(bitmoji, dict):
            avatar_url = bitmoji.get('avatarUrl') or bitmoji.get('url')

        # Otherwise fall back to the link-preview (Open Graph) images
        if not avatar_url:
            link_preview = as_dict(props.get('linkPreview'))
            for img_key in ('facebookImage', 'twitterImage'):
                img = link_preview.get(img_key)
                if isinstance(img, dict) and img.get('url'):
                    avatar_url = img['url']
                    break

        # pageTitle format: "DisplayName (@username) | Snapchat..."
        if display_name == username:
            page_title = as_dict(props.get('pageMetadata')).get('pageTitle')
            if isinstance(page_title, str) and '(@' in page_title:
                name_part = page_title.split('(@')[0].strip()
                if name_part:
                    display_name = name_part

    return {
        'creator_id': username,
        'creator_name': display_name,
        'profile_image_url': avatar_url,
    }
def get_creator(self, username: str) -> Optional[Creator]:
    """Build the Creator model for a Snapchat user.

    Returns ``None`` when ``get_creator_info`` yields nothing; otherwise the
    creator name from that lookup is used for both ``username`` and
    ``display_name``.
    """
    info = self.get_creator_info(username)
    if info:
        return Creator(
            creator_id=username,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            username=info.get('creator_name', username),
            display_name=info.get('creator_name'),
            profile_image_url=info.get('profile_image_url'),
        )
    return None
def get_posts(self, username: str, since_date: Optional[str] = None) -> List[Post]:
    """Fetch stories, highlights and spotlights as Post objects.

    Args:
        username: Snapchat username (without @).
        since_date: ISO date or datetime string. A collection is dropped
            when its newest snap is older than this cutoff. Datetimes that
            carry a UTC offset are converted to naive UTC before comparing.
            An unparseable value is logged and ignored (no cutoff).

    Returns:
        List of Post objects, one per collection that produced attachments.
    """
    # Function-scope import: the module only imports `datetime` itself.
    from datetime import timezone

    downloader = self._get_downloader()

    cutoff_dt = None
    if since_date:
        try:
            if 'T' in since_date:
                parsed = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                if parsed.tzinfo is not None:
                    # Comparing aware and naive datetimes raises TypeError.
                    # Assumes snap timestamps are naive UTC, as the previous
                    # stripping of the "+00:00" suffix implied — TODO confirm.
                    parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
                cutoff_dt = parsed
            else:
                cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
        except (ValueError, IndexError):
            self.log(f"Ignoring unparseable since_date {since_date!r}; no cutoff applied", 'warning')

    # Discover content from profile (spotlights, highlights, stories)
    profile_content = downloader.get_profile_content(username)
    self.log(f"Found {len(profile_content.get('spotlights', []))} spotlights, "
             f"{len(profile_content.get('highlight_collections', []))} highlights, "
             f"{'stories' if profile_content.get('story_collection') else 'no stories'} "
             f"for @{username}", 'info')

    posts = []

    # Story snaps are delivered with the profile data
    story_collection = profile_content.get('story_collection')
    if story_collection and story_collection.snaps:
        post = self._collection_to_post(story_collection, username, cutoff_dt)
        if post and post.attachments:
            posts.append(post)

    # Highlights are delivered with the profile data as well
    for collection in profile_content.get('highlight_collections', []):
        post = self._collection_to_post(collection, username, cutoff_dt)
        if post and post.attachments:
            posts.append(post)

    # Spotlights are listed as URLs; each needs its own metadata lookup
    for url in profile_content.get('spotlights', []):
        collection = downloader.get_spotlight_metadata(url)
        if not collection:
            continue
        post = self._collection_to_post(collection, username, cutoff_dt)
        if post and post.attachments:
            posts.append(post)

    self.log(f"Mapped {len(posts)} posts with attachments for @{username}", 'info')
    return posts
def _collection_to_post(self, collection, username: str, cutoff_dt=None) -> Optional[Post]:
    """Turn one snap collection into a Post carrying one Attachment per snap.

    Returns ``None`` when the collection has no snaps, when its newest snap
    is older than ``cutoff_dt``, or when no snap has a media URL. The post
    date is the earliest snap timestamp formatted as ``YYYY-MM-DD``.
    """
    snaps = collection.snaps
    if not snaps:
        return None

    stamps = [snap.timestamp for snap in snaps if snap.timestamp]
    published_at = min(stamps).strftime('%Y-%m-%d') if stamps else None

    # The whole collection is skipped only if even its newest snap is too old
    if cutoff_dt and stamps and max(stamps) < cutoff_dt:
        return None

    attachments = []
    for snap in snaps:
        media_url = snap.media_url
        if not media_url:
            continue
        # Extension follows the media type: videos are .mp4, the rest .jpg
        ext = '.mp4' if snap.media_type == 'video' else '.jpg'
        stem = snap.media_id if snap.media_id else f"snap_{snap.index}"
        attachments.append(Attachment(
            name=f"{stem}{ext}",
            file_type=snap.media_type,
            extension=ext,
            server_path=media_url,
            download_url=media_url,
            width=snap.width or None,
            height=snap.height or None,
            duration=snap.duration_ms // 1000 if snap.duration_ms else None,
        ))

    if not attachments:
        return None

    # The collection title doubles as both post title and post content
    heading = collection.title or None

    return Post(
        post_id=collection.collection_id,
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=username,
        title=heading,
        content=heading,
        published_at=published_at,
        attachments=attachments,
        # Title-cased collection type, e.g. "Spotlight" or "Highlight"
        auto_tags=[collection.collection_type.title()],
    )
def download_snap(self, media_url: str, output_path: str) -> bool:
    """Download a single snap file using the downloader's HTTP session.

    Args:
        media_url: Direct URL to the media file; ``&amp;`` sequences left
            over from HTML are converted back to ``&``.
        output_path: Local path to save the file. Missing parent
            directories are created; a bare filename is saved in the
            current working directory.

    Returns:
        True if the response was HTTP 200 with a non-empty body and the
        file was written, False otherwise (the reason is logged).
    """
    import os

    downloader = self._get_downloader()
    session = downloader._get_session()
    try:
        url = media_url.replace('&amp;', '&')
        resp = session.get(url, timeout=60)
        if resp.status_code == 200 and len(resp.content) > 0:
            # os.makedirs('') raises, so only create a directory when the
            # path actually has a directory component.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(output_path, 'wb') as f:
                f.write(resp.content)
            return True
        self.log(f"Download failed: HTTP {resp.status_code}, size={len(resp.content)}", 'warning')
        return False
    except Exception as e:
        self.log(f"Download error: {e}", 'error')
        return False

View File

@@ -0,0 +1,508 @@
"""
Soundgasm + Liltsome Archive Client for Paid Content
Handles:
- Soundgasm profile scraping (no auth/Cloudflare needed)
- Liltsome archive (liltsome.yerf.org) as supplementary source
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
- Direct HTTP audio downloads (.m4a)
"""
import asyncio
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import quote
import aiohttp
import aiofiles
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
# ---------------------------------------------------------------------------
# Bracket tag helpers
# ---------------------------------------------------------------------------
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Split ``[bracket]`` tags out of a title.

    Args:
        title: Raw title such as ``"[F4M] [Whisper] Good night"``. ``None``
            is treated as an empty title.

    Returns:
        ``(clean_title, tags)``. ``clean_title`` is the title without the
        bracketed parts and with whitespace runs collapsed to single spaces
        (adjacent tags used to leave double spaces behind). ``tags`` are
        stripped, lowercased and de-duplicated in first-seen order.
    """
    title = title or ''
    raw_tags = re.findall(r'\[([^\]]+)\]', title)

    without_tags = re.sub(r'\[[^\]]+\]', ' ', title)
    clean_title = ' '.join(without_tags.split())

    # dict.fromkeys keeps the first occurrence and preserves order.
    lowered = (tag.strip().lower() for tag in raw_tags)
    normalized: List[str] = list(dict.fromkeys(tag for tag in lowered if tag))
    return clean_title, normalized
def format_tag_display(tag_lower: str) -> str:
    """Return the display form of a normalized (lowercase) tag.

    Tags shaped like letters-digit-letters (``f4m``, ``m4f``, ``f4a``) are
    shown fully uppercased; every other tag is title-cased.
    """
    looks_like_gender_tag = re.match(r'^[a-z]+\d[a-z]+$', tag_lower) is not None
    return tag_lower.upper() if looks_like_gender_tag else tag_lower.title()
# ---------------------------------------------------------------------------
# SoundgasmClient
# ---------------------------------------------------------------------------
# Combines two sources for one creator: the live Soundgasm profile (scraped HTML)
# and the Liltsome archive (one large library.json that is cached on disk).
class SoundgasmClient(LoggingMixin):
"""Client for fetching audio from Soundgasm and the Liltsome archive."""
SERVICE_ID = 'soundgasm'
PLATFORM = 'soundgasm'
# Base URLs used to build profile, detail and archive download links.
SOUNDGASM_BASE = 'https://soundgasm.net'
LILTSOME_BASE = 'https://liltsome.yerf.org'
LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
# On-disk copy of library.json and the ETag it was downloaded with; both are
# maintained by _ensure_liltsome_cache.
LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')
# Browser-like headers passed with every HTTP request this client makes.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
}
# Set up logging and start with no archive data loaded.
def __init__(self, log_callback=None):
# _init_logger is not defined in this file (presumably supplied by LoggingMixin).
self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
# Parsed library.json; filled by _load_liltsome_data and reset to None by
# _ensure_liltsome_cache after a fresh download.
self._liltsome_data: Optional[Dict] = None # cached in-memory per sync run
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, username: str) -> Optional[Dict]:
    """Summarise how many posts exist for ``username`` and where they came from.

    Both sources are queried independently and failures are only logged at
    debug level. ``post_count`` is the larger of the two counts; ``source``
    is ``'soundgasm'`` when the profile page answered, otherwise
    ``'liltsome'`` when the archive has entries. Returns ``None`` when
    neither source knows the user.
    """
    post_count, source = 0, None

    # Live Soundgasm profile first
    try:
        profile_entries = await self._fetch_soundgasm_profile(username)
        if profile_entries is not None:
            post_count, source = len(profile_entries), 'soundgasm'
    except Exception as e:
        self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')

    # The archive may hold more posts than the live profile
    try:
        archive_entries = await self._get_liltsome_entries(username)
        if archive_entries:
            post_count = max(post_count, len(archive_entries))
            source = 'liltsome' if source is None else source
    except Exception as e:
        self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')

    if source is None and post_count == 0:
        return None
    return {'username': username, 'post_count': post_count, 'source': source}
async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Collect new posts from Soundgasm and then Liltsome, de-duplicated by post_id.

    ``known_post_ids`` seeds the set of ids to skip. A failure of either
    source is logged as a warning and does not stop the other. When given,
    ``progress_callback`` is called with the running post total after each
    source has been processed.
    """
    seen_ids: Set[str] = set(known_post_ids or set())
    collected: List[Post] = []

    # 1. Soundgasm (may fail if account deleted — that's OK)
    try:
        live_posts = await self._fetch_soundgasm_posts(username, seen_ids)
        for candidate in live_posts:
            if candidate.post_id in seen_ids:
                continue
            seen_ids.add(candidate.post_id)
            collected.append(candidate)
        self.log(f"Soundgasm: {len(live_posts)} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')
    if progress_callback:
        progress_callback(len(collected))

    # 2. Liltsome archive (always)
    try:
        archived_posts = await self._fetch_liltsome_posts(username, seen_ids)
        for candidate in archived_posts:
            if candidate.post_id in seen_ids:
                continue
            seen_ids.add(candidate.post_id)
            collected.append(candidate)
        self.log(f"Liltsome: {len(archived_posts)} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')
    if progress_callback:
        progress_callback(len(collected))

    return collected
async def download_audio(self, download_url: str, output_path: Path) -> Dict:
    """Download an audio file via direct HTTP GET.

    The body is streamed into ``<output_path>.part`` and moved onto
    ``output_path`` only after the whole response has been written, so an
    interrupted transfer never leaves a truncated file at the final path.

    Args:
        download_url: Direct URL of the audio file.
        output_path: Destination path; missing parent directories are created.

    Returns:
        ``{'success': True, 'file_path': str, 'file_size': int}`` on success,
        otherwise ``{'success': False, 'error': str}``.
    """
    part_path = output_path.with_name(output_path.name + '.part')
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        timeout = aiohttp.ClientTimeout(total=300)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(download_url, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'HTTP {resp.status}'}
                total = 0
                async with aiofiles.open(str(part_path), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):
                        await f.write(chunk)
                        total += len(chunk)
        # The staging file is complete and closed: publish it in one step.
        os.replace(part_path, output_path)
        return {
            'success': True,
            'file_path': str(output_path),
            'file_size': total,
        }
    except Exception as e:
        self.log(f"Download failed for {download_url}: {e}", 'error')
        return {'success': False, 'error': str(e)}
    finally:
        # Anything still at the staging path belongs to a failed or cancelled
        # transfer (a successful one has already been moved away).
        try:
            if part_path.exists():
                part_path.unlink()
        except OSError:
            pass
# ------------------------------------------------------------------
# Soundgasm scraping
# ------------------------------------------------------------------
async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
    """Scrape the user's Soundgasm profile page for audio links.

    Returns a list of ``{'slug', 'title'}`` dicts (possibly empty), or
    ``None`` when the page answers with anything other than HTTP 200
    (non-404 failures are additionally logged as a warning).
    """
    profile_url = f'{self.SOUNDGASM_BASE}/u/{username}'
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(profile_url, headers=self.HEADERS) as resp:
            status = resp.status
            if status == 404:
                return None
            if status != 200:
                self.log(f"Soundgasm profile returned {status}", 'warning')
                return None
            html = await resp.text()

    # Links look like <a href="https://soundgasm.net/u/{username}/{slug}">title</a>;
    # the host part is optional so relative links match as well.
    link_pattern = re.compile(
        r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
        re.IGNORECASE,
    )
    return [
        {'slug': found.group(1).strip(), 'title': found.group(2).strip()}
        for found in link_pattern.finditer(html)
    ]
async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Build Post objects for profile entries whose slug is not in ``seen_ids``.

    Each new slug's detail page is fetched over one shared session. Entries
    whose detail page cannot be read or has no audio URL are skipped; errors
    for a single entry are logged at debug level and do not stop the loop.
    """
    profile_entries = await self._fetch_soundgasm_profile(username)
    if not profile_entries:
        return []

    posts: List[Post] = []
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for entry in profile_entries:
            slug = entry['slug']
            if slug in seen_ids:
                continue
            try:
                detail = await self._fetch_soundgasm_detail(session, username, slug)
                if detail is None:
                    continue

                raw_title = detail.get('title', entry.get('title', slug))
                clean_title, tags = parse_bracket_tags(raw_title)
                description = detail.get('description', '')
                audio_url = detail.get('audio_url')
                if not audio_url:
                    continue

                # Default to .m4a unless the URL's last path segment has its own suffix
                ext = '.m4a'
                last_segment = audio_url.split('?')[0].split('/')[-1]
                if '.' in last_segment:
                    ext = '.' + last_segment.rsplit('.', 1)[1]

                audio_file = Attachment(
                    name=f"{slug}{ext}",
                    file_type='audio',
                    extension=ext.lstrip('.'),
                    server_path=f'/u/{username}/{slug}',
                    download_url=audio_url,
                )
                posts.append(Post(
                    post_id=slug,
                    service_id='soundgasm',
                    platform='soundgasm',
                    creator_id=username,
                    title=clean_title or None,
                    content=description or None,
                    published_at=None,  # Soundgasm has no dates
                    attachments=[audio_file],
                    auto_tags=tags,
                ))
            except Exception as e:
                self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')
    return posts
async def _fetch_soundgasm_detail(self, session: 'aiohttp.ClientSession',
                                  username: str, slug: str) -> Optional[Dict]:
    """Fetch one Soundgasm audio page and extract its metadata.

    Title and description are scraped from HTML, so character references
    such as ``&amp;`` or ``&#39;`` are decoded before they are returned.

    Args:
        session: Open HTTP session used for the request.
        username: Profile the audio belongs to.
        slug: Audio slug (last URL path segment).

    Returns:
        Dict with ``title`` (falls back to ``slug``), ``description`` (may
        be ``None``) and ``audio_url``; ``None`` when the page is not
        HTTP 200 or contains no audio URL.
    """
    # Function-scope import; the page text is kept in `page` so the stdlib
    # `html` module name is not shadowed.
    from html import unescape

    url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'
    async with session.get(url, headers=self.HEADERS) as resp:
        if resp.status != 200:
            return None
        page = await resp.text()

    # Title: <div aria-label="title"...>Title Text</div>, else the <title> tag
    title = None
    title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', page)
    if title_match:
        title = title_match.group(1).strip()
    if not title:
        title_match = re.search(r'<title>([^<]+)</title>', page, re.IGNORECASE)
        if title_match:
            title = title_match.group(1).strip()
            # Remove " - Soundgasm" suffix if present
            title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()
    if title:
        title = unescape(title).strip()

    # Description: <div class="jp-description">...</div>
    description = None
    desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', page, re.DOTALL)
    if desc_match:
        description = re.sub(r'<br\s*/?>', '\n', desc_match.group(1))
        description = re.sub(r'<[^>]+>', '', description)
        # Decode entities only after tags are gone, so an escaped "&lt;b&gt;"
        # stays visible text instead of being stripped as markup.
        description = unescape(description).strip()

    # Audio URL: m4a: "https://..."
    audio_match = re.search(r'm4a:\s*"([^"]+)"', page)
    audio_url = audio_match.group(1) if audio_match else None
    if not audio_url:
        return None

    return {
        'title': title or slug,
        'description': description,
        'audio_url': audio_url,
    }
# ------------------------------------------------------------------
# Liltsome archive
# ------------------------------------------------------------------
async def _ensure_liltsome_cache(self) -> bool:
    """Download or refresh the Liltsome ``library.json`` using its ETag.

    A HEAD request compares the server's ETag with the stored one; when they
    match and the cache file exists, nothing is downloaded. Otherwise the
    library is streamed into a ``.part`` file next to the cache and swapped
    in with ``os.replace`` only once complete, so a failed or interrupted
    refresh leaves the previous cache untouched.

    Returns:
        True if a cache file is available (fresh or pre-existing), False
        otherwise.
    """
    etag_file = self.LILTSOME_ETAG_PATH
    cache_file = self.LILTSOME_CACHE_PATH
    part_file = cache_file.with_name(cache_file.name + '.part')

    stored_etag = None
    if etag_file.exists():
        try:
            stored_etag = etag_file.read_text().strip()
        except Exception:
            # An unreadable ETag only means the download cannot be skipped.
            stored_etag = None

    timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # HEAD request to check ETag
            async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
                    return cache_file.exists()
                remote_etag = resp.headers.get('ETag', '').strip()

            if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
                self.log("Liltsome cache is fresh (ETag match)", 'debug')
                return True

            # Download the full library into the staging file
            self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
            async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome GET returned {resp.status}", 'warning')
                    return cache_file.exists()
                cache_file.parent.mkdir(parents=True, exist_ok=True)
                async with aiofiles.open(str(part_file), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(262144):
                        await f.write(chunk)
                new_etag = resp.headers.get('ETag', remote_etag or '').strip()

        # Publish the completed download, then record the ETag it belongs to.
        os.replace(part_file, cache_file)
        if new_etag:
            etag_file.write_text(new_etag)
        self.log("Liltsome library.json downloaded successfully", 'info')
        self._liltsome_data = None  # force re-parse
        return True
    except Exception as e:
        self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
        return cache_file.exists()
    finally:
        # A staging file that still exists was never published; discard it.
        try:
            if part_file.exists():
                part_file.unlink()
        except OSError:
            pass
async def _load_liltsome_data(self) -> Optional[Dict]:
    """Return the parsed Liltsome library, reading the cache file at most once.

    The parsed data is kept on the instance and reused on later calls. The
    file is read in a worker thread. Returns ``None`` when the cache file is
    missing or cannot be parsed (the error is logged).
    """
    already_loaded = self._liltsome_data
    if already_loaded is not None:
        return already_loaded

    cache_file = self.LILTSOME_CACHE_PATH
    if not cache_file.exists():
        return None

    try:
        self._liltsome_data = await asyncio.to_thread(self._read_liltsome_json, cache_file)
    except Exception as e:
        self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
        return None
    return self._liltsome_data
@staticmethod
def _read_liltsome_json(path: Path) -> Dict:
"""Read and parse the Liltsome JSON file (blocking, run in thread)."""
with open(path, 'r', encoding='utf-8') as f:
return json.load(f)
async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
    """Look up an artist's audio entries in the Liltsome library.

    The match is case-insensitive against the artist's ``id`` or ``name``.
    Expected layout: ``{"artists": [{"id": ..., "files": {"audio": [...]}}]}``;
    a bare top-level list of artists is accepted as well.

    Returns:
        The artist's ``files.audio`` list (``[]`` when ``files`` is not a
        dict or has no audio), or ``None`` when the library is unavailable
        or no artist matches.
    """
    await self._ensure_liltsome_cache()
    library = await self._load_liltsome_data()
    if not library:
        return None

    wanted = username.lower()
    artists = library.get('artists', []) if isinstance(library, dict) else library
    for artist in artists:
        identifiers = (
            str(artist.get('id', '')).lower(),
            str(artist.get('name', '')).lower(),
        )
        if wanted not in identifiers:
            continue
        # First matching artist wins; audio entries live under files.audio
        files = artist.get('files', {})
        return files.get('audio', []) if isinstance(files, dict) else []
    return None
async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Convert Liltsome archive entries to Post objects.

    Each entry becomes one Post with a single audio Attachment. The
    ``post_id`` is ``liltsome-<sanitized filename>`` (or ``liltsome-<path>``
    when the entry has no filename); entries whose id is in ``seen_ids``
    are skipped, as are entries that have neither a filename nor a path.

    Compared with a naive suffix check, the attachment name never gets a
    duplicated extension (``Foo.M4A`` stays ``Foo.M4A``), and entries that
    only carry a path are named after the path's last segment. JSON ``null``
    for ``title`` or ``tags`` is tolerated.
    """
    entries = await self._get_liltsome_entries(username)
    if not entries:
        return []

    posts: List[Post] = []
    for entry in entries:
        filename = entry.get('filename') or ''
        path = entry.get('path') or ''
        if not filename and not path:
            # Nothing to identify or download this entry by.
            continue
        title_raw = entry.get('title') or filename
        entry_tags = entry.get('tags') or []  # already lowercase in Liltsome
        file_size = entry.get('size')
        metadata = entry.get('metadata')
        duration = metadata.get('duration') if isinstance(metadata, dict) else None

        # Build post_id: prefix with liltsome- to avoid collision.
        # The derivation is kept exactly as before so stored ids still match.
        sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
        post_id = f'liltsome-{sanitized_name}'
        if post_id in seen_ids:
            continue

        # Parse bracket tags from title for clean_title
        clean_title, title_tags = parse_bracket_tags(title_raw)

        # Merge: Liltsome's pre-parsed tags first, then any extra from the title
        all_tags: List[str] = []
        all_tags_set: Set[str] = set()
        for t in entry_tags:
            t_lower = t.strip().lower() if isinstance(t, str) else ''
            if t_lower and t_lower not in all_tags_set:
                all_tags_set.add(t_lower)
                all_tags.append(t_lower)
        for t in title_tags:
            if t not in all_tags_set:
                all_tags_set.add(t)
                all_tags.append(t)

        # Build download URL
        download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None

        # File name used for the extension and the attachment name: the
        # entry's filename, else the last segment of its path.
        base_name = filename or path.rsplit('/', 1)[-1]
        ext = 'm4a'
        if '.' in base_name:
            ext = base_name.rsplit('.', 1)[1].lower()

        if base_name.lower().endswith(f'.{ext}'):
            attachment_name = base_name
        else:
            safe_base = re.sub(r'[^a-zA-Z0-9_.-]', '_', base_name)
            attachment_name = f"{safe_base}.{ext}"

        attachment = Attachment(
            name=attachment_name,
            file_type='audio',
            extension=ext,
            server_path=path or filename,
            download_url=download_url,
            file_size=file_size,
            duration=duration,
        )
        posts.append(Post(
            post_id=post_id,
            service_id='soundgasm',
            platform='soundgasm',
            creator_id=username,
            title=clean_title or None,
            content=None,
            published_at=None,
            attachments=[attachment],
            auto_tags=all_tags,
        ))
    return posts

View File

@@ -0,0 +1,827 @@
"""
TikTok Client for Paid Content - Uses yt-dlp for listing and gallery-dl for downloading
Adapts the hybrid approach from modules/tiktok_module.py into the paid content client pattern.
"""
import asyncio
import html as html_module
import json
import os
import re
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import aiohttp
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
# Wraps the external yt-dlp and gallery-dl executables; cookies stored in the
# database are exported to a temporary Netscape-format file for them
# (see _get_cookies_file / _save_cookies_back).
class TikTokClient(LoggingMixin):
"""
Client for fetching TikTok creator information and videos.
Uses yt-dlp for listing (fast flat-playlist) and gallery-dl for downloading
(handles carousels/slideshows properly).
"""
# Platform identifiers; presumably stamped onto Creator/Post models like the
# sibling clients do — their use is not visible in this part of the file.
SERVICE_ID = 'tiktok'
PLATFORM = 'tiktok'
def __init__(self, unified_db=None, log_callback=None):
    """Locate the external tools and prepare empty cookie/pinned-post state.

    A warning is logged for each of yt-dlp and gallery-dl that cannot be
    found; construction itself never fails because of a missing tool.
    """
    self._init_logger('PaidContent', log_callback, default_module='TikTok')
    self.ytdlp_path = self._find_executable('yt-dlp')
    self.gallery_dl_path = self._find_executable('gallery-dl')
    self.unified_db = unified_db
    self._cookies_file = None
    self._last_pinned_posts = {}

    missing_tool_warnings = (
        (self.ytdlp_path, "yt-dlp not found, TikTok listing will be disabled"),
        (self.gallery_dl_path, "gallery-dl not found, TikTok downloading will be disabled"),
    )
    for tool_path, warning in missing_tool_warnings:
        if not tool_path:
            self.log(warning, 'warning')
def _find_executable(self, name: str) -> Optional[str]:
    """Find an executable by name.

    A fixed list of install locations is checked first (the application's
    virtualenv, system directories, Homebrew, ``~/.local/bin``); if none of
    them holds an executable file, ``PATH`` is searched with
    ``shutil.which``, which needs no external ``which`` binary.

    Returns:
        Path of the executable, or ``None`` when it cannot be found.
    """
    import shutil

    common_paths = [
        f'/opt/media-downloader/venv/bin/{name}',
        f'/usr/local/bin/{name}',
        f'/usr/bin/{name}',
        f'/opt/homebrew/bin/{name}',
        os.path.expanduser(f'~/.local/bin/{name}'),
    ]
    for path in common_paths:
        if os.path.isfile(path) and os.access(path, os.X_OK):
            return path
    return shutil.which(name)
def is_available(self) -> bool:
    """Report whether both external tools (yt-dlp and gallery-dl) were located."""
    tool_paths = (self.ytdlp_path, self.gallery_dl_path)
    return all(tool_path is not None for tool_path in tool_paths)
def cleanup(self):
    """Delete the temporary cookies file, if one exists.

    Best effort: a failure to delete is ignored. The stored path itself is
    left unchanged.
    """
    cookies_path = self._cookies_file
    if not cookies_path or not os.path.exists(cookies_path):
        return
    try:
        os.unlink(cookies_path)
    except Exception:
        pass
def _get_cookies_file(self) -> Optional[str]:
    """Return the path of a Netscape-format cookies file, creating it if needed.

    An existing temp file is reused. Otherwise the ``scrapers`` table is
    checked for ids ``tiktok`` then ``tiktok_client``; the first one with a
    non-empty cookie list is exported. The stored JSON may be a list of
    cookies or a dict with a ``cookies`` list.

    The file content is assembled in memory first and ``self._cookies_file``
    is only set after the file has been written completely, so a malformed
    cookie can no longer leave a half-written file registered as the cache.
    Missing or ``null`` cookie fields fall back to defaults (expiry ``0``).

    Returns:
        Path to the cookies file, or ``None`` when there is no database, no
        stored cookies, or an error occurred (logged at debug level).
    """
    if self._cookies_file and os.path.exists(self._cookies_file):
        return self._cookies_file
    if not self.unified_db:
        return None

    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            # Check for tiktok scraper cookies
            for scraper_id in ('tiktok', 'tiktok_client'):
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                row = cursor.fetchone()
                if not (row and row[0]):
                    continue

                data = json.loads(row[0])
                if isinstance(data, dict) and 'cookies' in data:
                    cookies_list = data['cookies']
                elif isinstance(data, list):
                    cookies_list = data
                else:
                    cookies_list = []
                if not cookies_list:
                    continue

                lines = ["# Netscape HTTP Cookie File\n"]
                for cookie in cookies_list:
                    if not isinstance(cookie, dict):
                        continue
                    domain = cookie.get('domain') or ''
                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                    path = cookie.get('path') or '/'
                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                    try:
                        expiry = str(int(cookie.get('expirationDate') or 0))
                    except (TypeError, ValueError):
                        # No usable expiry: written as 0, like a missing one.
                        expiry = '0'
                    name = cookie.get('name') or ''
                    value = cookie.get('value') or ''
                    lines.append(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")

                import tempfile
                fd, cookies_path = tempfile.mkstemp(suffix='.txt', prefix='tiktok_cookies_')
                try:
                    with os.fdopen(fd, 'w') as f:
                        f.writelines(lines)
                except Exception:
                    # Do not leave a partial cookies file behind.
                    try:
                        os.unlink(cookies_path)
                    except OSError:
                        pass
                    raise

                self._cookies_file = cookies_path
                self.log(f"Loaded {len(cookies_list)} TikTok cookies", 'debug')
                return self._cookies_file
    except Exception as e:
        self.log(f"Could not load TikTok cookies: {e}", 'debug')
    return None
def _save_cookies_back(self):
    """Persist cookies from the temp cookies file back to the database.

    yt-dlp and gallery-dl update the cookies file with refreshed tokens
    from TikTok (e.g. msToken), so those changes need to be stored.

    Cookies read from the file override stored ``tiktok`` cookies with the
    same name and domain; stored cookies that are not in the file are kept.
    After a successful save the temp file is deleted and the cached path is
    cleared, so the next use regenerates it from the database and no
    orphaned cookie file stays on disk. Does nothing when there is no temp
    file, no database, or the file contains no cookies. Errors are logged
    at debug level.
    """
    if not self._cookies_file or not os.path.exists(self._cookies_file):
        return
    if not self.unified_db:
        return

    try:
        import http.cookiejar

        jar = http.cookiejar.MozillaCookieJar(self._cookies_file)
        jar.load(ignore_discard=True, ignore_expires=True)

        updated_cookies = [
            {
                'name': cookie.name,
                'value': cookie.value,
                'domain': cookie.domain,
                'path': cookie.path,
                'secure': cookie.secure,
                'expirationDate': cookie.expires or 0,
            }
            for cookie in jar
        ]
        if not updated_cookies:
            return

        # Merge updated cookies into what the DB already holds
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('tiktok',))
            row = cursor.fetchone()
            if row and row[0]:
                existing_data = json.loads(row[0])
                existing_cookies = existing_data if isinstance(existing_data, list) else existing_data.get('cookies', [])
                # Merge: updated cookies override existing by name+domain
                cookie_map = {(c.get('name'), c.get('domain')): c for c in existing_cookies}
                for c in updated_cookies:
                    cookie_map[(c['name'], c['domain'])] = c
                final_cookies = list(cookie_map.values())
            else:
                final_cookies = updated_cookies

        # Called after the read connection has been released.
        self.unified_db.save_scraper_cookies('tiktok', final_cookies, merge=False)
        self.log(f"Saved {len(final_cookies)} refreshed cookies back to DB", 'debug')

        # Clear cached file so next use gets fresh cookies from DB, and
        # remove it: once the path is forgotten cleanup() cannot delete it.
        stale_path = self._cookies_file
        self._cookies_file = None
        try:
            os.unlink(stale_path)
        except OSError:
            pass
    except Exception as e:
        self.log(f"Failed to save cookies back: {e}", 'debug')
def _get_base_cmd(self) -> List[str]:
    """Return the yt-dlp argv prefix, with ``--cookies <file>`` when cookies exist."""
    base_cmd = [self.ytdlp_path]
    cookies_file = self._get_cookies_file()
    if not cookies_file:
        return base_cmd
    return base_cmd + ['--cookies', cookies_file]
@staticmethod
def extract_username(url: str) -> Optional[str]:
"""Extract username from TikTok URL"""
match = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
if match:
return match.group(1)
return None
@staticmethod
def normalize_creator_url(username: str) -> str:
"""Convert username to a consistent URL format"""
if username.startswith('http://') or username.startswith('https://'):
return username
username = username.lstrip('@')
return f"https://www.tiktok.com/@{username}"
# Returns the resolved channel_id string, or None when yt-dlp is unavailable, no
# video URL could be found for the user, yt-dlp fails, or any exception occurs.
# NOTE(review): the docstring mentions "embed/RSS", but the code below only
# consults the oembed endpoint and, as a fallback, the local database.
async def _resolve_channel_id(self, username: str) -> Optional[str]:
"""Resolve a TikTok username to a channel_id (secUid).
When yt-dlp can't extract the secondary user ID from the profile page,
we try to find a video URL from TikTok's embed/RSS and then extract
the channel_id (secUid) from that video's metadata via yt-dlp.
"""
# Step 2 needs yt-dlp, so there is nothing to do without it.
if not self.ytdlp_path:
return None
try:
# Step 1: Get a video URL from this user via the oembed embed HTML
video_url = None
async with aiohttp.ClientSession() as session:
# The oembed HTML often contains a video ID we can use
oembed_url = f"https://www.tiktok.com/oembed?url=https://www.tiktok.com/@{username}"
async with session.get(oembed_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
# Any status other than 200 leaves video_url as None.
if resp.status == 200:
data = await resp.json()
embed_html = data.get('html', '')
# Extract video URL from embed iframe
# Preferred: a full video URL inside a cite="..." attribute.
match = re.search(r'cite="(https://www\.tiktok\.com/@[^"]+/video/\d+)"', embed_html)
if not match:
# Otherwise: a bare data-video-id, from which the URL is rebuilt.
match = re.search(r'data-video-id="(\d+)"', embed_html)
if match:
video_url = f"https://www.tiktok.com/@{username}/video/{match.group(1)}"
else:
video_url = match.group(1)
if not video_url:
# oembed thumbnail_url sometimes contains the video ID
thumb = data.get('thumbnail_url', '')
vid_match = re.search(r'/video/(\d+)', thumb)
if vid_match:
video_url = f"https://www.tiktok.com/@{username}/video/{vid_match.group(1)}"
if not video_url:
# Step 1b: Check if we have any existing video URLs in the database
if self.unified_db:
# Best-effort lookup: a database error is ignored and treated as "no URL found".
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT a.download_url FROM paid_content_attachments a
JOIN paid_content_posts p ON a.post_id = p.id
JOIN paid_content_creators c ON p.creator_id = c.id
WHERE c.username = ? AND a.download_url LIKE '%tiktok.com%'
LIMIT 1
""", (username,))
row = cursor.fetchone()
if row and row[0]:
video_url = row[0]
except Exception:
pass
if not video_url:
self.log(f"No video URL found for @{username} to resolve channel_id", 'debug')
return None
# Step 2: Use yt-dlp to get the channel_id from the single video
self.log(f"Resolving channel_id from video: {video_url}", 'debug')
# Base command (including --cookies when available) plus flags requesting
# JSON metadata output without downloading the video.
cmd = self._get_base_cmd() + [
'-j',
'--no-warnings',
'--no-download',
'--socket-timeout', '30',
video_url
]
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
# stderr is captured but not inspected; a non-zero exit falls through to None.
if result.returncode == 0:
# Each non-empty stdout line is tried as a JSON document; the first one that
# yields an id wins.
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
if not line.strip():
continue
try:
video_data = json.loads(line)
# playlist_id is used when channel_id is absent or empty.
channel_id = video_data.get('channel_id') or video_data.get('playlist_id')
if channel_id:
# Only the first 30 characters of the id are written to the log.
self.log(f"Resolved @{username} channel_id: {channel_id[:30]}...", 'info')
return channel_id
except json.JSONDecodeError:
continue
# Any failure above (HTTP, JSON decoding of the oembed reply, subprocess start)
# is logged at debug level and reported to the caller as None.
except Exception as e:
self.log(f"Failed to resolve channel_id for @{username}: {e}", 'debug')
return None
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """Get creator information using yt-dlp plus profile page scraping.

    The display name is taken from the first playlist entry yt-dlp reports
    for the profile. When yt-dlp exits non-zero with a "secondary user ID" /
    "Unable to extract" error, the lookup is retried once through the
    ``tiktokuser:<channel_id>`` scheme. Avatar and bio come from
    ``_scrape_profile_page``; the scraped display name is only used when the
    name is still equal to the username at that point.

    Args:
        url: TikTok profile URL (anything ``extract_username`` accepts).

    Returns:
        Dict with ``creator_id``, ``creator_name``, ``creator_url``,
        ``profile_image_url`` and ``bio``, or ``None`` when no username can
        be extracted from *url*.
    """
    username = self.extract_username(url)
    if not username:
        return None

    profile_url = self.normalize_creator_url(username)
    creator_name = username

    async def _probe(target: str):
        """Ask yt-dlp for the first playlist entry of *target*.

        Returns ``(returncode, display_name_or_None, stderr_text)``.
        """
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--playlist-items', '1',
            '--socket-timeout', '30',
            target,
        ]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await proc.communicate()
        name = None
        if proc.returncode == 0:
            for line in out.decode('utf-8', errors='replace').strip().split('\n'):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A bare number/string/list is valid JSON but not an entry;
                # skip it instead of letting .get() raise and abort the lookup.
                if not isinstance(data, dict):
                    continue
                name = (data.get('channel') or data.get('uploader')
                        or data.get('playlist_title') or username)
                break
        return proc.returncode, name, err.decode('utf-8', errors='replace')

    # Try yt-dlp for display name from video metadata
    if self.ytdlp_path:
        try:
            returncode, name, err_text = await _probe(profile_url)
            if returncode != 0 and (
                'secondary user ID' in err_text or 'Unable to extract' in err_text
            ):
                # Fallback: try tiktokuser: scheme if secondary user ID extraction fails
                channel_id = await self._resolve_channel_id(username)
                if channel_id:
                    _, name, _ = await _probe(f"tiktokuser:{channel_id}")
            if name:
                creator_name = name
        except Exception as e:
            self.log(f"Failed to get creator info via yt-dlp: {e}", 'debug')

    # Scrape profile page for avatar and bio
    profile_image = None
    bio = None
    try:
        profile_image, bio, page_name = await self._scrape_profile_page(profile_url)
        if page_name and creator_name == username:
            creator_name = page_name
    except Exception as e:
        self.log(f"Failed to scrape profile page: {e}", 'debug')

    return {
        'creator_id': username,
        'creator_name': creator_name,
        'creator_url': profile_url,
        'profile_image_url': profile_image,
        'bio': bio,
    }
async def _fetch_profile_with_cookies(self, url: str) -> Optional[str]:
"""Fetch TikTok profile page using curl_cffi with cookies from database."""
cookies_file = self._get_cookies_file()
if not cookies_file:
return None
try:
from curl_cffi import requests as cf_requests
import http.cookiejar
# Load cookies from the Netscape file
jar = http.cookiejar.MozillaCookieJar(cookies_file)
jar.load(ignore_discard=True, ignore_expires=True)
# Try multiple browser versions for curl_cffi compatibility
for _browser in ("chrome136", "chrome131", "chrome"):
try:
session = cf_requests.Session(impersonate=_browser)
break
except Exception:
continue
else:
session = cf_requests.Session()
for cookie in jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain)
resp = session.get(url, timeout=15)
if resp.status_code == 200 and 'avatarLarger' in resp.text:
self.log("Fetched TikTok profile with cookies (curl_cffi)", 'debug')
return resp.text
elif 'captcha' in resp.text.lower():
self.log("TikTok profile still returned captcha with cookies", 'debug')
session.close()
except Exception as e:
self.log(f"curl_cffi profile fetch failed: {e}", 'debug')
return None
async def _scrape_profile_page(self, url: str) -> tuple:
    """
    Scrape TikTok profile page for avatar and bio from embedded JSON data.

    The page is fetched with aiohttp; when that yields nothing, or a captcha
    page without ``avatarLarger``, ``_fetch_profile_with_cookies`` is tried.
    User data is read from the __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag
    first, then from regex matches on the raw HTML for any field still
    missing. A non-empty ``pinnedList`` in the rehydration data replaces
    ``self._last_pinned_posts``.

    Returns (profile_image_url, bio, display_name); each item is None when
    it could not be found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    profile_image = None
    bio = None
    display_name = None

    def _decode(raw: str) -> str:
        """Decode the body of a JSON string literal; fall back to the raw text."""
        try:
            return json.loads(f'"{raw}"')
        except (json.JSONDecodeError, ValueError):
            return raw

    try:
        page_html = None
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status == 200:
                    page_html = await resp.text()

        # If we got a captcha page, try curl_cffi with cookies
        if not page_html or ('captcha' in page_html.lower() and 'avatarLarger' not in page_html):
            page_html = await self._fetch_profile_with_cookies(url)
        if not page_html:
            return (None, None, None)

        # Try structured JSON first (__UNIVERSAL_DATA_FOR_REHYDRATION__)
        rehydration_match = re.search(
            r'<script[^>]*id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
            page_html, re.DOTALL
        )
        if rehydration_match:
            try:
                rdata = json.loads(rehydration_match.group(1))
                user_detail = (rdata.get('__DEFAULT_SCOPE__', {})
                               .get('webapp.user-detail', {}))
                user = user_detail.get('userInfo', {}).get('user', {})
                if user:
                    avatar_val = user.get('avatarLarger') or user.get('avatarMedium')
                    if avatar_val and not avatar_val.endswith('.mp4'):
                        profile_image = avatar_val
                        self.log("Found TikTok profile avatar (rehydration)", 'debug')
                    sig_val = user.get('signature', '')
                    if sig_val and sig_val.strip():
                        bio = sig_val.strip()
                        self.log("Found TikTok bio (rehydration)", 'debug')
                    nick_val = user.get('nickname')
                    if nick_val:
                        display_name = nick_val
                        self.log(f"Found TikTok display name (rehydration): {display_name}", 'debug')
                # Extract pinned post IDs
                # NOTE(review): _last_pinned_posts is only replaced when the
                # list is non-empty, so an earlier value survives otherwise.
                pinned_list = user_detail.get('pinnedList', [])
                if pinned_list:
                    self._last_pinned_posts = {}
                    for item in pinned_list:
                        vid = str(item.get('id', ''))
                        if vid:
                            self._last_pinned_posts[vid] = {'pinned_at': None}
                    if self._last_pinned_posts:
                        self.log(f"Found {len(self._last_pinned_posts)} pinned TikTok posts", 'debug')
            except (json.JSONDecodeError, KeyError, AttributeError, TypeError):
                # An unexpected shape (null / list where an object is
                # expected) must not skip the regex fallback below.
                pass

        # Fallback: regex extraction from raw HTML
        # Use json.loads to decode values (handles \uXXXX, surrogate pairs, and raw UTF-8)
        # The value patterns accept backslash escapes, so an escaped quote
        # inside the value no longer truncates the match.
        if not profile_image:
            avatar_match = re.search(r'"avatarLarger":"((?:[^"\\]|\\.)+)"', page_html)
            if not avatar_match:
                avatar_match = re.search(r'"avatarMedium":"((?:[^"\\]|\\.)+)"', page_html)
            if avatar_match:
                avatar_url = _decode(avatar_match.group(1))
                if avatar_url and not avatar_url.endswith('.mp4'):
                    profile_image = avatar_url
                    self.log("Found TikTok profile avatar", 'debug')
        if not bio:
            sig_match = re.search(r'"signature":"((?:[^"\\]|\\.)*)"', page_html)
            if sig_match:
                raw_bio = _decode(sig_match.group(1))
                if raw_bio and raw_bio.strip():
                    bio = raw_bio.strip()
                    self.log("Found TikTok bio", 'debug')
        if not display_name:
            nick_match = re.search(r'"nickname":"((?:[^"\\]|\\.)+)"', page_html)
            if nick_match:
                display_name = _decode(nick_match.group(1))
                self.log(f"Found TikTok display name: {display_name}", 'debug')
        # Extract banner/cover from "coverLarger" field
        # (stored separately, not returned here but could be used later)
    except asyncio.TimeoutError:
        self.log("TikTok profile page request timed out", 'debug')
    except Exception as e:
        self.log(f"Error scraping TikTok profile: {e}", 'debug')
    return (profile_image, bio, display_name)
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """
    Get all videos from a TikTok profile using yt-dlp --flat-playlist -j.

    Uses JSON output to properly handle multi-line descriptions/titles.
    When yt-dlp cannot extract the user ID from the profile URL, the listing
    is retried through ``tiktokuser:<channel_id>``.

    Args:
        url: TikTok profile URL.
        since_date: Optional ISO date/datetime string; entries whose
            ``upload_date`` is earlier than that day are skipped.
        max_videos: Optional cap on the number of returned entries.
        progress_callback: Optional callable invoked with the running count
            after each accepted entry.

    Returns:
        List of dicts with ``video_id``, ``title``, ``description``,
        ``upload_date`` (``YYYY-MM-DD`` or None), ``url`` and ``username``.
        Empty when yt-dlp is unavailable, the username cannot be extracted,
        or listing fails.
    """
    if not self.ytdlp_path:
        return []
    username = self.extract_username(url)
    if not username:
        return []
    profile_url = self.normalize_creator_url(username)

    async def _list_playlist(target: str):
        """Run yt-dlp flat-playlist JSON listing; return (returncode, stdout, stderr)."""
        cmd = self._get_base_cmd() + [
            '--flat-playlist',
            '-j',
            '--no-warnings',
            '--socket-timeout', '30',
            target,
        ]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await proc.communicate()
        return proc.returncode, out, err

    try:
        self.log(f"Fetching TikTok videos for @{username}", 'info')
        returncode, stdout, stderr = await _list_playlist(profile_url)
        if returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            # Fallback: if yt-dlp can't extract secondary user ID, try tiktokuser: scheme
            if 'secondary user ID' in error or 'Unable to extract' in error:
                self.log(f"yt-dlp can't extract user ID for @{username}, trying channel_id fallback", 'info')
                channel_id = await self._resolve_channel_id(username)
                if not channel_id:
                    self.log(f"Could not resolve channel_id for @{username}", 'warning')
                    return []
                returncode, stdout, stderr = await _list_playlist(f"tiktokuser:{channel_id}")
                if returncode != 0:
                    fb_error = stderr.decode('utf-8', errors='replace')
                    self.log(f"Fallback also failed for @{username}: {fb_error}", 'warning')
                    return []
                self.log(f"Fallback tiktokuser: succeeded for @{username}", 'info')
            else:
                self.log(f"Failed to list TikTok videos: {error}", 'warning')
                return []

        lines = stdout.decode('utf-8', errors='replace').strip().split('\n')

        # Parse since_date for filtering (compared as YYYYMMDD strings)
        cutoff_str = None
        if since_date:
            try:
                if 'T' in since_date:
                    # Drop a UTC designator so the value parses as a naive datetime
                    cutoff_dt = datetime.fromisoformat(
                        since_date.replace('Z', '').replace('+00:00', '')
                    )
                else:
                    cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
                cutoff_str = cutoff_dt.strftime('%Y%m%d')
            except (ValueError, IndexError):
                pass

        videos = []
        for line in lines:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object cannot be a video entry; skip
            # it rather than letting .get() raise and discard every video.
            if not isinstance(data, dict):
                continue
            video_id = str(data.get('id', ''))
            if not video_id:
                continue
            upload_date = data.get('upload_date', '')
            title = data.get('title', '')
            description = data.get('description', '')
            # Skip posts where yt-dlp returned no metadata at all
            # When cookies are expired, yt-dlp returns no date, no title,
            # and no description. Real posts with empty captions still have
            # upload_date, so we use that as the key signal.
            if not upload_date and not title and not description:
                self.log(f"Skipping TikTok {video_id}: no metadata (cookies may be expired)", 'debug')
                continue
            title = title or description or f"TikTok video #{video_id}"
            description = description or title
            # Filter by date if cutoff specified
            if cutoff_str and upload_date and upload_date < cutoff_str:
                continue
            # Format upload_date to ISO
            formatted_date = None
            if upload_date and len(upload_date) == 8 and upload_date.isdigit():
                formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
            video_url = data.get('url') or f"https://www.tiktok.com/@{username}/video/{video_id}"
            videos.append({
                'video_id': video_id,
                'title': title,
                'description': description,
                'upload_date': formatted_date,
                'url': video_url,
                'username': username,
            })
            if progress_callback:
                progress_callback(len(videos))
            if max_videos and len(videos) >= max_videos:
                break

        self.log(f"Found {len(videos)} TikTok videos for @{username}", 'info')
        self._save_cookies_back()
        return videos
    except Exception as e:
        self.log(f"Error getting TikTok videos: {e}", 'error')
        self._save_cookies_back()
        return []
async def download_video(self, video_url: str, output_dir: Path, username: str = '') -> Dict:
    """
    Fetch a single TikTok post (video or photo carousel) through gallery-dl.

    Files already present in *output_dir* before the run are ignored, as are
    ``.json`` metadata files and audio-only files produced by the run.

    Returns a dict with ``success``; on success also ``file_path``,
    ``filename``, ``file_size`` (all for the first file by name),
    ``all_files``, ``file_count`` and ``is_carousel``; on failure ``error``.
    """
    if not self.gallery_dl_path:
        return {'success': False, 'error': 'gallery-dl not available'}
    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        argv = [
            self.gallery_dl_path,
            '--write-metadata',
            '-D', str(target_dir),
            '-f', '{id}_{num}.{extension}',
        ]
        # Cookies unlock age-restricted / login-required posts
        cookie_jar = self._get_cookies_file()
        if cookie_jar:
            argv += ['--cookies', cookie_jar]
        argv.append(video_url)
        self.log(f"Downloading TikTok: {video_url}", 'debug')

        # Remember what was there before, so only new files are reported
        before = {entry.name for entry in target_dir.iterdir() if entry.is_file()}

        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        _, err_bytes = await proc.communicate()

        # Metadata sidecars and audio-only tracks are not media results
        ignored_suffixes = ('.json', '.mp3', '.m4a', '.aac', '.wav', '.ogg')
        fresh = [
            entry for entry in target_dir.iterdir()
            if entry.is_file()
            and entry.name not in before
            and entry.suffix.lower() not in ignored_suffixes
        ]

        if proc.returncode != 0:
            if not fresh:
                message = err_bytes.decode('utf-8', errors='replace').strip()
                if 'not available' in message.lower() or '404' in message:
                    message = 'Video not available (deleted or private)'
                elif len(message) > 200:
                    message = message[:200] + '...'
                return {'success': False, 'error': message}
            # gallery-dl exit code 4 = partial failure (e.g. slideshow images
            # OK but audio failed); media on disk counts as success
            self.log(f"gallery-dl partial failure (code {proc.returncode}) but {len(fresh)} files downloaded", 'debug')

        if not fresh:
            return {'success': False, 'error': 'No files downloaded'}

        # Name order keeps carousel order (e.g. id_1.jpg, id_2.jpg)
        fresh.sort(key=lambda entry: entry.name)
        first = fresh[0]

        # A carousel is several files that are all images
        image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
        only_images = all(entry.suffix.lower() in image_exts for entry in fresh)
        is_carousel = len(fresh) > 1 and only_images

        self._save_cookies_back()
        return {
            'success': True,
            'file_path': str(first),
            'filename': first.name,
            'file_size': first.stat().st_size,
            'all_files': [str(entry) for entry in fresh],
            'file_count': len(fresh),
            'is_carousel': is_carousel,
        }
    except Exception as e:
        self.log(f"Error downloading TikTok video: {e}", 'error')
        self._save_cookies_back()
        return {'success': False, 'error': str(e)}
async def get_creator(self, url: str) -> Optional[Creator]:
    """Build a Creator model for the TikTok profile at *url*.

    Returns None when ``get_creator_info`` yields nothing for the URL.
    """
    details = await self.get_creator_info(url)
    if not details:
        return None

    handle = details.get('creator_id', '')
    fields = {
        'creator_id': handle,
        'service_id': 'tiktok',
        'platform': 'tiktok',
        # Display name doubles as username; the handle is only the fallback
        'username': details.get('creator_name', handle),
        'display_name': details.get('creator_name'),
        'profile_image_url': details.get('profile_image_url'),
        'bio': details.get('bio'),
    }
    return Creator(**fields)
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """Return the profile's TikTok videos wrapped as Post objects.

    Arguments are forwarded unchanged to ``get_creator_videos``.
    """
    entries = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
    creator_handle = self.extract_username(url) or ''

    def _as_post(entry: Dict) -> Post:
        # A post may turn out to be a video or a carousel; a single video
        # attachment is recorded here and the download decides the real type.
        media = Attachment(
            name=f"{entry['video_id']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=entry['url'],
            download_url=entry['url'],
        )
        return Post(
            post_id=entry['video_id'],
            service_id='tiktok',
            platform='tiktok',
            creator_id=creator_handle,
            title=None,
            content=entry.get('description') or entry.get('title', ''),
            published_at=entry.get('upload_date'),
            attachments=[media],
        )

    return [_as_post(entry) for entry in entries]

View File

@@ -0,0 +1,751 @@
"""
Twitch Clips Client - Fetches channel clips using yt-dlp
"""
import aiohttp
import asyncio
import hashlib
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class TwitchThumbnailCache:
    """Cache for Twitch clip thumbnails.

    Thumbnails are stored in ``cache_dir`` under the MD5 hex digest of their
    URL plus an extension guessed from the URL text.
    """

    def __init__(self, cache_dir: str = None):
        """Create the cache directory if needed (default path when *cache_dir* is falsy)."""
        self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, thumbnail_url: str) -> Path:
        """Get local cache path for a thumbnail URL"""
        # Create a hash of the URL for the filename
        url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest()
        # Extract extension from URL or default to jpg
        lowered = thumbnail_url.lower()
        ext = '.jpg'
        if '.png' in lowered:
            ext = '.png'
        elif '.webp' in lowered:
            ext = '.webp'
        return self.cache_dir / f"{url_hash}{ext}"

    def get_cached(self, thumbnail_url: str) -> Optional[str]:
        """Get cached thumbnail path if it exists"""
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        return None

    async def cache_thumbnail(self, thumbnail_url: str,
                              session: Optional["aiohttp.ClientSession"] = None) -> Optional[str]:
        """Download and cache a thumbnail, return local path.

        Args:
            thumbnail_url: Remote thumbnail URL; falsy values return None.
            session: Optional shared aiohttp session. When omitted a session
                is created for this call and closed afterwards.

        Returns:
            Local file path as a string, or None when the download fails,
            the response is not HTTP 200, or the body is empty.
        """
        if not thumbnail_url:
            return None
        # Check if already cached
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        # Download thumbnail (best effort: every failure maps to None)
        try:
            close_session = session is None
            if close_session:
                session = aiohttp.ClientSession()
            try:
                async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                    if resp.status == 200:
                        content = await resp.read()
                        if content:
                            # Write to a sibling temp file and rename it into
                            # place, so an interrupted write can never leave a
                            # truncated file that later counts as cached.
                            tmp_path = cache_path.with_name(
                                f"{cache_path.name}.{os.getpid()}.part"
                            )
                            try:
                                tmp_path.write_bytes(content)
                                tmp_path.replace(cache_path)
                            except OSError:
                                try:
                                    tmp_path.unlink()
                                except OSError:
                                    pass
                                raise
                            return str(cache_path)
            finally:
                if close_session:
                    await session.close()
        except Exception:
            pass
        return None

    async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]:
        """Cache multiple thumbnails in parallel, return url->local_path mapping.

        Falsy URLs are ignored and each distinct URL is fetched at most once.
        URLs that could not be cached are absent from the result.
        """
        result = {}
        # Filter out already cached (dict.fromkeys drops duplicates, keeps order)
        to_download = []
        for url in dict.fromkeys(thumbnail_urls):
            if not url:
                continue
            cached = self.get_cached(url)
            if cached:
                result[url] = cached
            else:
                to_download.append(url)
        if not to_download:
            return result

        # Download with at most max_concurrent requests in flight
        async with aiohttp.ClientSession() as session:
            semaphore = asyncio.Semaphore(max_concurrent)

            async def download_one(url: str):
                async with semaphore:
                    path = await self.cache_thumbnail(url, session)
                    if path:
                        result[url] = path

            await asyncio.gather(*[download_one(url) for url in to_download])
        return result
class TwitchClient(LoggingMixin):
"""
Client for fetching Twitch channel clips using yt-dlp
Supports:
- Channel clips URLs (twitch.tv/username/clips)
- Fetching channel metadata
- Listing all clips from a channel
- Downloading clips
"""
# Quality presets for yt-dlp
QUALITY_PRESETS = {
'best': 'best',
'1080p': 'best[height<=1080]',
'720p': 'best[height<=720]',
'480p': 'best[height<=480]',
}
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None, cache_dir: str = None):
    """Set up logging, locate yt-dlp, and create the thumbnail cache.

    Args:
        ytdlp_path: Explicit yt-dlp executable; when falsy ``_find_ytdlp``
            is consulted instead.
        unified_db: Database handle kept for later cookie lookups.
        log_callback: Passed through to ``_init_logger``.
        cache_dir: Passed through to ``TwitchThumbnailCache``.
    """
    self._init_logger('PaidContent', log_callback, default_module='Twitch')

    # Resolve the yt-dlp executable; Twitch support is off without it
    resolved = ytdlp_path if ytdlp_path else self._find_ytdlp()
    self.ytdlp_path = resolved
    if not resolved:
        self.log("yt-dlp not found, Twitch support will be disabled", 'warning')

    # Database reference used for cookie access; the temp cookie file is
    # created on demand
    self.unified_db = unified_db
    self._cookies_file = None

    # Local thumbnail cache
    self.thumbnail_cache = TwitchThumbnailCache(cache_dir)
def _find_ytdlp(self) -> Optional[str]:
"""Find yt-dlp executable"""
common_paths = [
'/opt/media-downloader/venv/bin/yt-dlp', # Prefer venv version (kept up to date)
'/usr/local/bin/yt-dlp',
'/usr/bin/yt-dlp',
'/opt/homebrew/bin/yt-dlp',
os.path.expanduser('~/.local/bin/yt-dlp'),
]
for path in common_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
try:
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def is_available(self) -> bool:
    """Report whether a yt-dlp executable path has been set."""
    executable = self.ytdlp_path
    return executable is not None
def _get_cookies_file(self) -> Optional[str]:
"""Get path to cookies file, creating it from database if needed"""
if self._cookies_file and os.path.exists(self._cookies_file):
return self._cookies_file
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
# Try twitch-specific cookies first, then fall back to ytdlp
for scraper_id in ['twitch', 'ytdlp']:
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
# Support both {"cookies": [...]} and [...] formats
if isinstance(data, dict) and 'cookies' in data:
cookies_list = data['cookies']
elif isinstance(data, list):
cookies_list = data
else:
cookies_list = []
if cookies_list:
# Write cookies to temp file in Netscape format
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_')
with os.fdopen(fd, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
for cookie in cookies_list:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expiry = str(int(cookie.get('expirationDate', 0)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug')
return self._cookies_file
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _get_base_cmd(self) -> List[str]:
"""Get base yt-dlp command with cookies if available"""
cmd = [self.ytdlp_path]
cookies_file = self._get_cookies_file()
if cookies_file:
cmd.extend(['--cookies', cookies_file])
return cmd
def cleanup(self):
    """Delete the temporary cookies file, if one exists, and forget its path."""
    cookie_path = self._cookies_file
    if not cookie_path or not os.path.exists(cookie_path):
        return
    try:
        os.unlink(cookie_path)
    except Exception:
        # Best effort: a file that cannot be removed is simply left behind
        pass
    self._cookies_file = None
@staticmethod
def extract_channel_name(url: str) -> Optional[str]:
    """
    Pull the lower-cased channel name out of a Twitch URL.

    Recognised shapes:
    - twitch.tv/username
    - twitch.tv/username/clips
    - m.twitch.tv/username/clips

    Returns None when the URL does not match.
    """
    found = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?', url)
    if found is None:
        return None
    return found.group(1).lower()
@staticmethod
def normalize_clips_url(channel_name: str) -> str:
    """Return the channel's clips page URL with the all-time range filter applied."""
    clips_url = f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all"
    return clips_url
async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]:
    """
    Get channel information and optionally count all clips.

    Args:
        channel_url: Twitch channel URL.
        count_clips: When True, a second yt-dlp run lists every clip ID to
            obtain ``clip_count`` (slow for channels with many clips).

    Returns:
        Dict with ``channel_id``, ``channel_name``, ``channel_url``,
        ``clips_url``, ``thumbnail`` (from the first clip) and
        ``clip_count``; None when yt-dlp is unavailable, the channel name
        cannot be extracted, yt-dlp fails, or no clip entry is returned.
    """
    if not self.is_available():
        return None
    channel_name = self.extract_channel_name(channel_url)
    if not channel_name:
        return None
    try:
        clips_url = self.normalize_clips_url(channel_name)
        # First get basic info from first clip
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--playlist-items', '1',
            clips_url
        ]
        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()
        if result.returncode != 0:
            # Decode leniently, like every other yt-dlp call in this client:
            # undecodable bytes must not turn this warning into an exception.
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get channel info: {error}", 'warning')
            return None

        first_clip_data = None
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line.strip():
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Only a JSON object can be a clip entry
            if isinstance(parsed, dict):
                first_clip_data = parsed
                break
        if not first_clip_data:
            return None

        # Count all clips if requested (this can take a while for channels with many clips)
        clip_count = 0
        if count_clips:
            self.log(f"Counting clips for {channel_name}...", 'debug')
            count_cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '--print', 'id',
                clips_url
            ]
            count_result = await asyncio.create_subprocess_exec(
                *count_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            count_stdout, _ = await count_result.communicate()
            if count_result.returncode == 0:
                id_lines = count_stdout.decode('utf-8', errors='replace').strip().split('\n')
                clip_count = sum(1 for clip_id in id_lines if clip_id)
                self.log(f"Found {clip_count} clips for {channel_name}", 'info')

        return {
            'channel_id': channel_name,
            'channel_name': channel_name,
            'channel_url': f"https://www.twitch.tv/{channel_name}",
            'clips_url': clips_url,
            'thumbnail': first_clip_data.get('thumbnail'),
            'clip_count': clip_count,
        }
    except Exception as e:
        self.log(f"Error getting channel info: {e}", 'error')
        return None
async def get_channel_clips(self, channel_url: str, since_date: str = None,
                            max_clips: int = None, progress_callback=None,
                            cache_thumbnails: bool = True) -> List[Dict]:
    """
    Get all clips from a channel

    Args:
        channel_url: Twitch channel URL
        since_date: Only fetch clips created after this date (ISO format)
        max_clips: Maximum number of clips to fetch
        progress_callback: Callback function(count) for progress updates
        cache_thumbnails: Whether to download and cache thumbnails locally

    Returns:
        List of clip metadata dicts with cached thumbnail paths
        (``thumbnail_cached`` is set only for thumbnails that were cached).
        Empty when yt-dlp is unavailable, the channel name cannot be
        extracted, or yt-dlp fails.
    """
    if not self.is_available():
        return []
    channel_name = self.extract_channel_name(channel_url)
    if not channel_name:
        self.log(f"Could not extract channel name from URL: {channel_url}", 'error')
        return []
    try:
        clips_url = self.normalize_clips_url(channel_name)
        # Use flat-playlist for faster extraction (full metadata available in flat mode for Twitch clips)
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            clips_url
        ]
        # Add date filter at yt-dlp level for efficiency
        if since_date:
            try:
                # Uses the module-level datetime import. A function-local
                # import here made `datetime` a local name, so the parsing
                # further down raised UnboundLocalError whenever since_date
                # was not given.
                # Convert ISO date to YYYYMMDD format for yt-dlp
                date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                dateafter = date_obj.strftime('%Y%m%d')
                cmd.extend(['--dateafter', dateafter])
                self.log(f"Filtering clips after {dateafter}", 'debug')
            except (ValueError, AttributeError):
                pass
        if max_clips:
            cmd.extend(['--playlist-items', f'1:{max_clips}'])

        self.log(f"Fetching clips from channel: {channel_name}", 'info')
        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()
        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get channel clips: {error}", 'warning')
            return []

        clips = []
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Only a JSON object can be a clip entry
            if not isinstance(data, dict):
                continue
            clip_id = data.get('id')
            if not clip_id:
                continue

            # Parse timestamp to ISO format
            timestamp = data.get('timestamp')
            upload_date = data.get('upload_date')
            if timestamp:
                try:
                    upload_date = datetime.fromtimestamp(timestamp).isoformat()
                except (ValueError, OSError, OverflowError, TypeError):
                    pass
            elif upload_date:
                # Convert YYYYMMDD to ISO format
                try:
                    upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                except (ValueError, TypeError):
                    pass

            # Check if clip is newer than since_date.
            # The whole listing is already in memory and its order is not
            # guaranteed to be newest-first, so an old clip is skipped
            # rather than ending the scan (which dropped newer clips after it).
            # NOTE(review): this compares ISO strings; confirm since_date
            # uses the same naive local format as upload_date.
            if since_date and upload_date and upload_date <= since_date:
                self.log(f"Skipping clip from {upload_date} (not newer than since_date)", 'debug')
                continue

            # Extract clip slug from URL
            clip_url = data.get('url') or data.get('webpage_url', '')
            clip_slug = clip_url.split('/')[-1] if clip_url else clip_id
            clips.append({
                'clip_id': clip_id,
                'clip_slug': clip_slug,
                'title': data.get('title', f'Clip {clip_id}'),
                'upload_date': upload_date,
                'timestamp': timestamp,
                'duration': data.get('duration'),
                'view_count': data.get('view_count'),
                'thumbnail': data.get('thumbnail'),
                'url': clip_url,
                'language': data.get('language'),
                'channel_name': channel_name,
            })
            if progress_callback:
                progress_callback(len(clips))
            if max_clips and len(clips) >= max_clips:
                break

        self.log(f"Found {len(clips)} clips", 'info')

        # Cache thumbnails if requested
        if cache_thumbnails and clips:
            thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')]
            if thumbnail_urls:
                self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug')
                cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls)
                # Update clips with cached thumbnail paths
                for clip in clips:
                    thumb_url = clip.get('thumbnail')
                    if thumb_url and thumb_url in cached_paths:
                        clip['thumbnail_cached'] = cached_paths[thumb_url]
                self.log(f"Cached {len(cached_paths)} thumbnails", 'debug')
        return clips
    except Exception as e:
        self.log(f"Error getting channel clips: {e}", 'error')
        return []
async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best',
                        progress_callback=None) -> Dict:
    """
    Download one clip with yt-dlp.

    Args:
        clip_url: Twitch clip URL
        output_dir: Directory the clip is saved into (created if missing)
        quality: Key of QUALITY_PRESETS; unknown keys fall back to 'best'
        progress_callback: Accepted for interface compatibility; not used here

    Returns:
        Dict with ``success`` plus file details, or ``success`` False and
        an ``error`` message
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}
    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # File names keep the (truncated) title and the clip ID
        name_template = str(target_dir / '%(title).100s_%(id)s.%(ext)s')
        selector = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])
        argv = self._get_base_cmd() + [
            '--no-warnings',
            '-f', selector,
            '-o', name_template,
            '--print-json',
            clip_url
        ]
        self.log(f"Downloading clip: {clip_url}", 'debug')
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        out_bytes, err_bytes = await proc.communicate()

        if proc.returncode != 0:
            message = err_bytes.decode('utf-8', errors='replace').strip()
            if len(message) > 200:
                message = message[:200] + '...'
            return {'success': False, 'error': message}

        # The first stdout line that parses as JSON is the clip metadata
        meta = None
        for raw_line in out_bytes.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                meta = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            break

        if meta:
            saved = meta.get('_filename') or meta.get('filename')
            if saved:
                saved = Path(saved)
            if saved and saved.exists():
                size = saved.stat().st_size
            else:
                size = meta.get('filesize')
            return {
                'success': True,
                'file_path': str(saved) if saved else None,
                'filename': saved.name if saved else None,
                'file_size': size,
                'title': meta.get('title'),
                'duration': meta.get('duration'),
                'clip_id': meta.get('id'),
                'upload_date': meta.get('upload_date'),
                'thumbnail': meta.get('thumbnail'),
            }

        # No metadata: fall back to the most recently modified .mp4
        candidates = list(target_dir.glob('*.mp4'))
        if not candidates:
            return {'success': False, 'error': 'Could not find downloaded file'}
        newest = max(candidates, key=lambda f: f.stat().st_mtime)
        return {
            'success': True,
            'file_path': str(newest),
            'filename': newest.name,
            'file_size': newest.stat().st_size
        }
    except Exception as e:
        self.log(f"Error downloading clip: {e}", 'error')
        return {'success': False, 'error': str(e)}
async def get_channel_avatar(self, channel_name: str) -> Optional[str]:
    """
    Return the channel's avatar URL as reported by ``get_channel_profile``.

    Returns None when no profile is available or it carries no avatar.
    """
    profile = await self.get_channel_profile(channel_name)
    if not profile:
        return None
    return profile.get('avatar')
async def get_channel_profile(self, channel_name: str) -> Optional[Dict]:
    """
    Fetch channel profile info using Twitch's GQL API.

    Args:
        channel_name: Twitch login name of the channel.

    Returns:
        Dict containing whichever of ``avatar``, ``banner``,
        ``display_name``, ``bio``, ``joined_date`` and ``external_links``
        (a JSON-encoded list of ``{'title', 'url'}``) were present; None
        when the user is not found, nothing usable is returned, the
        response is not HTTP 200, or the request fails.
    """
    try:
        async with aiohttp.ClientSession() as session:
            headers = {
                'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko',  # Public Twitch web client ID
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            }
            # GQL query for comprehensive user info.
            # json.dumps renders the login as a quoted, escaped string
            # literal, so quotes or backslashes in channel_name cannot
            # break out of the query. For plain names the text is unchanged.
            query = '''
            query {
                user(login: %s) {
                    id
                    login
                    displayName
                    description
                    createdAt
                    profileImageURL(width: 300)
                    bannerImageURL
                    offlineImageURL
                    channel {
                        socialMedias {
                            name
                            url
                        }
                    }
                }
            }
            ''' % json.dumps(channel_name)
            async with session.post(
                'https://gql.twitch.tv/gql',
                headers=headers,
                json={'query': query},
                timeout=aiohttp.ClientTimeout(total=15)
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    # "data" may be null when the API reports errors
                    user = (data.get('data') or {}).get('user')
                    if not user:
                        self.log(f"Twitch user not found: {channel_name}", 'warning')
                        return None
                    result = {}
                    # Avatar
                    if user.get('profileImageURL'):
                        result['avatar'] = user['profileImageURL']
                    # Banner - prefer offlineImageURL (larger), fall back to bannerImageURL
                    if user.get('offlineImageURL'):
                        result['banner'] = user['offlineImageURL']
                    elif user.get('bannerImageURL'):
                        result['banner'] = user['bannerImageURL']
                    # Display name
                    if user.get('displayName'):
                        result['display_name'] = user['displayName']
                    # Bio/description
                    if user.get('description'):
                        result['bio'] = user['description']
                    # Joined date (format: "Jun 10, 2016")
                    if user.get('createdAt'):
                        try:
                            created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00'))
                            result['joined_date'] = created_dt.strftime('%b %d, %Y')
                            self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug')
                        except (ValueError, TypeError):
                            pass
                    # Social links; "channel" / "socialMedias" may be null,
                    # which must not discard the fields gathered above
                    social_medias = (user.get('channel') or {}).get('socialMedias') or []
                    if social_medias:
                        links = []
                        for social in social_medias:
                            name = social.get('name', 'Link')
                            url = social.get('url', '')
                            if url:
                                # Capitalize first letter of name
                                title = name.capitalize() if name else 'Link'
                                links.append({'title': title, 'url': url})
                        if links:
                            result['external_links'] = json.dumps(links)
                            self.log(f"Found {len(links)} Twitch external links", 'debug')
                    if result:
                        self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug')
                        return result
    except Exception as e:
        self.log(f"Could not fetch Twitch profile: {e}", 'debug')
    return None
async def get_creator(self, channel_url: str) -> Optional[Creator]:
    """
    Build a ``Creator`` for the Twitch channel behind ``channel_url``.

    Returns None when ``get_channel_info`` yields nothing for the URL.
    """
    channel_info = await self.get_channel_info(channel_url)
    if not channel_info:
        return None

    # Prefer the name reported with the channel info; otherwise derive it from the URL.
    name = channel_info.get('channel_name')
    if not name:
        name = self.extract_channel_name(channel_url)

    # The real channel avatar, not a clip thumbnail.
    avatar = await self.get_channel_avatar(name)

    creator_fields = {
        'creator_id': channel_info.get('channel_id') or name,
        'service_id': 'twitch',
        'platform': 'twitch',
        'username': name or 'Unknown',
        'display_name': name,
        'profile_image_url': avatar,
        'post_count': channel_info.get('clip_count', 0),
    }
    return Creator(**creator_fields)
async def get_posts(self, channel_url: str, since_date: str = None,
                    max_clips: int = None, progress_callback=None) -> List[Post]:
    """
    Return the channel's clips wrapped as ``Post`` objects.

    Each clip from ``get_channel_clips`` becomes one ``Post`` carrying a
    single video ``Attachment`` whose server path and download URL are both
    the clip URL.
    """
    channel_clips = await self.get_channel_clips(
        channel_url, since_date, max_clips, progress_callback
    )

    def clip_to_post(clip):
        clip_title = clip['title']
        clip_url = clip['url']
        video = Attachment(
            name=f"{clip_title}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=clip_url,  # the URL doubles as the server path
            download_url=clip_url,
            duration=clip.get('duration'),
        )
        return Post(
            post_id=clip['clip_id'],
            service_id='twitch',
            platform='twitch',
            creator_id=clip.get('channel_name', ''),
            title=clip_title,
            content='',  # clips carry no description
            published_at=clip.get('upload_date'),
            attachments=[video],
        )

    return [clip_to_post(clip) for clip in channel_clips]

View File

@@ -0,0 +1,484 @@
"""
Utility functions for Paid Content feature
"""
import re
from typing import Optional, Tuple
from urllib.parse import urlparse
def _extract_xenforo_search_query(parsed) -> Optional[str]:
    """Return the ``q`` search term of a parsed XenForo search URL, or None.

    ``parsed`` is a ``urlparse`` result. The query string is decoded with
    ``parse_qs`` first; if that finds no ``q`` value, a regex looks for a
    ``q=`` preceded by ``&`` or ``?`` inside the raw query string.
    """
    from urllib.parse import parse_qs, unquote_plus

    decoded = parse_qs(parsed.query).get('q', [''])[0]
    if decoded:
        return decoded

    raw = re.search(r'[&?]q=([^&]+)', parsed.query)
    if raw is None:
        return None
    return unquote_plus(raw.group(1)) or None
def parse_creator_url(url: str) -> Optional[Tuple[str, str, str]]:
    """
    Parse a creator URL for any supported service.

    Host-specific handlers are tried first. The path-based Coppermine gallery
    heuristic is only a last resort, so URLs such as
    ``reddit.com/r/gallery/`` or ``coomer.su/onlyfans/user/gallery/`` are not
    mistaken for Coppermine galleries.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid
             or https://www.youtube.com/@channelhandle
             or https://www.youtube.com/channel/UCxxxxx
             or https://www.twitch.tv/username/clips
             or https://fansly.com/username

    Returns:
        Tuple of (service_id, platform, creator_id) or None if invalid.
        Never raises: any parsing error yields None.
    """
    try:
        parsed = urlparse(url)
        host = parsed.netloc.lower()
        path = parsed.path
        path_parts = [p for p in path.strip('/').split('/') if p]

        # Handle YouTube URLs
        if 'youtube.com' in host or 'youtu.be' in host:
            channel_id = _extract_youtube_channel_id(url)
            return ('youtube', 'youtube', channel_id) if channel_id else None

        # Handle Twitch URLs
        if 'twitch.tv' in host:
            channel_name = _extract_twitch_channel_name(url)
            return ('twitch', 'twitch', channel_name) if channel_name else None

        # Handle Fansly URLs (direct API)
        if 'fansly.com' in host:
            username = _extract_fansly_username(url)
            return ('fansly_direct', 'fansly', username) if username else None

        # Handle OnlyFans URLs (direct API)
        if 'onlyfans.com' in host:
            if path_parts:
                username = path_parts[0]
                # First path segments that are site pages, not creators.
                if username.lower() not in ('my', 'api2', 'settings', 'search', 'notifications', 'chats', 'vault', 'lists', 'bookmarks', 'statements', 'help', 'terms', 'privacy', 'dmca', 'contact'):
                    return ('onlyfans_direct', 'onlyfans', username)
            return None

        # Handle Pornhub URLs
        if 'pornhub.com' in host:
            creator_id = _extract_pornhub_creator_id(url)
            return ('pornhub', 'pornhub', creator_id) if creator_id else None

        # Handle XHamster URLs
        if 'xhamster' in host:
            creator_id = _extract_xhamster_creator_id(url)
            return ('xhamster', 'xhamster', creator_id) if creator_id else None

        # Handle TikTok URLs
        if 'tiktok.com' in host:
            username = _extract_tiktok_username(url)
            return ('tiktok', 'tiktok', username) if username else None

        # Handle Instagram URLs
        if 'instagram.com' in host:
            username = _extract_instagram_username(url)
            return ('instagram', 'instagram', username) if username else None

        # Handle BestEyeCandy URLs
        if 'besteyecandy.com' in host:
            cid_match = re.search(r'cid-(\d+)', path)
            slug_match = re.search(r'/([^/]+)\.html$', path)
            if cid_match and slug_match:
                return ('besteyecandy', 'besteyecandy', f"{cid_match.group(1)}/{slug_match.group(1)}")
            if cid_match:
                return ('besteyecandy', 'besteyecandy', cid_match.group(1))
            return None

        # Handle Bellazon URLs (forum threads as creators)
        if 'bellazon' in host:
            match = re.search(r'/topic/(\d+)-([^/]+)', path)
            return ('bellazon', 'bellazon', match.group(1)) if match else None

        # Handle Reddit URLs (reddit.com/r/subreddit, old.reddit.com/r/subreddit, etc.)
        if 'reddit.com' in host:
            if len(path_parts) >= 2 and path_parts[0] == 'r':
                return ('reddit', 'reddit', path_parts[1].lower())
            return None

        # Handle Snapchat URLs (snapchat.com/@username, story.snapchat.com/@username)
        if 'snapchat.com' in host:
            if path_parts:
                username = path_parts[0].lstrip('@')
                if username:
                    return ('snapchat', 'snapchat', username)
            return None

        # Handle HQCelebCorner URLs
        if 'hqcelebcorner' in host:
            query = _extract_xenforo_search_query(parsed)
            return ('hqcelebcorner', 'hqcelebcorner', query) if query else None

        # Handle PicturePub URLs
        if 'picturepub' in host:
            query = _extract_xenforo_search_query(parsed)
            return ('picturepub', 'picturepub', query) if query else None

        # Handle Soundgasm URLs
        if 'soundgasm.net' in host:
            if len(path_parts) >= 2 and path_parts[0] in ('u', 'user'):
                return ('soundgasm', 'soundgasm', path_parts[1])
            return None

        # Handle Liltsome URLs (archive, maps to soundgasm platform)
        if 'liltsome.yerf.org' in host:
            # Hash-based routing: /#/artist/{name}
            fragment = parsed.fragment  # e.g. "/artist/kinkyshibby"
            if fragment:
                parts = [p for p in fragment.strip('/').split('/') if p]
                if len(parts) >= 2 and parts[0] == 'artist':
                    return ('soundgasm', 'soundgasm', parts[1])
            return None

        # Coomer/Kemono: /platform/user/creatorid
        if 'coomer' in host or 'kemono' in host:
            service_id = 'coomer' if 'coomer' in host else 'kemono'
            if len(path_parts) >= 3 and path_parts[1] == 'user':
                return (service_id, path_parts[0], path_parts[2])
            return None

        # Last resort: Coppermine galleries on arbitrary hosts.
        # Match: domain.com/gallery/, domain.com/cpg/, domain.com/coppermine/
        # Also match direct index.php/thumbnails.php/displayimage.php pages
        looks_like_gallery = any(
            marker in path.lower() for marker in ('/gallery/', '/cpg/', '/coppermine/')
        ) or re.search(r'(?:index|thumbnails|displayimage)\.php', path)
        if looks_like_gallery:
            # Normalize to gallery root
            base_path = re.sub(
                r'(?:index|thumbnails|displayimage)\.php.*$', '', path
            ).rstrip('/')
            if base_path:
                # Use domain + path as creator_id (e.g. kylie-jenner.org/gallery)
                return ('coppermine', 'coppermine', host.replace('www.', '') + base_path)

        return None
    except Exception:
        # Contract: malformed input yields None rather than an exception.
        return None
def _extract_youtube_channel_id(url: str) -> Optional[str]:
    """
    Pull the channel identifier out of a YouTube URL.

    Recognised layouts, tried in this order:
    - youtube.com/channel/UC...
    - youtube.com/@handle
    - youtube.com/c/channelname
    - youtube.com/user/username

    Returns the captured identifier, or None if no layout matches.
    """
    url_layouts = (
        r'youtube\.com/channel/([a-zA-Z0-9_-]+)',
        r'youtube\.com/@([a-zA-Z0-9_.-]+)',
        r'youtube\.com/c/([a-zA-Z0-9_-]+)',
        r'youtube\.com/user/([a-zA-Z0-9_-]+)',
    )
    attempts = (re.search(layout, url) for layout in url_layouts)
    first_hit = next((hit for hit in attempts if hit), None)
    return first_hit.group(1) if first_hit else None
def _extract_twitch_channel_name(url: str) -> Optional[str]:
    """
    Pull the channel name out of a Twitch URL, lower-cased.

    Handles:
    - twitch.tv/username
    - twitch.tv/username/clips
    - m.twitch.tv/username/clips

    Returns None when the URL has no ``twitch.tv/<name>`` part.
    """
    found = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?', url)
    if found is None:
        return None
    return found.group(1).lower()
def _extract_fansly_username(url: str) -> Optional[str]:
    """
    Pull the username out of a Fansly URL.

    Handles:
    - fansly.com/username
    - fansly.com/username/posts
    - fansly.com/username/media

    Returns None for non-matching URLs and for first path segments that are
    site pages rather than users (explore, search, settings, ...).
    """
    site_pages = ('explore', 'search', 'settings', 'notifications', 'messages', 'live')
    found = re.search(r'fansly\.com/([a-zA-Z0-9_.-]+)(?:/(?:posts|media))?', url)
    if found is None:
        return None
    candidate = found.group(1)
    if candidate.lower() in site_pages:
        return None
    return candidate
def _extract_pornhub_creator_id(url: str) -> Optional[str]:
    """Extract creator identifier from Pornhub URL, returns 'type/name' format.

    ``type`` is one of ``pornstar``, ``channels``, ``users`` or ``model``.
    Type and name are captured by a single match so they always come from
    the same part of the URL, even if the URL contains a second Pornhub link
    (e.g. in a redirect parameter).

    Returns None when the URL is not a creator page.
    """
    match = re.search(
        r'pornhub\.com/(pornstar|channels|users|model)/([a-zA-Z0-9_-]+)', url
    )
    if not match:
        return None
    # Store as "type/name" to preserve the URL type
    return f"{match.group(1)}/{match.group(2)}"
def _extract_xhamster_creator_id(url: str) -> Optional[str]:
    """Extract creator identifier from XHamster URL, returns 'type/name' format.

    ``type`` is ``creators`` or ``channels``. Numbered mirror domains
    (``xhamster2.com`` etc.) are accepted. Type and name are captured by a
    single match so they always come from the same part of the URL.

    Returns None when the URL is not a creator/channel page.
    """
    match = re.search(
        r'xhamster\d*\.com/(creators|channels)/([a-zA-Z0-9_-]+)', url
    )
    if not match:
        return None
    return f"{match.group(1)}/{match.group(2)}"
def _extract_tiktok_username(url: str) -> Optional[str]:
    """Return the ``@username`` part of a TikTok URL (without the ``@``), or None."""
    handle = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
    return handle.group(1) if handle else None
def _extract_instagram_username(url: str) -> Optional[str]:
    """Return the lower-cased username from an Instagram URL, or None.

    First path segments that are Instagram pages rather than accounts
    (``p``, ``reel``, ``explore``, ...) are rejected.
    """
    reserved_segments = {
        'explore', 'reels', 'stories', 'p', 'tv', 'accounts',
        'direct', 'about', 'legal', 'developer', 'privacy',
        'terms', 'help', 'api', 'reel', 'tags'
    }
    found = re.search(r'instagram\.com/([a-zA-Z0-9_.]+)/?', url)
    if found is None:
        return None
    candidate = found.group(1).lower()
    return None if candidate in reserved_segments else candidate
def parse_post_url(url: str) -> Optional[Tuple[str, str, str, str]]:
    """
    Split a Coomer/Kemono post URL into its identifying parts.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid/post/postid

    Returns:
        Tuple of (service_id, platform, creator_id, post_id), or None when
        the host is neither Coomer nor Kemono, the path does not have the
        ``/platform/user/creator/post/id`` shape, or parsing fails.
    """
    try:
        parts = urlparse(url)
        hostname = parts.netloc.lower()

        # Which archive is this?
        service_id = None
        for candidate in ('coomer', 'kemono'):
            if candidate in hostname:
                service_id = candidate
                break
        if service_id is None:
            return None

        # Expected path: /platform/user/creatorid/post/postid
        segments = [seg for seg in parts.path.strip('/').split('/') if seg]
        if len(segments) < 5:
            return None
        platform, user_marker, creator_id, post_marker, post_id = segments[:5]
        if user_marker != 'user' or post_marker != 'post':
            return None
        return (service_id, platform, creator_id, post_id)
    except Exception:
        return None
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a one-decimal, human-readable string.

    Divides by 1024 per unit step from B up to TB; anything larger is shown
    in PB. ``None`` yields ``'Unknown'``.
    """
    if size_bytes is None:
        return 'Unknown'

    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    index = 0
    while index < len(units):
        if abs(value) < 1024.0:
            return f"{value:.1f} {units[index]}"
        value /= 1024.0
        index += 1
    return f"{value:.1f} PB"
def sanitize_filename(name: str, max_length: int = 200) -> str:
    """
    Turn an arbitrary string into something usable as a filename.

    Characters that are invalid in filenames (``<>:"/\\|?*`` and control
    characters) are dropped, whitespace runs become a single ``-``, and
    leading/trailing dots and hyphens are stripped before truncation.

    Args:
        name: String to sanitize
        max_length: Maximum length of result

    Returns:
        Sanitized filename, or ``'unnamed'`` when nothing is left
    """
    if not name:
        return 'unnamed'

    without_invalid = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
    hyphenated = re.sub(r'\s+', '-', without_invalid.strip())
    trimmed = hyphenated.strip('.-')
    if len(trimmed) > max_length:
        trimmed = trimmed[:max_length]
    return trimmed if trimmed else 'unnamed'
def extract_platform_from_domain(domain: str) -> Optional[str]:
    """Map a creator-platform domain to its platform name.

    The domain is lower-cased and any ``www.`` is removed before an exact
    lookup. Returns None for domains that are not in the table.
    """
    normalized = domain.lower().replace('www.', '')
    domains_by_platform = (
        ('onlyfans', ('onlyfans.com',)),
        ('fansly', ('fansly.com',)),
        ('patreon', ('patreon.com',)),
        ('fanbox', ('fanbox.cc',)),
        ('gumroad', ('gumroad.com',)),
        ('subscribestar', ('subscribestar.com', 'subscribestar.adult')),
        ('discord', ('discord.com', 'discord.gg')),
        ('candfans', ('candfans.jp',)),
    )
    for platform, known_domains in domains_by_platform:
        if normalized in known_domains:
            return platform
    return None
def detect_content_type(filename: str) -> str:
    """Classify a file by its extension.

    Returns one of ``'image'``, ``'video'``, ``'audio'``, ``'archive'``,
    ``'document'`` or ``'unknown'`` (empty name, no extension, or an
    extension not in any table). Matching is case-insensitive and looks only
    at the text after the last dot.
    """
    if not filename:
        return 'unknown'

    _, dot, suffix = filename.rpartition('.')
    ext = suffix.lower() if dot else ''

    extension_tables = (
        ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}),
        ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg', '3gp'}),
        ('audio', {'mp3', 'wav', 'flac', 'aac', 'm4a', 'ogg', 'wma'}),
        ('archive', {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'}),
        ('document', {'pdf', 'doc', 'docx', 'txt', 'rtf', 'odt'}),
    )
    for content_type, extensions in extension_tables:
        if ext in extensions:
            return content_type
    return 'unknown'
def get_service_platforms(service_id: str) -> list:
    """Return the platform names a service can supply (empty list if unknown).

    The two archive services and the two ``*_direct`` services map to
    differently named platforms; every other known service maps to a single
    platform carrying the service's own name.
    """
    differently_named = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord'],
        'fansly_direct': ['fansly'],
        'onlyfans_direct': ['onlyfans'],
    }
    if service_id in differently_named:
        return differently_named[service_id]

    self_named = (
        'youtube', 'twitch', 'pornhub', 'xhamster', 'tiktok', 'instagram',
        'soundgasm', 'bellazon', 'besteyecandy', 'snapchat', 'reddit',
        'coppermine', 'hqcelebcorner', 'picturepub',
    )
    if service_id in self_named:
        return [service_id]
    return []
def get_service_base_url(service_id: str) -> Optional[str]:
    """
    Look up the fallback base URL for a service.

    Reads ``PaidContentAPIClient.DEFAULT_SERVICE_URLS``; returns None when the
    service has no entry there.

    Note: These are fallback defaults only. For dynamic URLs, use the
    database (paid_content_services table).
    """
    # Deferred import: a module-level import would be circular.
    from .api_client import PaidContentAPIClient

    default_urls = PaidContentAPIClient.DEFAULT_SERVICE_URLS
    return default_urls.get(service_id)

View File

@@ -0,0 +1,744 @@
"""
Generic XenForo Forum Client for Paid Content
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
treating each celebrity name as a "creator" and each matching thread as a post.
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
and resolved via ImageHostHandler from forum_downloader.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse, unquote_plus
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class XenForoForumClient(LoggingMixin):
"""Generic client for scraping XenForo-based forum threads."""
# FlareSolverr endpoint used as the fallback fetcher (see _fetch_via_flaresolverr).
FLARESOLVERR_URL = 'http://localhost:8191/v1'
# Browser-like base headers; copied (never mutated) before per-request
# additions such as the Cookie header.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
}
# Image file extensions, lower-case and without the dot.
# NOTE(review): presumably consulted when deciding whether a URL is a direct
# image link - confirm against _is_direct_image_url.
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
# External image host domains to look for in post links
IMAGE_HOST_DOMAINS = [
'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
]
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
    """Configure the client for one XenForo forum.

    Args:
        service_id: Identifier of the forum service; also used as the
            logger's default module name.
        base_url: Forum root URL; trailing slashes are removed.
        cookie_path: Path of the Playwright-format cookie JSON file.
        log_callback: Optional callback handed to the logger setup.
    """
    normalized_base = base_url.rstrip('/')
    self.SERVICE_ID = service_id
    self.BASE_URL = normalized_base
    self.COOKIE_PATH = cookie_path
    self._init_logger('PaidContent', log_callback, default_module=service_id)
    # Both caches are filled lazily on first use.
    self._cookies: Optional[Dict[str, str]] = None
    self._image_host_handler = None
# ------------------------------------------------------------------
# Cookie handling
# ------------------------------------------------------------------
def _load_cookies(self) -> Dict[str, str]:
    """Return the forum cookies as a ``{name: value}`` dict, loading once.

    The cookie file is a JSON list of Playwright-format cookie objects. The
    result is cached on the instance; a missing or unreadable file is logged
    and cached as an empty dict.
    """
    if self._cookies is not None:
        return self._cookies

    try:
        cookie_file = Path(self.COOKIE_PATH)
        if not cookie_file.exists():
            self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
            self._cookies = {}
        else:
            with open(cookie_file, 'r') as handle:
                playwright_cookies = json.load(handle)
            self._cookies = {
                entry['name']: entry['value'] for entry in playwright_cookies
            }
            self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
    except Exception as e:
        self.log(f"Error loading cookies: {e}", 'warning')
        self._cookies = {}
    return self._cookies
def _get_cookie_header(self) -> str:
    """Serialise the loaded cookies as a ``Cookie`` header value.

    Pairs are rendered as ``name=value`` and joined with ``'; '``; an empty
    cookie set gives an empty string.
    """
    pairs = [f'{name}={value}' for name, value in self._load_cookies().items()]
    return '; '.join(pairs)
def _get_request_headers(self) -> Dict[str, str]:
    """Return a fresh copy of ``HEADERS``, plus a ``Cookie`` entry if cookies exist."""
    request_headers = {**self.HEADERS}
    cookie_value = self._get_cookie_header()
    if not cookie_value:
        return request_headers
    request_headers['Cookie'] = cookie_value
    return request_headers
# ------------------------------------------------------------------
# Image host handling
# ------------------------------------------------------------------
def _get_image_host_handler(self):
    """Return the ``ImageHostHandler`` class from forum_downloader, or None.

    The import is attempted only once: success caches the class, failure
    caches ``False`` so later calls return None without retrying.
    """
    cached = self._image_host_handler
    if cached is None:
        try:
            from modules.forum_downloader import ImageHostHandler
            self._image_host_handler = ImageHostHandler
            self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
        except ImportError:
            self.log("ImageHostHandler not available", 'warning')
            self._image_host_handler = False  # sentinel to avoid retrying
        cached = self._image_host_handler
    if cached is False:
        return None
    return cached
# ------------------------------------------------------------------
# HTTP helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """GET ``url`` with the forum cookies and return the response text.

    A 403 response or any exception hands the URL to FlareSolverr instead.
    Every other non-200 status is logged and yields None.
    """
    request_headers = self._get_request_headers()
    try:
        async with session.get(url, headers=request_headers, allow_redirects=True) as response:
            status = response.status
            if status == 200:
                return await response.text()
            if status != 403:
                self.log(f"HTTP {status} for {url}", 'warning')
                return None
            self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
            return await self._fetch_via_flaresolverr(url)
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return await self._fetch_via_flaresolverr(url)
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch a page using FlareSolverr to bypass Cloudflare.

    The FlareSolverr round-trip uses the synchronous ``requests`` library
    with timeouts of up to 70 seconds, so it runs in the event loop's default
    executor instead of blocking every other coroutine while it waits.

    Args:
        url: Page to fetch.

    Returns:
        The page HTML reported by FlareSolverr, or None when ``requests`` is
        not installed or FlareSolverr fails (failures are logged as
        warnings, never raised).
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None

    endpoint = self.FLARESOLVERR_URL
    # Snapshot the cookies on the event-loop thread; the worker only reads this list.
    cookie_list = [{'name': k, 'value': v} for k, v in self._load_cookies().items()]

    def _solve():
        """Blocking FlareSolverr exchange; returns (page_html, warning_message)."""
        fs_session_id = None
        try:
            # Create session
            resp = std_requests.post(endpoint, json={
                'cmd': 'sessions.create'
            }, timeout=30)
            data = resp.json()
            if data.get('status') != 'ok':
                return None, "Failed to create FlareSolverr session"
            fs_session_id = data.get('session')

            # Fetch page
            resp = std_requests.post(endpoint, json={
                'cmd': 'request.get',
                'url': url,
                'session': fs_session_id,
                'cookies': cookie_list,
                'maxTimeout': 60000,
            }, timeout=70)
            page_data = resp.json()
            if page_data.get('status') == 'ok':
                return page_data.get('solution', {}).get('response', ''), None
            return None, f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}"
        except Exception as e:
            return None, f"FlareSolverr error for {url}: {e}"
        finally:
            # Always release the remote browser session; cleanup is best-effort.
            if fs_session_id:
                try:
                    std_requests.post(endpoint, json={
                        'cmd': 'sessions.destroy',
                        'session': fs_session_id,
                    }, timeout=10)
                except Exception:
                    pass

    loop = asyncio.get_running_loop()
    page_html, warning = await loop.run_in_executor(None, _solve)
    # Log from the event-loop thread rather than from the worker.
    if warning:
        self.log(warning, 'warning')
    return page_html
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def search_threads(self, query: str) -> List[Dict]:
    """Search the forum for threads whose title matches a celebrity name.

    Fetches the search page to obtain the ``_xfToken`` CSRF value, POSTs a
    title-only search ordered by date, then follows "Next" links until no
    further results are found.

    Returns the thread dicts produced by ``_parse_search_results``; an empty
    list when the search page cannot be fetched or the POST fails.
    """
    found: List[Dict] = []
    client_timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # XenForo search: POST form to /search/search
        post_url = f'{self.BASE_URL}/search/search'
        post_headers = self._get_request_headers()
        post_headers['Content-Type'] = 'application/x-www-form-urlencoded'

        # The form needs a CSRF token, which is embedded in the search page.
        form_page = await self._fetch_page(session, f'{self.BASE_URL}/search/')
        if not form_page:
            self.log("Failed to fetch search page", 'warning')
            return found

        token_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', form_page)
        payload = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',
            'order': 'date',
            '_xfToken': token_match.group(1) if token_match else '',
        }

        try:
            async with session.post(post_url, headers=post_headers, data=payload,
                                    allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return found
                result_html = await resp.text()
                result_url = str(resp.url)
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return found

        found = self._parse_search_results(result_html)

        # Walk the remaining result pages.
        page_number = 2
        next_url = self._find_next_search_page(result_html, result_url, page_number)
        while next_url:
            await asyncio.sleep(0.3)
            result_html = await self._fetch_page(session, next_url)
            if not result_html:
                break
            additional = self._parse_search_results(result_html)
            if not additional:
                break
            found.extend(additional)
            page_number += 1
            next_url = self._find_next_search_page(result_html, result_url, page_number)

    self.log(f"Search for '{query}' found {len(found)} threads", 'info')
    return found
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """Read a thread's first page and summarise it.

    Returns a dict with ``thread_id``, ``title`` (``'Untitled'`` when none is
    found), ``reply_count``, ``page_count`` and ``url`` (fragment and trailing
    slash removed). Returns None when the page cannot be fetched or an error
    occurs (the error is logged).
    """
    client_timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            first_page = await self._fetch_page(session, thread_url)
            if not first_page:
                return None

            thread_title = self._extract_title(first_page)
            pages = self._extract_page_count(first_page)
            replies = self._extract_reply_count(first_page)
            ident = self._extract_thread_id(thread_url)
            canonical_url = thread_url.split('#')[0].rstrip('/')

            return {
                'thread_id': ident,
                'title': thread_title or 'Untitled',
                'reply_count': replies,
                'page_count': pages,
                'url': canonical_url,
            }
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
async def get_thread_images(self, thread_url: str, page_count: int = None,
                            start_page: int = 1) -> List[Dict]:
    """Scrape thread pages and collect image links, deduplicated by URL.

    Args:
        thread_url: URL of the thread (its first page).
        page_count: Number of pages in the thread. When None, page 1 is
            fetched to determine it.
        start_page: First page to scrape. It is honoured even when
            ``page_count`` has to be discovered: page 1 is then used only to
            read the page count unless scraping starts there.

    Returns:
        List of ``{'url', 'host'}`` dicts in the order they were first seen.
        Scraping stops early (keeping what was collected) when a page cannot
        be fetched.
    """
    images: List[Dict] = []
    seen_urls: Set[str] = set()

    def _collect(page_html: str) -> int:
        """Append links not seen before; return how many were added."""
        added = 0
        for img in self._extract_image_links(page_html):
            if img['url'] not in seen_urls:
                seen_urls.add(img['url'])
                images.append(img)
                added += 1
        return added

    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # If page_count not provided, fetch page 1 to determine it
        if page_count is None:
            page1_html = await self._fetch_page(session, thread_url)
            if not page1_html:
                return images
            page_count = self._extract_page_count(page1_html)
            if start_page <= 1:
                _collect(page1_html)
            # Page 1 is already handled; never fetch it a second time.
            start_page = max(start_page, 2)

        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # Rate limit
            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break
            new_count = _collect(page_html)
            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

    self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
    return images
async def resolve_image_url(self, host_page_url: str, session: aiohttp.ClientSession = None) -> Optional[str]:
    """Turn an image-host page URL into a direct image URL.

    Resolution order:
    1. ``ImageHostHandler.extract_direct_url`` on the URL alone.
    2. The imgbox thumbnail-to-original rewrite.
    3. Fetch the host page, retry the handler with the page content, then
       fall back to ``_extract_direct_image_from_html``.

    A session is created (and closed again) only when the caller passes none.
    Returns None when the host page cannot be fetched or answers non-200.
    """
    handler = self._get_image_host_handler()

    # 1. Handler can sometimes answer from the URL alone.
    if handler:
        resolved = handler.extract_direct_url(host_page_url)
        if resolved:
            return resolved

    # 2. imgbox thumbnail → full image conversion (thumbs2 → images2)
    thumb = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
    if thumb:
        return f"https://images{thumb.group(1)}.imgbox.com/{thumb.group(2)}{thumb.group(3)}_o.jpg"

    # 3. Everything else needs the host page itself.
    created_here = session is None
    if created_here:
        session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))

    try:
        request_headers = dict(self.HEADERS)
        # ImageBam requires sfw_inter=1 cookie to bypass consent page
        if 'imagebam' in host_page_url:
            request_headers['Cookie'] = 'sfw_inter=1'

        try:
            async with session.get(host_page_url, headers=request_headers,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    return None
                page_content = await resp.text()
                final_url = str(resp.url)
        except Exception as e:
            self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
            return None

        if handler:
            resolved = handler.extract_direct_url(host_page_url, page_content=page_content)
            if resolved:
                return resolved

        # Manual extraction fallbacks
        return self._extract_direct_image_from_html(host_page_url, page_content, final_url)
    finally:
        if created_here:
            await session.close()
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _parse_search_results(self, html_content: str) -> List[Dict]:
    """Extract thread entries from a XenForo search results page.

    Each ``contentRow`` block is parsed for its thread link, cleaned title and
    ``<time datetime>`` value. If no block produces an entry, every
    ``contentRow-title`` link on the page is used instead (without a date).

    Returns dicts with ``thread_id``, ``title``, ``url``, ``reply_count``
    (always 0) and ``published_at``, deduplicated by ``thread_id``.
    """
    title_link_pattern = (
        r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>'
    )

    def build_entry(href, title_markup, published_at):
        """Return a thread dict, or None when the title or thread id is missing."""
        # Drop prefix labels, unwrap search highlights, then strip remaining tags.
        cleaned = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_markup)
        cleaned = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', cleaned)
        cleaned = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', cleaned)
        title = html.unescape(re.sub(r'<[^>]+>', '', cleaned).strip())
        if not title:
            return None

        full_url = href if href.startswith('http') else self.BASE_URL + href
        thread_id = self._extract_thread_id(full_url)
        if not thread_id:
            return None

        return {
            'thread_id': thread_id,
            'title': title,
            'url': full_url.split('#')[0].rstrip('/'),
            'reply_count': 0,
            'published_at': published_at,
        }

    entries = []

    # Primary pass: one contentRow block per result.
    for row in re.finditer(
        r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
        html_content, re.DOTALL
    ):
        row_html = row.group(1)
        link = re.search(title_link_pattern, row_html, re.DOTALL)
        if not link:
            continue
        # Date comes from the block's <time datetime="..."> tag, if any.
        stamp = re.search(r'<time[^>]+datetime="([^"]+)"', row_html)
        entry = build_entry(link.group(1), link.group(2), stamp.group(1) if stamp else None)
        if entry is not None:
            entries.append(entry)

    # Fallback pass: title links anywhere on the page, no date available.
    if not entries:
        for link in re.finditer(title_link_pattern, html_content, re.DOTALL):
            entry = build_entry(link.group(1), link.group(2), None)
            if entry is not None:
                entries.append(entry)

    # Keep the first entry per thread_id.
    known_ids = set()
    deduplicated = []
    for entry in entries:
        if entry['thread_id'] in known_ids:
            continue
        known_ids.add(entry['thread_id'])
        deduplicated.append(entry)
    return deduplicated
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
    """Find URL for the next page of search results.

    Looks for the XenForo "Next" pagination link
    (``<a href="..." class="pageNav-jump ...">Next``).

    Args:
        html_content: HTML of the current results page.
        current_url: Unused; kept for interface compatibility.
        page_num: Unused; kept for interface compatibility.

    Returns:
        Absolute URL of the next page with HTML entities decoded, or None
        when the page has no "Next" link.
    """
    m = re.search(
        r'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next',
        html_content, re.IGNORECASE
    )
    if not m:
        return None
    # Attribute values are HTML-escaped ("&amp;" between query parameters),
    # so decode them for absolute and relative links alike.
    url = html.unescape(m.group(1))
    if not url.startswith('http'):
        url = self.BASE_URL + url
    return url
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
# Entries are matched as plain substrings of the lower-cased URL (see _is_junk_url),
# so they must be written in lower case.
JUNK_URL_PATTERNS = [
'giphy.com', 'tenor.com', 'gfycat.com', # reaction GIFs
'jsdelivr.net', 'joypixels', 'twemoji', # emoji CDNs
'wp-content/', # WordPress media (blog graphics, profile pics)
'/unicode/', '/emoji/', # emoji paths
'haboodadi.com', # forum signature images
]
# Image hosts that are permanently dead (DNS gone / domain expired)
# Matched the same way as JUNK_URL_PATTERNS: substring of the lower-cased URL.
DEAD_HOSTS = [
'someimage.com',
]
def _extract_image_links(self, page_html: str) -> List[Dict]:
"""Extract image host links from all posts on a page."""
images = []
# Find all message bodies: XenForo uses <article class="message ..."> and
# <div class="bbWrapper"> for post content
for content_match in re.finditer(
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
page_html, re.DOTALL
):
content = content_match.group(1)
# Extract links to known image hosts
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
link_url = html.unescape(link_match.group(1))
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
images.append({'url': link_url, 'host': self._identify_host(link_url)})
# Also catch direct image URLs (full-size, not thumbnails)
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
# — legitimate gallery images are posted as <a href> links to host pages
# (handled above), while inline <img> from these hosts are signatures.
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
img_url = html.unescape(img_match.group(1))
# Skip thumbnails, avatars, smilies, and junk
if any(skip in img_url.lower() for skip in [
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
'data/assets', '/styles/', 'xenforo'
]):
continue
if self._is_junk_url(img_url):
continue
# Skip inline images from known image hosts — these are signatures,
# not gallery content (gallery images come through as <a> links above)
if self._is_image_host_url(img_url):
continue
if self._is_direct_image_url(img_url):
images.append({'url': img_url, 'host': 'direct'})
return images
def _is_junk_url(self, url: str) -> bool:
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
url_lower = url.lower()
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
return True
if any(host in url_lower for host in self.DEAD_HOSTS):
return True
return False
def _is_image_host_url(self, url: str) -> bool:
"""Check if a URL belongs to a known image hosting service."""
try:
domain = urlparse(url).netloc.lower()
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
except Exception:
return False
def _is_direct_image_url(self, url: str) -> bool:
"""Check if a URL points directly to an image file."""
try:
path = urlparse(url).path.lower()
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
except Exception:
return False
def _identify_host(self, url: str) -> str:
"""Identify which image host a URL belongs to."""
handler = self._get_image_host_handler()
if handler:
host = handler.identify_host(url)
if host:
return host
# Fallback
try:
domain = urlparse(url).netloc.lower()
for host_domain in self.IMAGE_HOST_DOMAINS:
if host_domain in domain:
return host_domain.split('.')[0]
except Exception:
pass
return 'unknown'
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
"""Manually extract direct image URL from host page HTML."""
domain = urlparse(url).netloc.lower()
# imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
if 'imagebam' in domain:
m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# Alternative: og:image meta tag
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# pixhost: <img id="image" src="..."> or img.pixhost.to URL
if 'pixhost' in domain:
m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
if m:
return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"
# imagetwist: <img class="pic" src="...">
if 'imagetwist' in domain:
m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
page_content)
if m:
return html.unescape(m.group(1))
# imgbox: <img id="img" src="..."> or src before id
if 'imgbox' in domain:
m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
if m:
return html.unescape(m.group(1))
# Direct image URL pattern
m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
if m:
return html.unescape(m.group(1))
# turboimagehost: <img class="uImage" src="...">
if 'turboimagehost' in domain:
m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# acidimg: <img class="centred" src="...">
if 'acidimg' in domain:
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# pixxxels: same pattern as acidimg
if 'pixxxels' in domain:
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# imx.to: <img class="image-show" src="...">
if 'imx.to' in domain:
m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# Generic: try og:image meta tag
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
if m:
img_url = html.unescape(m.group(1))
if self._is_direct_image_url(img_url):
return img_url
return None
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
@staticmethod
def _extract_title(page_html: str) -> Optional[str]:
"""Extract thread title from XenForo <h1 class="p-title-value">."""
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
if m:
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
title = re.sub(r'<[^>]+>', '', m.group(1))
# Clean up non-breaking spaces and extra whitespace
title = title.replace('\xa0', ' ')
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
title = re.sub(r'\s+', ' ', title).strip()
return html.unescape(title)
# Fallback: <title> — strip common XenForo site name suffixes
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
return title
return None
@staticmethod
def _extract_page_count(page_html: str) -> int:
"""Extract total page count from XenForo pagination."""
# <li class="pageNav-page"><a href="...">42</a></li>
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
if pages:
return max(int(p) for p in pages)
return 1
@staticmethod
def _extract_reply_count(page_html: str) -> int:
"""Extract reply count from XenForo thread info."""
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
if m:
return int(m.group(1).replace(',', ''))
return 0
@staticmethod
def _extract_thread_id(url: str) -> Optional[str]:
"""Extract thread ID from XenForo URL.
Handles both formats:
- /threads/title.12345/
- /index.php?threads/title.12345/
"""
m = re.search(r'threads/[^/]*?\.(\d+)', url)
if m:
return m.group(1)
# Fallback: just /threads/{id}/
m = re.search(r'threads/(\d+)', url)
if m:
return m.group(1)
return None
@staticmethod
def _build_page_url(thread_url: str, page_num: int) -> str:
"""Build paginated thread URL for XenForo.
Handles: /index.php?threads/slug.12345/page-2
"""
# Remove existing page- suffix and fragment
base = thread_url.split('#')[0].rstrip('/')
base = re.sub(r'/page-\d+$', '', base)
if page_num == 1:
return base + '/'
return f'{base}/page-{page_num}'
@staticmethod
def _get_extension(filename_or_url: str) -> str:
"""Get lowercase file extension."""
clean = filename_or_url.split('?')[0].split('#')[0]
if '.' in clean.split('/')[-1]:
return clean.rsplit('.', 1)[-1].lower()
return ''
@staticmethod
def _filename_from_url(url: str) -> str:
"""Extract filename from URL path."""
path = urlparse(url).path
name = path.rstrip('/').split('/')[-1]
return name if name else 'unnamed.jpg'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff