36
modules/paid_content/__init__.py
Normal file
36
modules/paid_content/__init__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Paid Content Module
|
||||
|
||||
Downloads and organizes content from subscription-based creator platforms
|
||||
(OnlyFans, Fansly, Patreon, Fanbox, etc.) via the Coomer.party and Kemono.party archival APIs.
|
||||
Also supports YouTube channels and Twitch clips via yt-dlp.
|
||||
"""
|
||||
|
||||
from .scraper import PaidContentScraper
|
||||
from .api_client import PaidContentAPIClient
|
||||
from .db_adapter import PaidContentDBAdapter
|
||||
from .file_host_downloader import FileHostDownloader
|
||||
from .embed_downloader import EmbedDownloader
|
||||
from .youtube_client import YouTubeClient
|
||||
from .twitch_client import TwitchClient, TwitchThumbnailCache
|
||||
from .fansly_direct_client import FanslyDirectClient
|
||||
from .onlyfans_client import OnlyFansClient
|
||||
from .xhamster_client import XHamsterClient
|
||||
from .tiktok_client import TikTokClient
|
||||
from .instagram_adapter import InstagramAdapter
|
||||
|
||||
# Names exported by ``from modules.paid_content import *``.
__all__ = [
    # Core scraper, archival API client and persistence layer
    "PaidContentScraper",
    "PaidContentAPIClient",
    "PaidContentDBAdapter",
    # Generic downloaders
    "FileHostDownloader",
    "EmbedDownloader",
    # Platform-specific clients
    "YouTubeClient",
    "TwitchClient",
    "TwitchThumbnailCache",
    "FanslyDirectClient",
    "OnlyFansClient",
    "XHamsterClient",
    "TikTokClient",
    "InstagramAdapter",
]
|
||||
311
modules/paid_content/api_client.py
Normal file
311
modules/paid_content/api_client.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
Unified API client for Coomer.party and Kemono.party
|
||||
Both services share the same API structure (Kemono fork)
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
from modules.base_module import LoggingMixin, RateLimitMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class PaidContentAPIClient(LoggingMixin, RateLimitMixin):
    """
    API client for Coomer and Kemono archival services.

    Endpoints requested by this client (relative to ``{base_url}/api/v1``):
    - GET /creators - List all creators
    - GET /{platform}/user/{creator_id}/profile - Get creator info
    - GET /{platform}/user/{creator_id}/posts - Get creator's posts (paginated with ?o=offset)
    - GET /{platform}/user/{creator_id}/post/{post_id} - Get single post

    All network methods are best-effort: errors are logged through ``self.log``
    and turned into an empty list / ``None`` instead of being raised.

    NOTE(review): ``_delay_between_items`` / ``_delay_between_batches`` come from
    RateLimitMixin and are called without ``await`` inside coroutines. If they
    sleep synchronously they will block the event loop - confirm against the mixin.
    """

    # Fallback URLs if database doesn't have them configured
    DEFAULT_SERVICE_URLS = {
        'coomer': 'https://coomer.party',
        'kemono': 'https://kemono.party'
    }

    # Source platforms archived by each service
    SUPPORTED_PLATFORMS = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord']
    }

    # Amount the ``?o=`` offset is advanced per page in get_all_creator_posts()
    POSTS_PER_PAGE = 50

    def __init__(self, service_id: str, session_cookie: Optional[str] = None,
                 base_url: Optional[str] = None, log_callback=None):
        """
        Args:
            service_id: Service key, e.g. 'coomer' or 'kemono'.
            session_cookie: Optional value sent as the ``session`` cookie.
            base_url: Optional site URL; a trailing slash and an ``/api/v1``
                segment are stripped. When omitted, DEFAULT_SERVICE_URLS is used.
            log_callback: Forwarded to the logging mixin.
        """
        self._init_logger('PaidContent', log_callback, default_module='API')
        self._init_rate_limiter(min_delay=0.5, max_delay=2.0, batch_delay_min=1, batch_delay_max=3)

        self.service_id = service_id

        # Use provided base_url, or fall back to defaults
        if base_url:
            # If base_url includes /api/v1, extract just the base
            if '/api/v1' in base_url:
                self.base_url = base_url.replace('/api/v1', '').rstrip('/')
            else:
                self.base_url = base_url.rstrip('/')
        else:
            # NOTE: this is None for a service_id that has no default URL
            self.base_url = self.DEFAULT_SERVICE_URLS.get(service_id)

        self.api_url = f"{self.base_url}/api/v1"
        self.session_cookie = session_cookie
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create aiohttp session"""
        if self._session is None or self._session.closed:
            # Note: Coomer/Kemono require 'Accept: text/css' header as anti-scraping measure
            # Despite this, they still return JSON responses
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/css',
                'Accept-Encoding': 'gzip, deflate, br',
                'Referer': self.base_url
            }
            cookies = {}
            if self.session_cookie:
                cookies['session'] = self.session_cookie

            timeout = aiohttp.ClientTimeout(total=30)
            self._session = aiohttp.ClientSession(headers=headers, cookies=cookies, timeout=timeout)
        return self._session

    async def close(self):
        """Close the aiohttp session"""
        if self._session and not self._session.closed:
            await self._session.close()
        self._session = None

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def check_health(self) -> Dict[str, Any]:
        """Check API health status.

        Returns a dict whose ``status`` is one of 'healthy' (with
        ``response_time`` in seconds), 'rate_limited' or 'degraded' (with
        ``response_code``), 'timeout' or 'down' (with ``error``).
        """
        import time
        try:
            session = await self._get_session()
            # Monotonic clock: the elapsed time cannot be skewed by wall-clock adjustments
            start = time.monotonic()
            async with session.get(f"{self.api_url}/creators", timeout=aiohttp.ClientTimeout(total=10)) as resp:
                elapsed = time.monotonic() - start
                if resp.status == 200:
                    # content_type=None allows parsing JSON regardless of response content-type
                    await resp.json(content_type=None)
                    return {'status': 'healthy', 'response_time': round(elapsed, 3)}
                elif resp.status == 429:
                    return {'status': 'rate_limited', 'response_code': 429}
                else:
                    return {'status': 'degraded', 'response_code': resp.status}
        except asyncio.TimeoutError:
            return {'status': 'timeout', 'error': 'Request timed out'}
        except Exception as e:
            return {'status': 'down', 'error': str(e)}

    async def get_all_creators(self) -> List[Dict]:
        """Get list of all available creators (for search).

        Returns the decoded JSON body on HTTP 200, otherwise an empty list.
        """
        self._delay_between_items()
        try:
            session = await self._get_session()
            async with session.get(f"{self.api_url}/creators") as resp:
                if resp.status == 200:
                    return await resp.json(content_type=None)
                self.log(f"Failed to get creators list: HTTP {resp.status}", 'warning')
                return []
        except Exception as e:
            self.log(f"Error getting creators list: {e}", 'error')
            return []

    async def get_creator(self, platform: str, creator_id: str) -> Optional[Creator]:
        """Get creator info.

        Tries the ``/profile`` endpoint first; if that is not HTTP 200, falls
        back to building a Creator from the first entry of ``/posts``.
        Returns None when neither works or an error occurs.
        """
        self._delay_between_items()
        try:
            session = await self._get_session()

            # First try to get creator profile
            url = f"{self.api_url}/{platform}/user/{creator_id}/profile"
            async with session.get(url) as resp:
                if resp.status == 200:
                    data = await resp.json(content_type=None)
                    return Creator.from_api(data, self.service_id, platform, self.base_url)

            # Fallback: get first post to extract creator info
            url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
            async with session.get(url) as resp:
                if resp.status == 200:
                    posts = await resp.json(content_type=None)
                    if posts and len(posts) > 0:
                        # Extract creator info from first post
                        first_post = posts[0]
                        # Construct image URLs - use .st instead of .party
                        from urllib.parse import urlparse
                        parsed = urlparse(self.base_url)
                        # Convert .party to .st for image URLs (coomer.party/kemono.party images are at .st)
                        netloc = parsed.netloc.replace('.party', '.st')
                        img_domain = f"img.{netloc}"
                        profile_image_url = f"https://{img_domain}/icons/{platform}/{creator_id}"
                        banner_image_url = f"https://{img_domain}/banners/{platform}/{creator_id}"
                        return Creator(
                            creator_id=creator_id,
                            service_id=self.service_id,
                            platform=platform,
                            username=first_post.get('user', creator_id),
                            display_name=first_post.get('user', creator_id),
                            profile_image_url=profile_image_url,
                            banner_image_url=banner_image_url
                        )

            self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
            return None

        except Exception as e:
            self.log(f"Error getting creator {platform}/{creator_id}: {e}", 'error')
            return None

    async def get_creator_posts(self, platform: str, creator_id: str, offset: int = 0) -> List[Post]:
        """Get one page of a creator's posts (50 per page by default).

        ``offset`` is sent as the ``o`` query parameter when greater than 0.
        Returns an empty list on any non-200 response or error.
        """
        self._delay_between_items()
        try:
            session = await self._get_session()

            url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
            params = {'o': offset} if offset > 0 else {}

            async with session.get(url, params=params) as resp:
                if resp.status == 200:
                    data = await resp.json(content_type=None)
                    return [Post.from_api(p, self.service_id, platform, creator_id, self.base_url) for p in data]
                elif resp.status == 404:
                    self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
                else:
                    self.log(f"Failed to get posts: HTTP {resp.status}", 'warning')
                return []

        except Exception as e:
            self.log(f"Error getting posts for {platform}/{creator_id}: {e}", 'error')
            return []

    async def get_all_creator_posts(self, platform: str, creator_id: str,
                                    since_date: str = None, max_posts: int = None,
                                    progress_callback=None) -> List[Post]:
        """Fetch all posts with pagination.

        Args:
            platform: Platform segment of the URL.
            creator_id: Creator identifier.
            since_date: Stop at the first post whose ``published_at`` is
                ``<=`` this value (presumably ISO-8601 strings so that string
                order matches date order - confirm against Post.from_api).
            max_posts: Stop once this many posts have been collected.
            progress_callback: Called as ``progress_callback(page, total_posts)``
                after each page.

        Returns:
            Posts collected so far when a stop condition or an empty page is hit.
        """
        all_posts = []
        offset = 0
        page = 0

        self.log(f"Fetching posts for {platform}/{creator_id}", 'info')

        while True:
            posts = await self.get_creator_posts(platform, creator_id, offset)
            if not posts:
                # Empty page (or a failed request) ends pagination
                break

            for post in posts:
                # Stop if we've reached posts we've already seen
                if since_date and post.published_at and post.published_at <= since_date:
                    self.log(f"Reached already-seen post date: {post.published_at}", 'debug')
                    return all_posts

                all_posts.append(post)

                if max_posts and len(all_posts) >= max_posts:
                    self.log(f"Reached max posts limit: {max_posts}", 'debug')
                    return all_posts

            page += 1
            offset += self.POSTS_PER_PAGE

            if progress_callback:
                progress_callback(page, len(all_posts))

            self._delay_between_batches()

        self.log(f"Fetched {len(all_posts)} posts for {platform}/{creator_id}", 'info')
        return all_posts

    async def get_post(self, platform: str, creator_id: str, post_id: str) -> Optional[Post]:
        """Get single post by ID; returns None on a non-200 response or error."""
        self._delay_between_items()
        try:
            session = await self._get_session()

            url = f"{self.api_url}/{platform}/user/{creator_id}/post/{post_id}"
            async with session.get(url) as resp:
                if resp.status == 200:
                    data = await resp.json(content_type=None)
                    # Single post endpoint wraps response in {"post": {...}}
                    if isinstance(data, dict) and 'post' in data:
                        data = data['post']
                    return Post.from_api(data, self.service_id, platform, creator_id, self.base_url)
                return None

        except Exception as e:
            self.log(f"Error getting post {post_id}: {e}", 'error')
            return None

    async def search_creators(self, query: str, platform: str = None) -> List[Dict]:
        """Search for creators by name.

        Performs a case-insensitive substring match on ``name`` over the full
        creators list, optionally restricted to entries whose ``service``
        equals ``platform``. Returns at most 50 results, most favorited first.
        """
        self._delay_between_items()
        try:
            # Get all creators and filter locally (API doesn't have search endpoint)
            all_creators = await self.get_all_creators()

            query_lower = query.lower()
            results = []

            for creator in all_creators:
                if platform and creator.get('service') != platform:
                    continue

                name = (creator.get('name') or '').lower()
                if query_lower in name:
                    results.append({
                        'id': creator.get('id'),
                        'name': creator.get('name'),
                        'service': creator.get('service'),
                        'indexed': creator.get('indexed'),
                        'updated': creator.get('updated'),
                        'favorited': creator.get('favorited', 0)
                    })

            # Sort by favorited count (popularity). ``or 0`` keeps a null count from
            # raising TypeError in the comparison and discarding every result.
            results.sort(key=lambda x: x.get('favorited') or 0, reverse=True)
            return results[:50]  # Limit results

        except Exception as e:
            self.log(f"Error searching creators: {e}", 'error')
            return []

    def get_attachment_url(self, server_path: str) -> str:
        """Convert server path to full download URL.

        Empty input yields ''; values starting with 'http' are returned as-is;
        anything else is appended to ``{base_url}/data``.
        """
        if not server_path:
            return ''
        if server_path.startswith('http'):
            return server_path
        return f"{self.base_url}/data{server_path}"

    def get_thumbnail_url(self, server_path: str) -> str:
        """Get thumbnail URL for an attachment.

        Same rules as get_attachment_url(), but under ``{base_url}/thumbnail/data``.
        """
        if not server_path:
            return ''
        if server_path.startswith('http'):
            return server_path
        return f"{self.base_url}/thumbnail/data{server_path}"

    @classmethod
    def get_supported_platforms(cls, service_id: str) -> List[str]:
        """Get list of supported platforms for a service ([] if unknown)"""
        return cls.SUPPORTED_PLATFORMS.get(service_id, [])

    @classmethod
    def is_valid_service(cls, service_id: str) -> bool:
        """Check if service ID is one of the keys of DEFAULT_SERVICE_URLS"""
        return service_id in cls.DEFAULT_SERVICE_URLS

    @classmethod
    def get_service_ids(cls) -> List[str]:
        """Get list of all service IDs (keys of DEFAULT_SERVICE_URLS)"""
        return list(cls.DEFAULT_SERVICE_URLS)
|
||||
389
modules/paid_content/bellazon_client.py
Normal file
389
modules/paid_content/bellazon_client.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Bellazon Forum Thread Client for Paid Content
|
||||
|
||||
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
|
||||
as a "creator" and each reply with media as a post.
|
||||
|
||||
Only bellazon-hosted uploads are captured (external image host links are
|
||||
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads.

    A thread (topic) plays the role of a creator; every reply that contains
    at least one recognised media item becomes a Post.
    """

    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    BASE_URL = 'https://www.bellazon.com/main'

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}

    def __init__(self, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch first page of a thread and return profile-like info.

        Returns dict with: username (slug), display_name, post_count (an
        estimate: comments on page 1 times page count), page_count, topic_url.
        Returns None on a non-200 response or a request error.
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None

        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)

        # Extract thread title from <h1>
        title = self._extract_title(page_html)

        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)

        # Count comments on this page to estimate total
        comment_ids = re.findall(r'data-commentid="(\d+)"', page_html)
        per_page = len(comment_ids) or 20
        estimated_comments = per_page * page_count

        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }

    async def get_posts(self, topic_id: str, topic_url: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all pages of a thread and return posts with media.

        Args:
            topic_id: Thread id, stored as ``creator_id`` on each Post.
            topic_url: Thread URL without trailing slash; ``/page/N/`` is appended.
            known_post_ids: Post ids to skip. When a non-empty set is passed it
                is updated in place with every newly returned post id.
            progress_callback: Called as ``progress_callback(total_posts)``
                after each parsed page.

        Returns:
            Posts collected so far; scraping stops at the first page that
            cannot be fetched, and errors are logged rather than raised.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        # Defined before the try block so the summary log at the end cannot hit
        # an unbound name when an error occurs before the page count is known.
        page_count = 0

        # Fetch page 1 to get page count
        page1_url = f'{topic_url}/page/1/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                page_html = await self._fetch_page(session, page1_url)
                if page_html is None:
                    return posts

                page_count = self._extract_page_count(page_html)
                self.log(f"Thread has {page_count} pages", 'info')

                # Parse page 1
                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)

                if progress_callback:
                    progress_callback(len(posts))

                # Parse remaining pages
                for page_num in range(2, page_count + 1):
                    page_url = f'{topic_url}/page/{page_num}/'
                    await asyncio.sleep(1)  # Rate limit

                    page_html = await self._fetch_page(session, page_url)
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                        break

                    page_posts = self._parse_page(page_html, topic_id, known)
                    posts.extend(page_posts)

                    if progress_callback:
                        progress_callback(len(posts))

                    self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')

        except Exception as e:
            self.log(f"Error scraping Bellazon thread: {e}", 'error')

        self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Comments whose id is already in ``known`` are skipped; ids of returned
        posts are added to ``known`` (the set is mutated).
        """
        posts: List[Post] = []

        # Split HTML into comment blocks using data-commentid markers
        # Each comment starts with data-commentid="..." and contains a content block
        comment_pattern = re.compile(
            r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
            re.DOTALL
        )

        matches = list(comment_pattern.finditer(page_html))
        if not matches:
            return posts

        for i, match in enumerate(matches):
            comment_id = match.group(1)
            post_id = f"comment_{comment_id}"

            if post_id in known:
                continue

            quotedata_raw = match.group(2)

            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(quotedata_raw)

            # Extract the content block for this comment: everything up to the
            # next comment marker (or the end of the page for the last one)
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]

            # Find the actual content within data-role="commentContent"
            # The closing pattern is </div> followed by blank lines then </div>
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
                content_block, re.DOTALL
            )
            if not content_match:
                # Fallback: grab everything from commentContent to ipsEntry__foot
                content_match = re.search(
                    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                    content_block, re.DOTALL
                )
            if not content_match:
                continue

            content_html = content_match.group(1)

            # Extract media from content
            attachments = self._extract_media(content_html)

            if not attachments:
                continue  # Skip text-only replies

            # Build published_at from timestamp
            published_at = None
            if timestamp:
                try:
                    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    published_at = dt.isoformat()
                except (TypeError, ValueError, OverflowError, OSError):
                    # Non-numeric or out-of-range timestamp: keep the post,
                    # just without a publication date.
                    published_at = None

            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=published_at,
                attachments=attachments,
            )
            posts.append(post)
            known.add(post_id)

        return posts

    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four patterns are tried in order; a URL already collected by an earlier
        pattern is not added again.
        """
        attachments: List[Attachment] = []
        seen_urls: set = set()

        # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
        for m in re.finditer(
            r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
            content_html
        ):
            full_url = self._normalize_url(m.group(1))
            if full_url in seen_urls:
                continue
            # Skip thumbnails as the full URL
            if '_thumb.' in full_url or '.thumb.' in full_url:
                continue
            seen_urls.add(full_url)
            attachments.append(self._make_attachment(full_url, 'image'))

        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in re.finditer(
            r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            if '_thumb.' in url or '.thumb.' in url:
                continue
            ext = self._get_extension(url)
            if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
                seen_urls.add(url)
                file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
                attachments.append(self._make_attachment(url, file_type))

        # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
        for m in re.finditer(
            r'<source\s+src="([^"]+)"[^>]*type="video/',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            seen_urls.add(url)
            name = self._filename_from_url(url)
            attachments.append(self._make_attachment(url, 'video', name=name))

        # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
        # These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        for m in re.finditer(
            r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
            content_html
        ):
            att_url = self._normalize_url(m.group(1))
            filename = m.group(2).strip()
            if att_url in seen_urls:
                continue
            # The URL carries no extension, so the type is taken from the link text
            ext = self._get_extension(filename)
            if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
                seen_urls.add(att_url)
                file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
                attachments.append(self._make_attachment(att_url, file_type, name=filename))

        return attachments

    def _make_attachment(self, url: str, file_type: str, name: str = None) -> Attachment:
        """Create an Attachment from a URL.

        ``name`` defaults to the last path segment of ``url``; the extension is
        derived from the name and stored as None when there is none.
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)

        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext if ext else None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None (non-200 or request error)."""
        try:
            async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None

    @staticmethod
    def _extract_slug(url: str, topic_id: str) -> str:
        """Extract slug from URL like /topic/39089-india-reynolds/

        Falls back to ``topic_id`` when the URL does not match.
        """
        m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
        if m:
            return m.group(1).strip('/')
        return topic_id

    @staticmethod
    def _extract_title(page_html: str) -> Optional[str]:
        """Extract thread title from <h1>, falling back to <title> without the site suffix."""
        m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
        if m:
            return html.unescape(m.group(1).strip())
        m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove site suffix
            title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
            return title
        return None

    @staticmethod
    def _extract_page_count(page_html: str) -> int:
        """Extract total page count from 'Page X of Y' (1 when not found)."""
        m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
        if m:
            return int(m.group(1))
        return 1

    @staticmethod
    def _parse_quotedata(raw: str) -> tuple:
        """Parse HTML-encoded JSON quotedata, return (username, unix_timestamp).

        Returns ('', None) when the value is not valid JSON or does not decode
        to a JSON object.
        """
        try:
            data = json.loads(html.unescape(raw))
        except (json.JSONDecodeError, ValueError, TypeError):
            return '', None
        # Valid JSON that is a list/number/string has no .get(); treat it as missing data
        if not isinstance(data, dict):
            return '', None
        return data.get('username', ''), data.get('timestamp')

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
        url = html.unescape(url)  # e.g. "&amp;" becomes "&"
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = 'https://www.bellazon.com' + url
        elif not url.startswith('http'):
            url = 'https://www.bellazon.com/main/' + url
        return url

    @staticmethod
    def _get_extension(filename_or_url: str) -> str:
        """Get lowercase file extension from a filename or URL ('' if none)."""
        # Strip query params and fragment
        clean = filename_or_url.split('?')[0].split('#')[0]
        # Only a dot in the last path segment counts (ignores dots in the host name)
        if '.' in clean.split('/')[-1]:
            return clean.rsplit('.', 1)[-1].lower()
        return ''

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Extract filename from URL path ('unnamed' when the path is empty)."""
        path = urlparse(url).path
        name = path.rstrip('/').split('/')[-1]
        return name if name else 'unnamed'
|
||||
468
modules/paid_content/besteyecandy_client.py
Normal file
468
modules/paid_content/besteyecandy_client.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
BestEyeCandy.com Client for Paid Content
|
||||
|
||||
Scrapes celebrity photo galleries from BestEyeCandy.com.
|
||||
Each celeb has a unique CID and paginated photo listings.
|
||||
|
||||
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
|
||||
detail page to determine the pattern (server hostname + name format),
|
||||
then construct all remaining URLs from photo IDs found on listing pages.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BestEyeCandyClient(LoggingMixin):
|
||||
"""Client for scraping BestEyeCandy.com celebrity photo galleries."""
|
||||
|
||||
SERVICE_ID = 'besteyecandy'
|
||||
PLATFORM = 'besteyecandy'
|
||||
BASE_URL = 'https://besteyecandy.com'
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
}
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
|
||||
self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
|
||||
self.unified_db = unified_db
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie support
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_cookies(self) -> Optional[list]:
|
||||
"""Load cookies from the scrapers table for besteyecandy."""
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
|
||||
(self.SERVICE_ID,))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
return data['cookies']
|
||||
elif isinstance(data, list):
|
||||
return data
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
    """Build an aiohttp CookieJar from a list of cookie dicts.

    Each dict may carry ``name``, ``value``, ``domain``, ``path`` (default
    '/') and ``secure``. Cookies without a name, or that the standard
    library rejects, are skipped so one bad entry does not discard the rest.

    Args:
        cookies_list: Cookie dicts as returned by ``_get_cookies``.

    Returns:
        A CookieJar created with ``unsafe=True`` holding the accepted cookies.
    """
    from http.cookies import CookieError, SimpleCookie

    jar = aiohttp.CookieJar(unsafe=True)
    for cookie in cookies_list:
        name = cookie.get('name', '')
        if not name:
            # SimpleCookie rejects an empty key; nothing useful to store anyway
            continue

        try:
            sc = SimpleCookie()
            sc[name] = cookie.get('value', '')
            sc[name]['domain'] = cookie.get('domain', '')
            sc[name]['path'] = cookie.get('path', '/')
            if cookie.get('secure'):
                sc[name]['secure'] = True
        except CookieError as e:
            self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
            continue

        # aiohttp documents ``response_url`` as a yarl.URL, which a
        # urllib.parse.urlparse() result does not satisfy. Omitting it stores
        # the cookie under the domain/path set on the morsel above.
        jar.update_cookies(sc)

    return jar
|
||||
|
||||
def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
    """Return a new aiohttp session, attaching stored cookies when available.

    Args:
        timeout: Client timeout to use; a 60-second total timeout is applied
            when omitted.

    Returns:
        A fresh ``aiohttp.ClientSession``. A warning is logged when no
        cookies could be loaded, and the session is created without a jar.
    """
    effective_timeout = aiohttp.ClientTimeout(total=60) if timeout is None else timeout

    stored_cookies = self._get_cookies()
    if not stored_cookies:
        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=effective_timeout)

    cookie_jar = self._build_cookie_jar(stored_cookies)
    self.log(f"Loaded {len(stored_cookies)} cookies for session", 'debug')
    return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=cookie_jar)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
    """Fetch the first listing page of a celeb and summarise it as a profile.

    Args:
        cid: Numeric celeb ID used in the ``cid-<cid>`` URL segment.
        celeb_slug: Slug used as the listing page file name.

    Returns:
        Dict with ``username``, ``display_name``, ``post_count``,
        ``page_count`` and ``celeb_url``; ``None`` when the page cannot be
        fetched or does not answer with HTTP 200.
    """
    listing_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                   f'sortedby-age/page-1/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            async with session.get(listing_url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                             'warning')
                    return None
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
        return None

    # Prefer the name parsed from the page; otherwise derive one from the slug.
    display_name = self._extract_celeb_name(page_html)
    if not display_name:
        display_name = celeb_slug.replace('-', ' ')

    # Page size is taken from the page itself, with 48 as the fallback.
    per_page = len(self._extract_photo_ids(page_html)) or 48

    return {
        'username': celeb_slug,
        'display_name': display_name,
        'post_count': self._extract_total_photos(page_html),
        'page_count': self._extract_page_count(page_html, photos_per_page=per_page),
        'celeb_url': listing_url,
    }
|
||||
|
||||
async def get_posts(self, cid: str, celeb_slug: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Walk every listing page and return one Post per page of photos.

    Post IDs are ``"page_N"``. The full-resolution URL template is learned
    once from the detail page of the first photo on page 1 and then applied
    to every photo ID found on the listing pages.

    Args:
        cid: Numeric celeb ID used in the URL.
        celeb_slug: Slug used as the listing page file name.
        known_post_ids: Post IDs to skip (still counted and paginated past).
        progress_callback: Optional callable receiving status strings.

    Returns:
        Posts for pages not in ``known_post_ids``. An empty list when page 1
        or the URL template cannot be obtained. If an exception interrupts
        the crawl it is logged and the posts gathered so far are returned.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    total_photos = 0

    try:
        async with self._create_session() as session:
            listing_prefix = (f'{self.BASE_URL}/section/celeb-photogallery/'
                              f'cid-{cid}/sortedby-age')

            # -- Phase 1: page 1 gives the page size and the first photo ID --
            page_html = await self._fetch_page(
                session, f'{listing_prefix}/page-1/{celeb_slug}.html')
            if page_html is None:
                return []

            first_page_ids = self._extract_photo_ids(page_html)
            photos_per_page = len(first_page_ids) or 48
            estimated_pages = self._extract_page_count(
                page_html, photos_per_page=photos_per_page)
            self.log(f"Estimated {estimated_pages} pages of photos "
                     f"({photos_per_page}/page)", 'info')

            url_pattern = None
            if first_page_ids:
                url_pattern = await self._discover_url_pattern(
                    session, first_page_ids[0], cid, celeb_slug)

            if not url_pattern:
                self.log("Could not determine full-res URL pattern", 'error')
                return []

            self.log(f"URL pattern: server={url_pattern['server']}, "
                     f"name_format={url_pattern['name_format']}, "
                     f"ext={url_pattern['ext']}", 'info')

            # -- Phase 2: one Post per listing page --
            page_num = 1
            while True:
                if page_num > 1:
                    # Page 1 is already in hand; later pages are fetched
                    # after a fixed pause to rate-limit requests.
                    await asyncio.sleep(2)
                    page_html = await self._fetch_page(
                        session,
                        f'{listing_prefix}/page-{page_num}/{celeb_slug}.html')
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping",
                                 'warning')
                        break

                page_ids = self._extract_photo_ids(page_html)
                if not page_ids:
                    self.log(f"Page {page_num}: no photos, stopping", 'info')
                    break

                total_photos += len(page_ids)
                more_pages = self._has_next_page(page_html)
                post_id = f"page_{page_num}"

                if post_id in known:
                    self.log(f"Page {page_num}: already known, skipping",
                             'debug')
                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos (skipped known)")
                else:
                    attachments = []
                    for photo_id in page_ids:
                        dl_url = self._construct_full_res_url(url_pattern, photo_id)
                        attachments.append(Attachment(
                            name=dl_url.rsplit('/', 1)[-1],
                            file_type='image',
                            extension=url_pattern.get('ext', 'jpg'),
                            server_path=dl_url,
                            download_url=dl_url,
                        ))

                    posts.append(Post(
                        post_id=post_id,
                        service_id=self.SERVICE_ID,
                        platform=self.PLATFORM,
                        creator_id=cid,
                        title=f"Page {page_num}",
                        content=f"{len(page_ids)} photos",
                        published_at=datetime.now(tz=timezone.utc).isoformat(),
                        attachments=attachments,
                    ))

                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos")

                    self.log(f"Page {page_num}/~{estimated_pages}: "
                             f"{len(page_ids)} photos", 'debug')

                if not more_pages:
                    break
                page_num += 1

    except Exception as e:
        self.log(f"Error scraping BestEyeCandy: {e}", 'error')

    self.log(f"Total: {len(posts)} new page-posts with "
             f"{total_photos} photos across all pages", 'info')
    return posts
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# URL pattern discovery
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                photo_id: str, cid: str,
                                celeb_slug: str) -> Optional[Dict]:
    """Visit one photo's detail page and derive the full-res URL template.

    Only a URL that contains ``photo_id`` is accepted. A URL without the ID
    cannot be turned into a per-photo template: every URL later built from it
    would be identical.

    Args:
        session: Open aiohttp session.
        photo_id: ID of the photo whose detail page is inspected.
        cid: Numeric celeb ID used in the URL.
        celeb_slug: Slug used in the detail page path.

    Returns:
        Dict with keys ``server``, ``dir_pattern``, ``name_format``, ``ext``
        and ``example_url`` (``{ID}`` marks where the photo ID goes), or
        ``None`` when the page cannot be fetched or no suitable URL is found.
    """
    detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                  f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')

    await asyncio.sleep(2)  # Rate limit
    page_html = await self._fetch_page(session, detail_url)
    if page_html is None:
        return None

    # Expected form:
    # https://euX.besteyecandy.com/section/large-photos/area-female/
    #     besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg
    # appearing in an <img src> or <a href>.
    patterns = [
        r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
        r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
    ]

    full_res_url = None
    for pattern in patterns:
        # Take the first candidate that actually embeds this photo's ID.
        full_res_url = next(
            (url for url in re.findall(pattern, page_html) if photo_id in url),
            None)
        if full_res_url:
            break

    if not full_res_url:
        self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                 'error')
        return None

    self.log(f"Found full-res URL: {full_res_url}", 'debug')

    parsed = urlparse(full_res_url)
    # Split "/dir/.../file.ext" into directory and file name.
    path_dir, _, filename = parsed.path.rpartition('/')
    if '.' in filename:
        name_without_ext, ext = filename.rsplit('.', 1)
    else:
        name_without_ext, ext = filename, 'jpg'

    return {
        'server': parsed.netloc,  # e.g. eu4.besteyecandy.com
        # e.g. /section/large-photos/area-female/besteyecandy-{ID}
        'dir_pattern': path_dir.replace(photo_id, '{ID}'),
        # e.g. Myleene_Klass_{ID}_BestEyeCandyCOM
        'name_format': name_without_ext.replace(photo_id, '{ID}'),
        'ext': ext,
        'example_url': full_res_url,
    }
|
||||
|
||||
def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
|
||||
"""Construct the full-res URL for a photo ID using the discovered pattern."""
|
||||
dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
|
||||
filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
|
||||
return f"https://{url_pattern['server']}{dir_path}/{filename}"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _extract_photo_ids(self, page_html: str) -> List[str]:
|
||||
"""Extract photo IDs from a listing page.
|
||||
|
||||
Photo links look like: href="...photo-12345.html"
|
||||
"""
|
||||
ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
|
||||
# Deduplicate while preserving order
|
||||
seen = set()
|
||||
unique_ids = []
|
||||
for pid in ids:
|
||||
if pid not in seen:
|
||||
seen.add(pid)
|
||||
unique_ids.append(pid)
|
||||
return unique_ids
|
||||
|
||||
@staticmethod
|
||||
def _extract_celeb_name(page_html: str) -> Optional[str]:
|
||||
"""Extract celebrity name from the page."""
|
||||
# Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
# Remove everything from "Photo Collection" or "@" onwards
|
||||
title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
|
||||
flags=re.IGNORECASE).strip()
|
||||
title = re.sub(r'\s*@.*$', '', title).strip()
|
||||
# Fallback: remove BestEyeCandy suffix
|
||||
title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
|
||||
title, flags=re.IGNORECASE).strip()
|
||||
if title:
|
||||
return title
|
||||
|
||||
# Try <h1> or <h2>
|
||||
m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
|
||||
if m:
|
||||
return html.unescape(m.group(1).strip())
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_total_photos(page_html: str) -> int:
|
||||
"""Extract total photo count from the page.
|
||||
|
||||
Handles European format (15.660) and US format (15,660).
|
||||
"""
|
||||
# Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
|
||||
# Require leading digit to avoid matching ", photo" from keywords
|
||||
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
num_str = m.group(1)
|
||||
# European format uses dots as thousands separators: 15.660
|
||||
# US format uses commas: 15,660
|
||||
# Remove both dots and commas (they're thousands separators)
|
||||
num_str = num_str.replace('.', '').replace(',', '')
|
||||
try:
|
||||
return int(num_str)
|
||||
except ValueError:
|
||||
pass
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
|
||||
"""Extract total page count from the listing page.
|
||||
|
||||
Uses total photo count divided by photos per page, or falls back
|
||||
to finding the maximum page number in pagination links.
|
||||
"""
|
||||
# Method 1: Calculate from total photos
|
||||
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
num_str = m.group(1).replace('.', '').replace(',', '')
|
||||
try:
|
||||
total = int(num_str)
|
||||
if total > 0:
|
||||
return (total + photos_per_page - 1) // photos_per_page
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Method 2: Find max page-N in pagination links for same celeb
|
||||
page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
|
||||
if page_nums:
|
||||
return max(page_nums)
|
||||
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _has_next_page(page_html: str) -> bool:
|
||||
"""Check if there's a 'Next Page' link on the current page."""
|
||||
return 'alt="Next Page"' in page_html
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession,
                      url: str) -> Optional[str]:
    """GET ``url`` with the client headers and return the body text.

    Returns:
        The response text on HTTP 200; ``None`` on any other status or on any
        exception (both are logged as warnings).
    """
    try:
        async with session.get(url, headers=self.HEADERS,
                               allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return None
|
||||
622
modules/paid_content/coppermine_client.py
Normal file
622
modules/paid_content/coppermine_client.py
Normal file
@@ -0,0 +1,622 @@
|
||||
"""
|
||||
Coppermine Gallery scraper client.
|
||||
|
||||
Coppermine is a PHP photo gallery with a nested structure:
|
||||
categories > sub-categories > albums > photos
|
||||
|
||||
One album maps to one Post with N Attachments.
|
||||
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urljoin, urlparse, parse_qs
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class CoppermineClient(LoggingMixin):
|
||||
SERVICE_ID = 'coppermine'
|
||||
PLATFORM = 'coppermine'
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
}
|
||||
|
||||
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
|
||||
|
||||
def __init__(self, log_callback=None):
    """Initialise the logging mixin for the Coppermine client.

    Args:
        log_callback: Forwarded unchanged to ``_init_logger``.
    """
    self._init_logger('PaidContent', log_callback, default_module='Coppermine')
|
||||
|
||||
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
    """Fetch the gallery root and extract profile metadata.

    Args:
        gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)

    Returns:
        Dict with ``username`` (the domain without ``www.``),
        ``display_name`` (cleaned page title), ``post_count`` (album total
        from the "N files in M albums" line, 0 if absent) and
        ``gallery_url``; ``None`` on failure.
    """
    root_url = self._build_url(gallery_url, 'index.php')
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            html = await self._fetch_page(session, root_url)
            if not html:
                return None

            # Site title comes from the <title> tag.
            title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
            site_title = title_match.group(1).strip() if title_match else 'Coppermine Gallery'

            # Decode the three basic entities; the substitutions previously
            # replaced each character with itself and so decoded nothing.
            for entity, char in (('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')):
                site_title = site_title.replace(entity, char)
            # Any other numeric or named entity is dropped.
            site_title = re.sub(r'&#\d+;', '', site_title)
            site_title = re.sub(r'&\w+;', '', site_title)

            # Stats line: "N files in M albums" (only the album count is used).
            total_albums = 0
            stats_match = re.search(
                r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
                html, re.IGNORECASE
            )
            if stats_match:
                total_albums = int(stats_match.group(2).replace(',', ''))

            # The domain doubles as the username.
            domain = urlparse(gallery_url).netloc.replace('www.', '')

            return {
                'username': domain,
                'display_name': site_title,
                'post_count': total_albums,
                'gallery_url': gallery_url,
            }
    except Exception as e:
        self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
        return None
|
||||
|
||||
async def get_posts(self, gallery_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None,
                    post_callback=None):
    """Crawl the gallery and deliver each new album as a Post.

    Phase 1 reads the top-level category links from the gallery root.
    Phase 2 crawls those categories recursively to collect album IDs.
    Phase 3 fetches every album not already known and hands it over.

    Args:
        gallery_url: Base gallery URL.
        known_post_ids: Post IDs already stored (``album_NNN``); skipped.
        progress_callback: Called with status message strings.
        post_callback: Async callable(post). When given, each album is
            passed to it as soon as it is fetched instead of being collected.

    Returns:
        List of Post objects when ``post_callback`` is None (empty on
        failure); ``None`` when ``post_callback`` is provided.
    """
    known = known_post_ids or set()
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
    posts_collected = None if post_callback is not None else []

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Phase 1: top-level categories from the root page.
            root_html = await self._fetch_page(
                session, self._build_url(gallery_url, 'index.php'))
            if not root_html:
                self.log("Failed to fetch gallery root", 'error')
                return [] if post_callback is None else None

            category_ids = self._extract_category_ids(root_html)
            self.log(f"Found {len(category_ids)} top-level categories", 'info')
            if progress_callback:
                progress_callback(f'Found {len(category_ids)} categories, crawling...')

            # Phase 2: gather album IDs; visited_cats is shared across the
            # whole crawl so no category is fetched twice.
            album_ids = set()
            visited_cats = set()
            for cat_id in category_ids:
                album_ids.update(await self._crawl_category(
                    session, gallery_url, cat_id, visited_cats, known, progress_callback
                ))

            new_album_ids = {aid for aid in album_ids
                             if f"album_{aid}" not in known}

            self.log(f"Found {len(new_album_ids)} new albums "
                     f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)",
                     'info')
            if progress_callback:
                progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...')

            # Phase 3: fetch each new album and deliver it.
            domain = urlparse(gallery_url).netloc.replace('www.', '')
            fetched = 0
            album_total = len(new_album_ids)

            for position, album_id in enumerate(sorted(new_album_ids), start=1):
                if progress_callback and position % 5 == 0:
                    progress_callback(
                        f'Fetching album {position}/{album_total}...'
                    )

                post = await self._fetch_album(session, gallery_url, album_id, domain)
                if post and post.attachments:
                    fetched += 1
                    if post_callback:
                        await post_callback(post)
                    else:
                        posts_collected.append(post)

                # Rate limit: 2s pause between albums
                await asyncio.sleep(2)

            self.log(f"Fetched {fetched} albums with attachments", 'info')
            return posts_collected

    except Exception as e:
        self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
        return [] if post_callback is None else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _build_url(self, gallery_url: str, page: str) -> str:
|
||||
"""Build a full URL from the gallery base and a page name."""
|
||||
base = gallery_url.rstrip('/')
|
||||
return f"{base}/{page}"
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                      max_retries: int = 3) -> Optional[str]:
    """Fetch a page and return its HTML text, or None on failure.

    HTTP 429 and connection-level errors are retried up to ``max_retries``
    attempts with a linearly growing wait (5s, 10s, ... for 429; 3s, 6s, ...
    for connection errors). Any other non-200 status or exception returns
    ``None`` immediately.
    """
    retryable_errors = (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                        aiohttp.ClientPayloadError, ConnectionResetError)
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 200:
                    return await resp.text()
                if resp.status != 429:
                    self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                    return None
                # Rate limited: wait, then let the loop try again.
                wait = 5 * (attempt + 1)
                self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
                await asyncio.sleep(wait)
        except retryable_errors as e:
            if attempt >= final_attempt:
                self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                return None
            wait = 3 * (attempt + 1)
            self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}",
                     'warning')
            await asyncio.sleep(wait)
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None

    # Every attempt was answered with 429.
    return None
|
||||
|
||||
def _extract_category_ids(self, html: str) -> List[str]:
|
||||
"""Extract category IDs from index.php page.
|
||||
|
||||
Looks for links like: index.php?cat=N
|
||||
"""
|
||||
cat_ids = []
|
||||
seen = set()
|
||||
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
|
||||
cat_id = match.group(1)
|
||||
if cat_id not in seen:
|
||||
seen.add(cat_id)
|
||||
cat_ids.append(cat_id)
|
||||
return cat_ids
|
||||
|
||||
def _extract_album_ids(self, html: str) -> List[str]:
|
||||
"""Extract album IDs from a category page.
|
||||
|
||||
Looks for links like: thumbnails.php?album=N
|
||||
"""
|
||||
album_ids = []
|
||||
seen = set()
|
||||
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
|
||||
album_id = match.group(1)
|
||||
if album_id not in seen:
|
||||
seen.add(album_id)
|
||||
album_ids.append(album_id)
|
||||
return album_ids
|
||||
|
||||
def _extract_page_count(self, html: str) -> int:
|
||||
"""Extract total page count from Coppermine pagination text.
|
||||
|
||||
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
|
||||
"""
|
||||
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return 1
|
||||
|
||||
async def _crawl_category(self, session: aiohttp.ClientSession,
                          gallery_url: str, cat_id: str,
                          visited: Set[str], known: Set[str],
                          progress_callback=None,
                          depth: int = 0) -> Set[str]:
    """Recursively collect every album ID reachable from a category.

    A category page may list albums (``thumbnails.php?album=N``) and/or
    sub-categories. Albums are gathered from all pages of the category
    (``index.php?cat=N&page=M``), then each unvisited sub-category is crawled
    the same way.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        cat_id: Category ID to crawl
        visited: Category IDs already crawled; updated in place to prevent loops
        known: Known post IDs; only passed on to recursive calls here
        progress_callback: Status callback
        depth: Current recursion depth; crawling stops beyond 10

    Returns:
        Set of album ID strings (empty if the category was already visited,
        is too deep, or its first page cannot be fetched)
    """
    if depth > 10 or cat_id in visited:
        return set()
    visited.add(cat_id)

    first_html = await self._fetch_page(
        session, self._build_url(gallery_url, f'index.php?cat={cat_id}'))
    if not first_html:
        return set()

    await asyncio.sleep(2)

    found_albums = set(self._extract_album_ids(first_html))

    # Remaining pages only add albums; sub-categories are taken from page 1.
    total_pages = self._extract_page_count(first_html)
    for page_num in range(2, total_pages + 1):
        page_html = await self._fetch_page(
            session,
            self._build_url(gallery_url, f'index.php?cat={cat_id}&page={page_num}'))
        if page_html:
            found_albums.update(self._extract_album_ids(page_html))
        await asyncio.sleep(2)

    # Drop the self-link and anything already crawled.
    child_ids = [c for c in self._extract_category_ids(first_html)
                 if c != cat_id and c not in visited]

    if progress_callback:
        pages_note = f' ({total_pages} pages)' if total_pages > 1 else ''
        progress_callback(
            f'Category {cat_id}: {len(found_albums)} albums, '
            f'{len(child_ids)} sub-categories'
            + pages_note
        )

    for child_id in child_ids:
        found_albums.update(await self._crawl_category(
            session, gallery_url, child_id, visited, known,
            progress_callback, depth + 1
        ))

    return found_albums
|
||||
|
||||
async def _fetch_album(self, session: aiohttp.ClientSession,
                       gallery_url: str, album_id: str,
                       domain: str) -> Optional[Post]:
    """Fetch every page of an album and build a Post from it.

    Album pages are ``thumbnails.php?album=N`` and, for later pages,
    ``thumbnails.php?album=N&page=M``.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        album_id: Album ID to fetch
        domain: Domain name, stored as the post's creator_id

    Returns:
        Post with ID ``album_<album_id>``, the album title in ``content`` and
        the collected attachments; None when the first page cannot be fetched
        or no attachments are found.
    """
    first_html = await self._fetch_page(
        session, self._build_url(gallery_url, f'thumbnails.php?album={album_id}'))
    if not first_html:
        return None

    # Title and date are taken from the first page only.
    title = self._extract_album_title(first_html) or f"Album {album_id}"

    attachments = self._extract_attachments(first_html, gallery_url)

    for page_num in range(2, self._extract_page_count(first_html) + 1):
        page_html = await self._fetch_page(
            session,
            self._build_url(gallery_url, f'thumbnails.php?album={album_id}&page={page_num}'))
        if page_html:
            attachments.extend(self._extract_attachments(page_html, gallery_url))
        await asyncio.sleep(2)

    if not attachments:
        return None

    return Post(
        post_id=f"album_{album_id}",
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=domain,
        title=None,
        content=title,
        published_at=self._extract_album_date(first_html, title),
        attachments=attachments,
    )
|
||||
|
||||
def _extract_album_title(self, html: str) -> Optional[str]:
    """Extract the album title from an album page.

    Sources are tried in order: the last breadcrumb segment, the first
    non-trivial ``<h1>``/``<h2>``/``<h3>`` heading, then the longest
    " - "-separated part of ``<title>``. Each candidate is passed through
    ``_clean_text``.

    Returns:
        The title, or None when no source yields one.
    """
    flags = re.DOTALL | re.IGNORECASE

    # Breadcrumb, e.g. "Home > Category > Sub > Album Title": take the text
    # after the last ">" once markup has been removed.
    crumb = re.search(
        r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
        html, flags
    )
    if crumb:
        plain = re.sub(r'<[^>]+>', ' ', crumb.group(1))
        segments = [seg.strip() for seg in plain.split('>') if seg.strip()]
        if segments:
            candidate = self._clean_text(segments[-1])
            if candidate and candidate.lower() not in ('home', 'index', 'gallery'):
                return candidate

    # Headings: accept the first one longer than two characters.
    for tag in ('h1', 'h2', 'h3'):
        heading = re.search(rf'<{tag}[^>]*>(.*?)</{tag}>', html, flags)
        if heading:
            candidate = self._clean_text(heading.group(1))
            if candidate and len(candidate) > 2:
                return candidate

    # <title>: usually "Site Name - Album Title" or the reverse; the longest
    # part is used as a heuristic for the album name.
    page_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if page_title:
        text = page_title.group(1).strip()
        if ' - ' in text:
            text = max((part.strip() for part in text.split(' - ')), key=len)
        if text:
            return self._clean_text(text)

    return None
|
||||
|
||||
def _extract_album_date(self, html: str, title: str) -> str:
|
||||
"""Extract album date from breadcrumb year + title month/day.
|
||||
|
||||
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
|
||||
Title: "January 11 - Leaving Golden Globes afterparty..."
|
||||
|
||||
Returns ISO date string, or current datetime as fallback.
|
||||
"""
|
||||
MONTHS = {
|
||||
'january': 1, 'february': 2, 'march': 3, 'april': 4,
|
||||
'may': 5, 'june': 6, 'july': 7, 'august': 8,
|
||||
'september': 9, 'october': 10, 'november': 11, 'december': 12,
|
||||
}
|
||||
|
||||
# Extract year from breadcrumb path (look for 4-digit year in links)
|
||||
year = None
|
||||
# Breadcrumb links: index.php?cat=155">2026</a>
|
||||
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
|
||||
year = int(m.group(1))
|
||||
|
||||
# Also try path segments in albums/ URLs for year
|
||||
if not year:
|
||||
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
|
||||
if path_match:
|
||||
year = int(path_match.group(1))
|
||||
|
||||
# Extract month and day from album title
|
||||
month, day = None, None
|
||||
if title:
|
||||
# "January 11 - ..." or "March 3 - ..."
|
||||
date_match = re.match(
|
||||
r'(\w+)\s+(\d{1,2})\b', title
|
||||
)
|
||||
if date_match:
|
||||
month_name = date_match.group(1).lower()
|
||||
if month_name in MONTHS:
|
||||
month = MONTHS[month_name]
|
||||
day = int(date_match.group(2))
|
||||
|
||||
# Build date from breadcrumb year + title month/day
|
||||
if year and month and day:
|
||||
try:
|
||||
return datetime(year, month, day).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
if year and month:
|
||||
try:
|
||||
return datetime(year, month, 1).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
if year:
|
||||
return datetime(year, 1, 1).isoformat()
|
||||
|
||||
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
|
||||
MONTH_ABBR = {
|
||||
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
|
||||
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
|
||||
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
|
||||
}
|
||||
added_match = re.search(
|
||||
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
|
||||
)
|
||||
if added_match:
|
||||
m_abbr = added_match.group(1).lower()
|
||||
if m_abbr in MONTH_ABBR:
|
||||
try:
|
||||
return datetime(
|
||||
int(added_match.group(3)),
|
||||
MONTH_ABBR[m_abbr],
|
||||
int(added_match.group(2))
|
||||
).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Also try "last one added on Jan 13, 2026" from album_stat
|
||||
stat_match = re.search(
|
||||
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
|
||||
)
|
||||
if stat_match:
|
||||
m_abbr = stat_match.group(1).lower()
|
||||
if m_abbr in MONTH_ABBR:
|
||||
try:
|
||||
return datetime(
|
||||
int(stat_match.group(3)),
|
||||
MONTH_ABBR[m_abbr],
|
||||
int(stat_match.group(2))
|
||||
).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return datetime.now().isoformat()
|
||||
|
||||
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
    """Collect the photos referenced by an album page as attachments.

    Thumbnail ``<img>`` sources under ``albums/`` whose path contains
    ``thumb_`` or ``normal_`` are converted to full-resolution URLs via
    ``_thumb_to_fullres``. If that yields nothing, images wrapped in
    ``displayimage.php`` links are tried instead. Each full-resolution URL
    is emitted once.

    Args:
        html: Raw HTML of the album page.
        gallery_url: Base gallery URL used to resolve relative sources.

    Returns:
        Attachments in document order.
    """
    collected: List[Attachment] = []
    known_urls = set()

    def collect(thumb_src: str) -> None:
        # Resolve one thumbnail source and record it unless already seen.
        resolved = self._thumb_to_fullres(thumb_src, gallery_url)
        if not resolved or resolved in known_urls:
            return
        known_urls.add(resolved)

        basename = resolved.rsplit('/', 1)[-1]
        _, dot, suffix = basename.rpartition('.')
        ext = suffix.lower() if dot else ''

        collected.append(Attachment(
            name=basename,
            server_path=resolved,  # use as dedup key
            file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
            extension=ext or None,
            download_url=resolved,
        ))

    # Primary pass, e.g.
    #   <img src="albums/path/thumb_filename.jpg" ...>
    #   <img src="albums/path/normal_filename.jpg" ...>
    for hit in re.finditer(
        r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
        html, re.IGNORECASE
    ):
        collect(hit.group(1))

    # Secondary pass for themes that wrap thumbnails in links:
    #   <a href="displayimage.php?..."><img src="albums/...">
    if not collected:
        for hit in re.finditer(
            r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
            r'\s*<img[^>]+src=["\']([^"\']+)["\']',
            html, re.IGNORECASE | re.DOTALL
        ):
            collect(hit.group(1))

    return collected
|
||||
|
||||
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
|
||||
"""Convert a thumbnail URL to a full-resolution URL.
|
||||
|
||||
Strips `thumb_` or `normal_` prefix from the filename and
|
||||
prepends the gallery base URL if needed.
|
||||
|
||||
Args:
|
||||
thumb_src: Thumbnail src attribute value
|
||||
gallery_url: Base gallery URL
|
||||
|
||||
Returns:
|
||||
Full-resolution image URL, or None if conversion fails
|
||||
"""
|
||||
if not thumb_src:
|
||||
return None
|
||||
|
||||
# Strip thumb_ or normal_ prefix from filename
|
||||
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
|
||||
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
|
||||
|
||||
# If the path is already absolute (starts with http), return as-is
|
||||
if fullres_path.startswith(('http://', 'https://')):
|
||||
return fullres_path
|
||||
|
||||
# Otherwise, make it absolute relative to gallery URL
|
||||
base = gallery_url.rstrip('/')
|
||||
fullres_path = fullres_path.lstrip('./')
|
||||
return f"{base}/{fullres_path}"
|
||||
|
||||
def _clean_text(self, text: str) -> str:
|
||||
"""Clean HTML entities and whitespace from text."""
|
||||
text = re.sub(r'&', '&', text)
|
||||
text = re.sub(r'<', '<', text)
|
||||
text = re.sub(r'>', '>', text)
|
||||
text = re.sub(r'"', '"', text)
|
||||
text = re.sub(r'&#\d+;', '', text)
|
||||
text = re.sub(r'&\w+;', '', text)
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
return text.strip()
|
||||
3616
modules/paid_content/db_adapter.py
Normal file
3616
modules/paid_content/db_adapter.py
Normal file
File diff suppressed because it is too large
Load Diff
297
modules/paid_content/embed_downloader.py
Normal file
297
modules/paid_content/embed_downloader.py
Normal file
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
Embed Downloader - Downloads embedded videos from posts using yt-dlp
|
||||
Supports: YouTube, Vimeo, Dailymotion, Twitch, and many other platforms
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
|
||||
|
||||
class EmbedDownloader(LoggingMixin):
|
||||
"""
|
||||
Download embedded videos from posts using yt-dlp
|
||||
|
||||
Wrapper around yt-dlp for downloading videos from various platforms
|
||||
embedded in creator posts.
|
||||
"""
|
||||
|
||||
# Named quality levels mapped to yt-dlp ``-f`` format selectors.
QUALITY_PRESETS = {
    'best': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
    # The height-capped presets share one selector shape; only the cap differs.
    **{
        f'{cap}p': (
            f'bestvideo[height<={cap}][ext=mp4]+bestaudio[ext=m4a]'
            f'/best[height<={cap}][ext=mp4]/best'
        )
        for cap in (1080, 720, 480)
    },
    'audio': 'bestaudio[ext=m4a]/bestaudio/best',
}
|
||||
|
||||
def __init__(self, ytdlp_path: str = None, log_callback=None):
    """Set up logging and resolve the yt-dlp executable.

    Args:
        ytdlp_path: Explicit path to yt-dlp; when falsy the executable is
            searched for with ``_find_ytdlp``.
        log_callback: Forwarded to ``_init_logger``.
    """
    self._init_logger('PaidContent', log_callback, default_module='Embed')

    # An explicit path wins; otherwise search for the executable.
    resolved = ytdlp_path if ytdlp_path else self._find_ytdlp()
    self.ytdlp_path = resolved
    if resolved:
        return

    self.log("yt-dlp not found, embed downloading will be disabled", 'warning')
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
|
||||
"""Find yt-dlp executable"""
|
||||
# Check common locations
|
||||
common_paths = [
|
||||
'/usr/local/bin/yt-dlp',
|
||||
'/usr/bin/yt-dlp',
|
||||
'/opt/homebrew/bin/yt-dlp',
|
||||
os.path.expanduser('~/.local/bin/yt-dlp'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
# Try to find via which
|
||||
try:
|
||||
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether a yt-dlp executable path has been resolved."""
    has_executable = self.ytdlp_path is not None
    return has_executable
|
||||
|
||||
async def download(self, url: str, output_dir: Path, quality: str = 'best',
                   filename_template: str = None) -> Dict:
    """
    Download a video from a URL with yt-dlp.

    Args:
        url: Video URL to download
        output_dir: Directory to save the video (created if missing)
        quality: Quality preset ('best', '1080p', '720p', '480p', 'audio');
            unknown values fall back to 'best'
        filename_template: Optional yt-dlp output template, relative to
            ``output_dir``

    Returns:
        Dict with ``success``. On success it carries ``file_path``,
        ``filename`` and ``file_size`` and, when yt-dlp's JSON output could
        be parsed, ``title``, ``duration``, ``uploader``, ``upload_date``,
        ``video_id`` and ``platform``. On failure it carries ``error``.
    """
    if not self.is_available():
        return {
            'success': False,
            'error': 'yt-dlp not available'
        }

    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        if filename_template:
            output_template = str(output_dir / filename_template)
        else:
            output_template = str(output_dir / 'embed_%(title).50s_%(id)s.%(ext)s')

        format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

        cmd = [
            self.ytdlp_path,
            '--no-playlist',
            '--no-warnings',
            '-f', format_str,
            '--merge-output-format', 'mp4',
            '-o', output_template,
            '--print-json',  # Output JSON with video info
            # End of options: embed URLs are untrusted, and one starting
            # with '-' must not be interpreted as a yt-dlp flag.
            '--',
            url
        ]

        self.log(f"Downloading embed: {url}", 'debug')

        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            error_msg = stderr.decode('utf-8', errors='replace').strip()
            # Collapse well-known failures into short, readable messages
            if 'Video unavailable' in error_msg:
                error_msg = 'Video unavailable or private'
            elif 'age-restricted' in error_msg.lower():
                error_msg = 'Video is age-restricted'
            elif 'members only' in error_msg.lower():
                error_msg = 'Video is members-only'
            elif len(error_msg) > 200:
                error_msg = error_msg[:200] + '...'

            self.log(f"yt-dlp failed: {error_msg}", 'warning')
            return {
                'success': False,
                'error': error_msg or f'yt-dlp exited with code {process.returncode}'
            }

        # The first stdout line that parses to a JSON object is the metadata.
        # Lines that parse to something else (a bare number or string) are
        # skipped so that the .get() calls below cannot fail.
        video_info = None
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                video_info = parsed
                break

        if not video_info:
            # No metadata: fall back to the newest file matching the default
            # output template (glob order alone is arbitrary).
            candidates = [p for p in output_dir.glob('embed_*') if p.is_file()]
            if candidates:
                file_path = max(candidates, key=lambda p: p.stat().st_mtime)
                return {
                    'success': True,
                    'file_path': str(file_path),
                    'filename': file_path.name,
                    'file_size': file_path.stat().st_size
                }
            return {
                'success': False,
                'error': 'Could not parse yt-dlp output'
            }

        raw_path = video_info.get('_filename') or video_info.get('filename')
        file_path = Path(raw_path) if raw_path else None
        video_id = video_info.get('id')

        # The reported name may not exist on disk (for instance after the
        # merge to mp4), so look the file up by video id. Skipped when the
        # id is empty: the pattern would degenerate to '**' and match
        # directories.
        if file_path is not None and not file_path.exists() and video_id:
            matches = [p for p in output_dir.glob(f"*{video_id}*") if p.is_file()]
            if matches:
                file_path = matches[0]

        # Either extractor field may be present but null.
        platform = video_info.get('extractor_key') or video_info.get('extractor') or 'unknown'

        return {
            'success': True,
            'file_path': str(file_path) if file_path else None,
            'filename': file_path.name if file_path else None,
            'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
            'title': video_info.get('title'),
            'duration': video_info.get('duration'),
            'uploader': video_info.get('uploader'),
            'upload_date': video_info.get('upload_date'),
            'video_id': video_id,
            'platform': str(platform).lower()
        }

    except asyncio.TimeoutError:
        # NOTE(review): no timeout is applied to communicate() above, so
        # nothing in this method is expected to raise this today.
        return {
            'success': False,
            'error': 'Download timed out'
        }
    except Exception as e:
        self.log(f"Error downloading embed: {e}", 'error')
        return {
            'success': False,
            'error': str(e)
        }
|
||||
|
||||
async def get_video_info(self, url: str) -> Dict:
    """
    Get video information without downloading.

    Args:
        url: Video URL

    Returns:
        Dict with ``success`` plus, on success, ``title``, ``duration``,
        ``uploader``, ``upload_date``, ``view_count``, ``like_count``,
        ``description``, ``thumbnail``, ``video_id``, ``platform`` and
        ``formats`` (the number of available formats); on failure ``error``.
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}

    try:
        cmd = [
            self.ytdlp_path,
            '--no-playlist',
            '--no-warnings',
            '-j',  # Output JSON
            '--no-download',
            # End of options: a URL starting with '-' must not be
            # interpreted as a yt-dlp flag.
            '--',
            url
        ]

        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            error_msg = stderr.decode('utf-8', errors='replace').strip()
            return {
                'success': False,
                'error': error_msg or f'yt-dlp exited with code {process.returncode}'
            }

        video_info = json.loads(stdout.decode('utf-8'))

        # Either extractor field, and 'formats', may be present but null.
        platform = video_info.get('extractor_key') or video_info.get('extractor') or 'unknown'
        formats = video_info.get('formats') or []

        return {
            'success': True,
            'title': video_info.get('title'),
            'duration': video_info.get('duration'),
            'uploader': video_info.get('uploader'),
            'upload_date': video_info.get('upload_date'),
            'view_count': video_info.get('view_count'),
            'like_count': video_info.get('like_count'),
            'description': video_info.get('description'),
            'thumbnail': video_info.get('thumbnail'),
            'video_id': video_info.get('id'),
            'platform': str(platform).lower(),
            'formats': len(formats)
        }

    except Exception as e:
        self.log(f"Error getting video info: {e}", 'error')
        return {
            'success': False,
            'error': str(e)
        }
|
||||
|
||||
@staticmethod
|
||||
def detect_platform(url: str) -> Optional[str]:
|
||||
"""Detect video platform from URL"""
|
||||
url_lower = url.lower()
|
||||
|
||||
if 'youtube.com' in url_lower or 'youtu.be' in url_lower:
|
||||
return 'youtube'
|
||||
elif 'vimeo.com' in url_lower:
|
||||
return 'vimeo'
|
||||
elif 'dailymotion.com' in url_lower:
|
||||
return 'dailymotion'
|
||||
elif 'twitch.tv' in url_lower:
|
||||
return 'twitch'
|
||||
elif 'twitter.com' in url_lower or 'x.com' in url_lower:
|
||||
return 'twitter'
|
||||
elif 'tiktok.com' in url_lower:
|
||||
return 'tiktok'
|
||||
elif 'instagram.com' in url_lower:
|
||||
return 'instagram'
|
||||
elif 'reddit.com' in url_lower:
|
||||
return 'reddit'
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
def is_supported_url(url: str) -> bool:
    """Tell whether ``detect_platform`` recognises the URL's platform."""
    platform = EmbedDownloader.detect_platform(url)
    return platform is not None
|
||||
1158
modules/paid_content/fansly_direct_client.py
Normal file
1158
modules/paid_content/fansly_direct_client.py
Normal file
File diff suppressed because it is too large
Load Diff
529
modules/paid_content/file_host_downloader.py
Normal file
529
modules/paid_content/file_host_downloader.py
Normal file
@@ -0,0 +1,529 @@
|
||||
"""
|
||||
Download files from external file hosting services
|
||||
Supports: Bunkr, Pixeldrain, Gofile, Cyberdrop, FileDitch
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin, RateLimitMixin
|
||||
|
||||
|
||||
class FileHostDownloader(LoggingMixin, RateLimitMixin):
|
||||
"""
|
||||
Download files from various file hosting services
|
||||
Used for manual import of PPV content
|
||||
"""
|
||||
|
||||
# Host key -> domains served by that host. The key selects the
# ``_download_<key>`` handler in ``download_url``.
SUPPORTED_HOSTS = dict(
    bunkr=[
        'bunkr.sk', 'bunkr.si', 'bunkr.la', 'bunkrr.ru',
        'bunkr.ph', 'bunkr.is', 'bunkr.ac', 'bunkr.cr',
    ],
    pixeldrain=['pixeldrain.com'],
    gofile=['gofile.io'],
    cyberdrop=['cyberdrop.me', 'cyberdrop.to', 'cyberdrop.cc'],
    fileditch=['fileditchfiles.me', 'fileditch.me'],
)
|
||||
|
||||
# Bunkr CDN hosts (food-themed subdomains of bunkr.ru), tried in this order.
BUNKR_CDNS = [
    f'{subdomain}.bunkr.ru'
    for subdomain in (
        'i-soup', 'i-burger', 'i-pizza', 'i-taco', 'i-fries', 'i-hotdog',
        'i-nachos', 'i-sushi', 'i-ramen', 'i-curry', 'i-kebab', 'i-pasta',
        'i-steak', 'i-salad', 'i-sandwich', 'i-waffle', 'i-pancake',
        'i-donut', 'i-cookie', 'i-cake', 'i-bacon', 'i-cheese', 'i-chicken',
        'i-fish', 'i-noodle', 'i-rice', 'i-bread',
        'burger', 'pizza', 'milkshake',
    )
]
|
||||
|
||||
def __init__(self, log_callback=None, progress_callback=None):
    """Set up logging, rate limiting and the optional progress hook.

    Args:
        log_callback: Forwarded to ``_init_logger``.
        progress_callback: Optional callable stored on the instance; per the
            inline note below it is called with
            ``(downloaded_bytes, total_bytes, filename)``.
    """
    self._init_logger('PaidContent', log_callback, default_module='FileHost')
    self._init_rate_limiter(min_delay=1, max_delay=3)
    self.progress_callback = progress_callback  # Called with (downloaded_bytes, total_bytes, filename)
|
||||
|
||||
def detect_host(self, url: str) -> Optional[str]:
    """Detect which file host a URL belongs to.

    The URL's host name is compared against ``SUPPORTED_HOSTS``. Port and
    user-info parts of the network location are ignored, and only a
    *leading* ``www.`` is dropped.

    Args:
        url: URL to classify.

    Returns:
        The host key from ``SUPPORTED_HOSTS``, or None if the URL cannot be
        parsed or its domain is not listed.
    """
    try:
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        return None

    # bytes input yields a bytes host name; only text URLs are supported.
    if not hostname or not isinstance(hostname, str):
        return None

    domain = hostname.lower()
    if domain.startswith('www.'):
        domain = domain[len('www.'):]

    for host, domains in self.SUPPORTED_HOSTS.items():
        if domain in domains:
            return host
    return None
|
||||
|
||||
def is_supported_url(self, url: str) -> bool:
    """Tell whether ``detect_host`` recognises the URL's file host."""
    host = self.detect_host(url)
    return host is not None
|
||||
|
||||
async def download_url(self, url: str, save_dir: Path) -> Dict:
    """
    Download the file(s) behind a file-host URL.

    The host is detected from the URL and the matching ``_download_<host>``
    handler is invoked; any exception it raises is logged and turned into a
    failure result.

    Returns: {'success': bool, 'files': [paths], 'error': str}
    """
    def failure(message: str) -> Dict:
        return {'success': False, 'files': [], 'error': message}

    host = self.detect_host(url)
    if not host:
        return failure('Unsupported host')

    handler = getattr(self, f'_download_{host}', None)
    if not handler:
        return failure(f'No handler for {host}')

    try:
        target_dir = Path(save_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        return await handler(url, target_dir)
    except Exception as exc:
        self.log(f"Error downloading from {host}: {exc}", 'error')
        return failure(str(exc))
|
||||
|
||||
async def _download_pixeldrain(self, url: str, save_dir: Path) -> Dict:
    """Download a single file or a list (album) from Pixeldrain.

    Supported URL forms are ``https://pixeldrain.com/u/FILEID`` and
    ``https://pixeldrain.com/l/LISTID``. Any other form is rejected with an
    error instead of being reported as a successful download of nothing.

    Args:
        url: Pixeldrain page URL.
        save_dir: Directory the files are written to.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}. For lists,
        individual failures are logged and skipped.
    """
    parsed = urlparse(url)
    path_parts = parsed.path.strip('/').split('/')

    if len(path_parts) < 2:
        return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}

    url_type, file_id = path_parts[0], path_parts[1]
    if url_type not in ('u', 'l'):
        return {
            'success': False,
            'files': [],
            'error': f'Unsupported Pixeldrain URL type: {url_type}'
        }

    files = []
    timeout = aiohttp.ClientTimeout(total=300)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        if url_type == 'u':
            # Single file
            api_url = f"https://pixeldrain.com/api/file/{file_id}/info"
            async with session.get(api_url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                info = await resp.json()

            download_url = f"https://pixeldrain.com/api/file/{file_id}"
            filename = info.get('name', f'{file_id}.bin')
            save_path = save_dir / self._sanitize_filename(filename)

            await self._download_file(session, download_url, save_path)
            files.append(str(save_path))

        else:
            # List (album)
            api_url = f"https://pixeldrain.com/api/list/{file_id}"
            async with session.get(api_url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                data = await resp.json()

            for i, item in enumerate(data.get('files', [])):
                item_id = item.get('id')
                if not item_id:
                    # A malformed entry must not abort the rest of the album.
                    self.log(f"Skipping Pixeldrain list entry {i} without an id", 'warning')
                    continue

                self._delay_between_items()
                filename = item.get('name', f'{i:03d}_{item_id}.bin')
                download_url = f"https://pixeldrain.com/api/file/{item_id}"
                save_path = save_dir / self._sanitize_filename(filename)

                try:
                    await self._download_file(session, download_url, save_path)
                    files.append(str(save_path))
                except Exception as e:
                    self.log(f"Failed to download {filename}: {e}", 'warning')

    return {'success': True, 'files': files, 'error': None}
|
||||
|
||||
async def _download_gofile(self, url: str, save_dir: Path) -> Dict:
    """Download every file of a Gofile folder.

    Expects ``https://gofile.io/d/CONTENTID``. A guest account token is
    requested first, then the folder listing is fetched with that token and
    each child of type ``file`` is downloaded. Individual download failures
    are logged and skipped.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}
    """
    def failed(reason) -> Dict:
        return {'success': False, 'files': [], 'error': reason}

    segments = urlparse(url).path.strip('/').split('/')
    if len(segments) < 2 or segments[0] != 'd':
        return failed('Invalid Gofile URL')
    content_id = segments[1]

    saved = []
    session_timeout = aiohttp.ClientTimeout(total=300)

    async with aiohttp.ClientSession(timeout=session_timeout) as session:
        # Guest account token (the API requires a POST for this)
        async with session.post('https://api.gofile.io/accounts') as account_resp:
            if account_resp.status != 200:
                return failed('Failed to get Gofile token')
            account = await account_resp.json()
            if account.get('status') != 'ok':
                return failed(f"Gofile API error: {account.get('status')}")
            token = account.get('data', {}).get('token')

        if not token:
            return failed('No Gofile token received')

        # The content API wants the bearer token plus an x-website-token header
        auth_headers = {
            'Authorization': f'Bearer {token}',
            'x-website-token': '4fd6sg89d7s6',
        }
        listing_url = f"https://api.gofile.io/contents/{content_id}"

        async with session.get(listing_url, headers=auth_headers) as listing_resp:
            if listing_resp.status == 401:
                return failed('Gofile authentication failed - websiteToken may have changed')
            if listing_resp.status != 200:
                return failed(f'Failed to get content: {listing_resp.status}')
            listing = await listing_resp.json()

        if listing.get('status') == 'error-notPremium':
            return failed('Gofile requires premium account for API access - try direct download')
        if listing.get('status') != 'ok':
            return failed(
                listing.get('data', {}).get('message', listing.get('status', 'Unknown error'))
            )

        children = listing.get('data', {}).get('children', {})
        for child_id, child in children.items():
            if child.get('type') != 'file':
                continue

            self._delay_between_items()
            link = child.get('link')
            name = child.get('name', f'{child_id}.bin')
            destination = save_dir / self._sanitize_filename(name)

            try:
                await self._download_file(session, link, destination, headers=auth_headers)
            except Exception as exc:
                self.log(f"Failed to download {name}: {exc}", 'warning')
            else:
                saved.append(str(destination))

    return {'success': True, 'files': saved, 'error': None}
|
||||
|
||||
async def _download_cyberdrop(self, url: str, save_dir: Path) -> Dict:
    """Download a Cyberdrop album or a single file.

    ``/a/ALBUMID`` URLs are treated as albums: the album page is fetched and
    every linked ``https://<sub>.cyberdrop.<tld>/...`` file is downloaded,
    with individual failures logged and skipped. Any other URL is downloaded
    directly as a single file.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}
    """
    saved = []
    session_timeout = aiohttp.ClientTimeout(total=300)

    async with aiohttp.ClientSession(timeout=session_timeout) as session:
        location = urlparse(url)
        segments = location.path.strip('/').split('/')
        is_album = len(segments) >= 2 and segments[0] == 'a'

        if not is_album:
            # Single file or direct CDN link
            name = location.path.split('/')[-1] or 'download.bin'
            destination = save_dir / self._sanitize_filename(name)
            await self._download_file(session, url, destination)
            saved.append(str(destination))
            return {'success': True, 'files': saved, 'error': None}

        async with session.get(url) as resp:
            if resp.status != 200:
                return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
            html = await resp.text()

        # File links look like href="https://fs-XXX.cyberdrop.to/FILE"
        file_links = re.findall(
            r'href="(https://[a-z0-9-]+\.cyberdrop\.[a-z]+/[^"]+)"', html
        )

        for index, file_url in enumerate(file_links):
            self._delay_between_items()
            name = file_url.split('/')[-1].split('?')[0] or f'{index:03d}.bin'
            destination = save_dir / self._sanitize_filename(name)

            try:
                await self._download_file(session, file_url, destination)
            except Exception as exc:
                self.log(f"Failed to download {name}: {exc}", 'warning')
            else:
                saved.append(str(destination))

    return {'success': True, 'files': saved, 'error': None}
|
||||
|
||||
async def _download_bunkr(self, url: str, save_dir: Path) -> Dict:
    """Download a Bunkr album or single file, with CDN fallback.

    ``/a/ALBUMID`` URLs are albums: every ``/f/...`` link on the album page
    is resolved to a direct URL and downloaded; items that cannot be
    resolved or downloaded are collected under ``failed``. Any other URL
    (e.g. ``/f/FILEID`` or ``/v/VIDEOID``) is treated as a single file page.

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}, plus
        'failed': [urls or filenames] when something failed. ``success`` is
        True only if at least one file was saved.
    """
    saved = []
    unsuccessful = []

    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    session_timeout = aiohttp.ClientTimeout(total=600)  # generous, for large files

    async with aiohttp.ClientSession(timeout=session_timeout, headers=browser_headers) as session:
        location = urlparse(url)
        segments = location.path.strip('/').split('/')
        is_album = len(segments) >= 2 and segments[0] == 'a'

        if not is_album:
            # Single file page
            direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, url)
            if not direct_url:
                return {'success': False, 'files': [], 'error': 'Could not get direct download URL'}

            name = direct_url.split('/')[-1].split('?')[0] or 'download.bin'
            destination = save_dir / self._sanitize_filename(name)

            await self._download_file(session, direct_url, destination,
                                      try_cdn_fallback=True, file_uuid=file_uuid)
            saved.append(str(destination))
        else:
            # Album page
            async with session.get(url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                html = await resp.text()

            # File pages are linked relatively as /f/...
            relative_links = re.findall(r'href="(/f/[^"]+)"', html)
            self.log(f"Found {len(relative_links)} files in Bunkr album", 'info')

            for index, relative in enumerate(relative_links):
                self._delay_between_items()

                page_url = f"https://{location.netloc}{relative}"
                direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, page_url)
                if not direct_url:
                    self.log(f"Could not get direct URL for {page_url}", 'warning')
                    unsuccessful.append(page_url)
                    continue

                name = direct_url.split('/')[-1].split('?')[0] or f'{index:03d}.bin'
                destination = save_dir / self._sanitize_filename(name)

                try:
                    await self._download_file(session, direct_url, destination,
                                              try_cdn_fallback=True, file_uuid=file_uuid)
                    saved.append(str(destination))
                    self.log(f"Downloaded: {name}", 'info')
                except Exception as exc:
                    self.log(f"Failed to download {name}: {exc}", 'warning')
                    unsuccessful.append(name)

    outcome = {'success': len(saved) > 0, 'files': saved, 'error': None}
    if unsuccessful:
        outcome['failed'] = unsuccessful
        outcome['error'] = f'{len(unsuccessful)} files failed to download'
    return outcome
|
||||
|
||||
async def _get_bunkr_direct_url_with_uuid(self, session: aiohttp.ClientSession, page_url: str) -> tuple:
    """Resolve a Bunkr file page to a direct download URL and file UUID.

    The page is fetched and searched for a ``<uuid>.<ext>`` file name. A
    ``*.bunkr.ru`` URL embedded in the page is returned if it answers a HEAD
    probe; otherwise, when a UUID was found, each host in ``BUNKR_CDNS`` is
    probed in order.

    Returns:
        ``(direct_url, file_uuid)``. ``direct_url`` is None when nothing was
        reachable; both are None when the page could not be fetched or an
        error occurred.
    """
    try:
        async with session.get(page_url) as resp:
            if resp.status != 200:
                return None, None
            html = await resp.text()

        # Prefer the explicit data-v attribute, else any UUID-shaped file name.
        uuid_hit = (
            re.search(r'data-v="([a-f0-9-]{36}\.[a-z0-9]+)"', html)
            or re.search(
                r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\.[a-z0-9]+)',
                html
            )
        )
        file_uuid = uuid_hit.group(1) if uuid_hit else None

        # A CDN URL already present in the page is the cheapest option.
        embedded_patterns = (
            r'href="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            r'src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            r'data-src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
        )
        for pattern in embedded_patterns:
            embedded = re.search(pattern, html)
            if not embedded:
                continue
            candidate = embedded.group(1)
            if await self._check_url_accessible(session, candidate):
                return candidate, file_uuid

        # Otherwise probe the known CDN hosts for the UUID.
        if file_uuid:
            self.log(f"Found file UUID: {file_uuid}, trying CDNs...", 'debug')
            for cdn in self.BUNKR_CDNS:
                candidate = f"https://{cdn}/{file_uuid}"
                if await self._check_url_accessible(session, candidate):
                    self.log(f"Found working CDN: {cdn}", 'debug')
                    return candidate, file_uuid

        return None, file_uuid
    except Exception as exc:
        self.log(f"Error getting Bunkr direct URL: {exc}", 'warning')
        return None, None
|
||||
|
||||
async def _check_url_accessible(self, session: aiohttp.ClientSession, url: str) -> bool:
    """Probe *url* with a HEAD request (following redirects, 10s total timeout).

    Returns True only when the final response status is 200; any exception
    raised while probing counts as "not accessible".
    """
    try:
        probe = session.head(url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10))
        async with probe as resp:
            reachable = resp.status == 200
    except Exception:
        reachable = False
    return reachable
|
||||
|
||||
async def _download_fileditch(self, url: str, save_dir: Path) -> Dict:
    """Download a file from FileDitch (Cloudflare-protected).

    The file name is taken from the ``f`` query parameter of *url*
    (e.g. ``file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4``). Cloudflare cookies
    and the matching User-Agent are obtained through ``CloudflareHandler``
    (FlareSolverr) and then used for the actual download.

    Args:
        url: FileDitch ``file.php?f=...`` URL.
        save_dir: Directory the file is written into.

    Returns:
        ``{'success': bool, 'files': [paths], 'error': str | None}``.

    Raises:
        Exception: propagated from ``_download_file`` when the download
            itself fails.
    """
    from modules.cloudflare_handler import CloudflareHandler

    # Extract filename from URL: file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4
    parsed = urlparse(url)
    params = parse_qs(parsed.query)
    file_path = params.get('f', [''])[0]
    if not file_path:
        return {'success': False, 'files': [], 'error': 'Invalid FileDitch URL - no file parameter'}

    filename = file_path.rsplit('/', 1)[-1]
    if not filename:
        return {'success': False, 'files': [], 'error': 'Could not extract filename from URL'}

    save_path = save_dir / self._sanitize_filename(filename)

    # Use CloudflareHandler to get cookies via FlareSolverr
    cf_handler = CloudflareHandler(
        module_name='FileDitch',
        flaresolverr_url='http://localhost:8191/v1',
        flaresolverr_enabled=True,
    )

    self.log('Bypassing Cloudflare for FileDitch via FlareSolverr...', 'info')
    # NOTE(review): this is called without await, so it presumably blocks the
    # event loop while FlareSolverr works - confirm and offload if needed.
    if not cf_handler.get_cookies_via_flaresolverr(url):
        return {'success': False, 'files': [], 'error': 'Failed to bypass Cloudflare for FileDitch'}

    cookies = cf_handler.get_cookies_dict()
    user_agent = cf_handler.get_user_agent()

    # Download with the obtained cookies
    timeout = aiohttp.ClientTimeout(total=3600)
    headers = {'User-Agent': user_agent or 'Mozilla/5.0'}

    # Hand the cookies to the session directly. The previous per-cookie
    # ``cookie_jar.update_cookies(..., response_url=url)`` passed a plain str
    # where aiohttp expects a yarl.URL and failed with AttributeError as soon
    # as there was at least one cookie to set.
    async with aiohttp.ClientSession(timeout=timeout, headers=headers, cookies=cookies or None) as session:
        await self._download_file(session, url, save_path, headers=headers)

    return {'success': True, 'files': [str(save_path)], 'error': None}
|
||||
|
||||
async def _download_file(self, session: aiohttp.ClientSession, url: str,
                         save_path: Path, headers: Dict = None,
                         try_cdn_fallback: bool = False, file_uuid: str = None) -> None:
    """Download a single file with streaming and optional CDN fallback.

    Args:
        session: Open aiohttp session used for every attempt.
        url: Primary download URL.
        save_path: Destination file; parent directories are created.
        headers: Optional per-request headers.
        try_cdn_fallback: When True and *file_uuid* is given, each host in
            ``self.BUNKR_CDNS`` is tried after *url*.
        file_uuid: UUID-based file name used to build the fallback URLs.

    Raises:
        Exception: when every candidate URL failed; the message carries the
            last error seen.

    A file that was only partially written by a failed attempt is removed,
    so a truncated download is never left at *save_path*.
    """
    save_path.parent.mkdir(parents=True, exist_ok=True)

    urls_to_try = [url]

    # If CDN fallback enabled and we have a file UUID, add alternate CDNs
    if try_cdn_fallback and file_uuid:
        for cdn in self.BUNKR_CDNS:
            alt_url = f"https://{cdn}/{file_uuid}"
            if alt_url != url:
                urls_to_try.append(alt_url)

    last_error = None
    for try_url in urls_to_try:
        file_opened = False  # True once this attempt has truncated/created save_path
        try:
            self.log(f"Downloading: {save_path.name} from {try_url[:60]}...", 'info')
            async with session.get(try_url, headers=headers) as resp:
                if resp.status == 200:
                    total_size = int(resp.headers.get('content-length', 0))
                    downloaded = 0
                    last_log_pct = 0

                    with open(save_path, 'wb') as f:
                        file_opened = True
                        async for chunk in resp.content.iter_chunked(65536):  # 64KB chunks
                            f.write(chunk)
                            downloaded += len(chunk)

                            # Log and callback progress every 2%
                            if total_size > 0:
                                pct = int(downloaded * 100 / total_size)
                                if pct >= last_log_pct + 2:
                                    self.log(f" {save_path.name}: {pct}% ({downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB)", 'info')
                                    last_log_pct = pct
                                    # Call progress callback if provided
                                    if self.progress_callback:
                                        try:
                                            self.progress_callback(downloaded, total_size, save_path.name)
                                        except Exception:
                                            pass  # Don't fail download due to callback error

                    self.log(f"Downloaded: {save_path.name} ({downloaded // (1024*1024)}MB)", 'info')
                    return  # Success
                else:
                    last_error = f"HTTP {resp.status}"
                    self.log(f"Download failed: {save_path.name} - {last_error}", 'warning')
        except Exception as e:
            last_error = str(e)
            self.log(f"Download error: {save_path.name} - {last_error}", 'warning')
            # Drop the truncated file so it cannot be mistaken for a finished
            # download; only done when this attempt actually opened it.
            if file_opened:
                try:
                    save_path.unlink()
                except OSError:
                    pass
            # Try next CDN
            continue

    raise Exception(f"Download failed after trying {len(urls_to_try)} URLs: {last_error}")
|
||||
|
||||
def _sanitize_filename(self, filename: str) -> str:
|
||||
"""Sanitize filename for filesystem"""
|
||||
if not filename:
|
||||
return 'download.bin'
|
||||
# Remove/replace invalid characters
|
||||
filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
|
||||
filename = filename.strip('. ')
|
||||
return filename or 'download.bin'
|
||||
|
||||
@classmethod
def get_supported_domains(cls) -> List[str]:
    """Return every domain listed in ``cls.SUPPORTED_HOSTS`` as one flat list.

    Domains keep the order of the mapping's values and of each value's
    entries.
    """
    return [
        domain
        for host_domains in cls.SUPPORTED_HOSTS.values()
        for domain in host_domains
    ]
|
||||
171
modules/paid_content/filename_parser.py
Normal file
171
modules/paid_content/filename_parser.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Filename parser for extracting dates and metadata from Fansly/paid content filenames.
|
||||
|
||||
Supports:
|
||||
1. Fansly snowflake IDs: 871257582885416960.mp4
|
||||
2. Embedded date format: 2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP.mp4
|
||||
3. Date-prefixed files: 2022-07-08.mp4 or 2022-07-08_video.mp4
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional, Dict, Tuple
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Fansly epoch calibrated from known files
# Based on: 513099759796367360 = 2023-05-11 15:51 UTC
FANSLY_EPOCH_MS = 1561483337101


def decode_fansly_snowflake(snowflake_id: str) -> Optional[datetime]:
    """
    Turn a Fansly snowflake ID into a UTC datetime.

    The ID is shifted right by 22 bits to obtain the milliseconds elapsed
    since FANSLY_EPOCH_MS. Returns None when the ID is not an integer or the
    resulting timestamp cannot be represented.
    """
    try:
        elapsed_ms = int(snowflake_id) >> 22
        unix_seconds = (elapsed_ms + FANSLY_EPOCH_MS) / 1000
        return datetime.fromtimestamp(unix_seconds, tz=timezone.utc)
    except (ValueError, OverflowError, OSError):
        return None
|
||||
|
||||
|
||||
def parse_filename(filename: str) -> Dict:
    """
    Extract date/ID metadata from a single filename.

    Patterns are tried in order on the name without its extension:
      1. embedded timestamp + ID (``2023-05-11_at_15-51_id_<id>``)  -> 'embedded', high
      2. date prefix (``2022-07-08`` / ``2022_07_08``), noon UTC      -> 'prefix', high
      3. name that is only a 15-20 digit snowflake (``_N`` suffix ok) -> 'snowflake', high
      4. any 15-20 digit run inside the name                          -> 'snowflake', medium
    Snowflake dates are accepted only for years 2020 through 2030.

    Returns:
        {
            'original_filename': str,
            'detected_date': datetime or None,
            'fansly_id': str or None,
            'date_source': str or None,   # 'snowflake', 'embedded', 'prefix', None
            'confidence': str,            # 'high', 'medium', 'low'
        }
    """
    info = {
        'original_filename': filename,
        'detected_date': None,
        'fansly_id': None,
        'date_source': None,
        'confidence': 'low',
    }

    def _finish(when, source, confidence, fansly_id=None):
        # Record a successful detection and hand back the result dict.
        info.update(
            detected_date=when,
            fansly_id=fansly_id,
            date_source=source,
            confidence=confidence,
        )
        return info

    def _plausible_snowflake_date(candidate):
        # Decoded date, or None when undecodable or outside 2020-2030.
        decoded = decode_fansly_snowflake(candidate)
        if decoded and 2020 <= decoded.year <= 2030:
            return decoded
        return None

    # Work on the base name without extension
    stem = Path(filename).stem

    # Pattern 1: embedded date + id, e.g.
    #   2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP-YcNs55W9.mp4
    #   2023 05 11_at_15 51_id_513099759796367360 (spaces as separators)
    embedded = re.search(
        r'(\d{4})[-_ ](\d{2})[-_ ](\d{2})[-_ ]?at[-_ ](\d{2})[-_ ](\d{2})[-_ ]?id[-_ ](\d{15,20})',
        stem,
        re.IGNORECASE,
    )
    if embedded:
        *stamp_parts, embedded_id = embedded.groups()
        try:
            when = datetime(*map(int, stamp_parts), 0, tzinfo=timezone.utc)
        except ValueError:
            pass  # impossible calendar values; fall through to other patterns
        else:
            return _finish(when, 'embedded', 'high', embedded_id)

    # Pattern 2: date prefix, e.g. 2022-07-08.mp4 or 2022-07-08_video.mp4
    prefixed = re.match(r'^(\d{4})[-_](\d{2})[-_](\d{2})(?:[_\-\s]|$)', stem)
    if prefixed:
        try:
            year, month, day = (int(part) for part in prefixed.groups())
            when = datetime(year, month, day, 12, 0, 0, tzinfo=timezone.utc)  # Default to noon
        except ValueError:
            pass
        else:
            return _finish(when, 'prefix', 'high')

    # Pattern 3: pure snowflake ID, e.g. 871257582885416960.mp4
    pure = re.match(r'^(\d{15,20})(?:_\d+)?$', stem)
    if pure:
        when = _plausible_snowflake_date(pure.group(1))
        if when:
            return _finish(when, 'snowflake', 'high', pure.group(1))

    # Pattern 4: snowflake ID anywhere in the name, e.g. video_871257582885416960_hd.mp4
    for candidate in re.findall(r'(\d{15,20})', stem):
        when = _plausible_snowflake_date(candidate)
        if when:
            return _finish(when, 'snowflake', 'medium', candidate)

    return info
|
||||
|
||||
|
||||
def parse_filenames(filenames: list) -> Dict:
|
||||
"""
|
||||
Parse multiple filenames and return analysis.
|
||||
|
||||
Returns:
|
||||
{
|
||||
'files': [parsed result for each file],
|
||||
'earliest_date': datetime or None,
|
||||
'latest_date': datetime or None,
|
||||
'suggested_date': datetime or None, # Most common or earliest
|
||||
'has_dates': bool,
|
||||
}
|
||||
"""
|
||||
results = [parse_filename(f) for f in filenames]
|
||||
|
||||
dates = [r['detected_date'] for r in results if r['detected_date']]
|
||||
|
||||
analysis = {
|
||||
'files': results,
|
||||
'earliest_date': min(dates) if dates else None,
|
||||
'latest_date': max(dates) if dates else None,
|
||||
'suggested_date': min(dates) if dates else None, # Use earliest as default
|
||||
'has_dates': len(dates) > 0,
|
||||
}
|
||||
|
||||
return analysis
|
||||
|
||||
|
||||
def format_date_for_display(dt: datetime) -> str:
    """Format datetime for display: 'May 11, 2023 at 3:51 PM'.

    Returns '' when *dt* is None. The 12-hour value is computed directly
    instead of using the ``%-I`` directive, which is a platform-specific
    strftime extension and is rejected on some platforms.
    """
    if dt is None:
        return ''
    hour_12 = dt.hour % 12 or 12  # 0 -> 12 AM, 13 -> 1 PM, no zero padding
    return f"{dt.strftime('%b %d, %Y')} at {hour_12}:{dt.strftime('%M %p')}"
|
||||
|
||||
|
||||
def format_date_for_input(dt: datetime) -> Tuple[str, str]:
    """Split a datetime into ``('YYYY-MM-DD', 'HH:MM')`` strings for HTML inputs.

    Returns ``('', '')`` when *dt* is None.
    """
    if dt is None:
        return ('', '')
    date_part = f"{dt:%Y-%m-%d}"
    time_part = f"{dt:%H:%M}"
    return (date_part, time_part)
|
||||
14
modules/paid_content/hqcelebcorner_client.py
Normal file
14
modules/paid_content/hqcelebcorner_client.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""Backwards-compatibility shim — use xenforo_forum_client instead."""
|
||||
from .xenforo_forum_client import XenForoForumClient
|
||||
|
||||
|
||||
class HQCelebCornerClient(XenForoForumClient):
    """Legacy alias for XenForoForumClient, pre-configured for HQCelebCorner."""

    def __init__(self, log_callback=None):
        """Build a XenForoForumClient with the HQCelebCorner settings fixed.

        Args:
            log_callback: Passed through unchanged to XenForoForumClient.
        """
        # Service id, base URL and cookie file are hard-coded here; only the
        # logging callback is left to the caller.
        super().__init__(
            service_id='hqcelebcorner',
            base_url='https://www.hqcelebcorner.net',
            cookie_path='/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
            log_callback=log_callback,
        )
|
||||
1285
modules/paid_content/instagram_adapter.py
Normal file
1285
modules/paid_content/instagram_adapter.py
Normal file
File diff suppressed because it is too large
Load Diff
312
modules/paid_content/models.py
Normal file
312
modules/paid_content/models.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
Dataclass models for the Paid Content feature
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
|
||||
@dataclass
class Attachment:
    """A single file attached to a post or message."""
    name: str
    server_path: str
    file_type: Optional[str] = None
    extension: Optional[str] = None
    download_url: Optional[str] = None
    file_size: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None
    duration: Optional[int] = None
    needs_quality_recheck: bool = False
    is_preview: bool = False

    @classmethod
    def from_api(cls, data: Dict, base_url: str = '') -> 'Attachment':
        """Build an Attachment from an API record with 'name' and 'path' keys.

        The file type is derived from the lower-cased extension of the name;
        the download URL is ``{base_url}/data{path}`` when both are non-empty.
        """
        name = data.get('name', '')
        path = data.get('path', '')

        # Extension = text after the last dot of the name, lower-cased
        ext = name.rsplit('.', 1)[-1].lower() if '.' in name else ''

        type_by_extension = (
            ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic'}),
            ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}),
            ('archive', {'zip', 'rar', '7z', 'tar', 'gz'}),
            ('document', {'pdf', 'doc', 'docx', 'txt'}),
        )
        file_type = next(
            (label for label, known in type_by_extension if ext in known),
            'unknown',
        )

        download_url = None
        if base_url and path:
            download_url = f"{base_url}/data{path}"

        return cls(
            name=name,
            server_path=path,
            file_type=file_type,
            extension=ext if ext else None,
            download_url=download_url,
        )

    def to_dict(self) -> Dict:
        """Serialise for database storage.

        'needs_quality_recheck' is only included (as 1) when the flag is set;
        'is_preview' is not part of the output.
        """
        stored_fields = (
            'name', 'server_path', 'file_type', 'extension', 'download_url',
            'file_size', 'width', 'height', 'duration',
        )
        row = {key: getattr(self, key) for key in stored_fields}
        if self.needs_quality_recheck:
            row['needs_quality_recheck'] = 1
        return row
|
||||
|
||||
|
||||
@dataclass
class Post:
    """A creator's post together with its attachments and embeds."""
    post_id: str
    service_id: str
    platform: str
    creator_id: str
    title: Optional[str] = None
    content: Optional[str] = None
    published_at: Optional[str] = None
    added_at: Optional[str] = None
    edited_at: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)
    embed_urls: List[str] = field(default_factory=list)
    is_pinned: bool = False
    pinned_at: Optional[str] = None
    auto_tags: List[str] = field(default_factory=list)  # Tag names to auto-apply on sync
    tagged_users: List[str] = field(default_factory=list)  # Instagram users tagged in the post

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, creator_id: str, base_url: str = '') -> 'Post':
        """Build a Post from an API record.

        Attachments come from the 'attachments' list plus the optional 'file'
        entry (dict or plain path string). HTML in the content is reduced to
        plain text, and when there is no content the title is used as content
        and the title itself is cleared.
        """
        parsed_attachments = [
            Attachment.from_api(item, base_url)
            for item in data.get('attachments', [])
        ]

        # Some APIs put the main file in 'file' instead of 'attachments'
        main_file = data.get('file')
        if main_file:
            if isinstance(main_file, dict):
                parsed_attachments.append(Attachment.from_api(main_file, base_url))
            elif isinstance(main_file, str):
                parsed_attachments.append(Attachment(
                    name=main_file.rsplit('/', 1)[-1],
                    server_path=main_file,
                ))

        # 'content' if available, otherwise the truncated 'substring' from list endpoints
        body = data.get('content') or data.get('substring') or ''

        # The single-post endpoint returns HTML (e.g. <p>text</p>): strip the tags
        if body and '<' in body:
            import re
            tag_rewrites = (
                (r'<br\s*/?>', '\n'),
                (r'</p>\s*<p>', '\n\n'),
                (r'<[^>]+>', ''),
            )
            for pattern, replacement in tag_rewrites:
                body = re.sub(pattern, replacement, body)
            body = body.strip()

        # OnlyFans posts on Coomer carry their text in 'title' with empty
        # 'content'; move it over since OF posts have no real titles.
        title = data.get('title')
        if title and not body:
            body, title = title, None

        return cls(
            post_id=str(data.get('id', '')),
            service_id=service_id,
            platform=platform,
            creator_id=creator_id,
            title=title,
            content=body,
            published_at=data.get('published'),
            added_at=data.get('added'),
            edited_at=data.get('edited'),
            attachments=parsed_attachments,
            embed_urls=data.get('embed', []) or [],
        )

    def to_dict(self) -> Dict:
        """Serialise for database storage (counts and 0/1 flags, no nested objects)."""
        attachment_total = len(self.attachments)
        return {
            'post_id': self.post_id,
            'title': self.title,
            'content': self.content,
            'published_at': self.published_at,
            'added_at': self.added_at,
            'edited_at': self.edited_at,
            'has_attachments': int(bool(self.attachments)),
            'attachment_count': attachment_total,
            'embed_count': len(self.embed_urls),
            'is_pinned': int(bool(self.is_pinned)),
            'pinned_at': self.pinned_at,
        }
|
||||
|
||||
|
||||
@dataclass
class Message:
    """A chat message exchanged with a creator."""
    message_id: str
    platform: str
    service_id: str
    creator_id: str  # Platform-specific creator ID
    text: Optional[str] = None
    sent_at: Optional[str] = None
    is_from_creator: bool = True
    is_tip: bool = False
    tip_amount: Optional[float] = None
    price: Optional[float] = None
    is_free: bool = True
    is_purchased: bool = False
    reply_to_message_id: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Serialise for database storage; booleans become 0/1 integers."""
        def as_flag(value) -> int:
            return 1 if value else 0

        return {
            'message_id': self.message_id,
            'text': self.text,
            'sent_at': self.sent_at,
            'is_from_creator': as_flag(self.is_from_creator),
            'is_tip': as_flag(self.is_tip),
            'tip_amount': self.tip_amount,
            'price': self.price,
            'is_free': as_flag(self.is_free),
            'is_purchased': as_flag(self.is_purchased),
            'has_attachments': as_flag(self.attachments),
            'attachment_count': len(self.attachments),
            'reply_to_message_id': self.reply_to_message_id,
        }
|
||||
|
||||
|
||||
@dataclass
class Creator:
    """A creator profile as listed on Coomer/Kemono."""
    creator_id: str
    service_id: str
    platform: str
    username: str
    display_name: Optional[str] = None
    profile_image_url: Optional[str] = None
    banner_image_url: Optional[str] = None
    bio: Optional[str] = None
    post_count: int = 0

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, base_url: str = None) -> 'Creator':
        """Build a Creator from an API record.

        When the record has no 'profile_image' / 'banner_image', URLs are
        derived from *base_url* as
        ``https://img.{host}/icons|banners/{platform}/{creator_id}``, with
        ``.party`` in the host replaced by ``.st``.
        """
        creator_id = str(data.get('id', ''))

        # Image host: img.<netloc>, using .st instead of .party
        # (coomer.party redirects to coomer.st)
        img_domain = None
        if base_url and creator_id:
            from urllib.parse import urlparse
            host = urlparse(base_url).netloc.replace('.party', '.st')
            img_domain = f"img.{host}"

        def _image_url(api_key: str, folder: str):
            # Explicit API value wins; otherwise fall back to the derived URL.
            explicit = data.get(api_key)
            if explicit or not img_domain:
                return explicit
            return f"https://{img_domain}/{folder}/{platform}/{creator_id}"

        return cls(
            creator_id=creator_id,
            service_id=service_id,
            platform=platform,
            username=data.get('name', ''),
            display_name=data.get('name'),
            profile_image_url=_image_url('profile_image', 'icons'),
            banner_image_url=_image_url('banner_image', 'banners'),
            post_count=data.get('post_count', 0),
        )

    def to_dict(self) -> Dict:
        """Serialise for database storage."""
        column_order = (
            'service_id', 'platform', 'creator_id', 'username', 'display_name',
            'profile_image_url', 'banner_image_url', 'bio', 'post_count',
        )
        return {column: getattr(self, column) for column in column_order}
|
||||
|
||||
|
||||
@dataclass
class SyncResult:
    """Outcome counters of one creator sync run."""
    success: bool
    new_posts: int = 0
    new_attachments: int = 0
    downloaded_files: int = 0
    failed_files: int = 0
    skipped_files: int = 0
    error: Optional[str] = None
    downloaded_file_info: Optional[List[Dict]] = None  # List of {file_path, filename, source, content_type}

    def to_dict(self) -> Dict:
        """Return the status and counters as a dict ('downloaded_file_info' is left out)."""
        reported = (
            'success', 'new_posts', 'new_attachments', 'downloaded_files',
            'failed_files', 'skipped_files', 'error',
        )
        return {key: getattr(self, key) for key in reported}
|
||||
|
||||
|
||||
@dataclass
class DownloadResult:
    """Outcome of downloading a single file."""
    success: bool
    file_path: Optional[str] = None
    file_hash: Optional[str] = None
    file_size: Optional[int] = None
    error: Optional[str] = None
    is_duplicate: bool = False

    def to_dict(self) -> Dict:
        """Return all fields as a plain dict, in declaration order."""
        keys = ('success', 'file_path', 'file_hash', 'file_size', 'error', 'is_duplicate')
        return {key: getattr(self, key) for key in keys}
|
||||
729
modules/paid_content/onlyfans_client.py
Normal file
729
modules/paid_content/onlyfans_client.py
Normal file
@@ -0,0 +1,729 @@
|
||||
"""
|
||||
OnlyFans Direct API Client
|
||||
|
||||
Downloads content directly from the OnlyFans API using browser-extracted
|
||||
credentials and dynamic request signing.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Dict, Any, Callable
|
||||
from urllib.parse import urlparse, urlencode
|
||||
|
||||
from modules.base_module import LoggingMixin, RateLimitMixin
|
||||
from .models import Post, Attachment, Message
|
||||
from .onlyfans_signing import OnlyFansSigner
|
||||
|
||||
|
||||
class OnlyFansClient(LoggingMixin, RateLimitMixin):
|
||||
"""
|
||||
API client for downloading content directly from OnlyFans.
|
||||
|
||||
API Endpoints:
|
||||
- Base URL: https://onlyfans.com/api2/v2
|
||||
- Auth: Requires browser-extracted credentials (sess, auth_id, x-bc, User-Agent)
|
||||
- Signing: Every request needs dynamic sign/time/app-token headers
|
||||
- GET /users/me - Verify auth
|
||||
- GET /users/{username} - Get user profile
|
||||
- GET /users/{user_id}/posts?limit=50&offset={offset} - Get posts (paginated)
|
||||
"""
|
||||
|
||||
BASE_URL = "https://onlyfans.com/api2/v2"
|
||||
SERVICE_ID = "onlyfans_direct"
|
||||
PLATFORM = "onlyfans"
|
||||
|
||||
def __init__(
    self,
    auth_config: Dict[str, str],
    signing_url: Optional[str] = None,
    log_callback: Optional[Callable] = None,
):
    """
    Store credentials and set up logging, rate limiting and request signing.

    No HTTP session is created here; `_session` starts as None and is built
    on first use by `_get_session`.

    Args:
        auth_config: Dict with keys: sess, auth_id, auth_uid (optional), x_bc, user_agent
        signing_url: Optional custom URL for signing rules
        log_callback: Optional logging callback
    """
    self._init_logger('PaidContent', log_callback, default_module='OnlyFansDirect')
    # More conservative rate limiting than Fansly (OF is stricter)
    self._init_rate_limiter(
        min_delay=1.5, max_delay=3.0,
        batch_delay_min=3, batch_delay_max=6
    )

    self.auth_config = auth_config
    # Created lazily by _get_session()
    self._session: Optional[aiohttp.ClientSession] = None
    # Signer that produces the per-request signing headers
    self._signer = OnlyFansSigner(rules_url=signing_url)
|
||||
|
||||
async def _get_session(self) -> aiohttp.ClientSession:
    """Return the shared aiohttp session, creating it when missing or closed.

    A new session carries the OnlyFans default headers, including a Cookie
    header built from ``sess``, ``auth_id`` and (when present) ``auth_uid``
    in ``self.auth_config``, and a 60 second total timeout.
    """
    if self._session is not None and not self._session.closed:
        return self._session

    auth = self.auth_config

    # Build cookie string
    cookie_parts = [f"sess={auth['sess']}", f"auth_id={auth['auth_id']}"]
    auth_uid = auth.get('auth_uid')
    if auth_uid:
        cookie_parts.append(f"auth_uid_{auth['auth_id']}={auth_uid}")

    default_headers = {
        'Accept': 'application/json, text/plain, */*',
        'User-Agent': auth.get('user_agent', ''),
        'x-bc': auth.get('x_bc', ''),
        'Cookie': '; '.join(cookie_parts),
        'Origin': 'https://onlyfans.com',
        'Referer': 'https://onlyfans.com/',
    }
    self._session = aiohttp.ClientSession(
        headers=default_headers,
        timeout=aiohttp.ClientTimeout(total=60),
    )
    return self._session
|
||||
|
||||
async def _sign_request(self, endpoint: str) -> Dict[str, str]:
|
||||
"""
|
||||
Compute signing headers for an API request.
|
||||
|
||||
Args:
|
||||
endpoint: API path (e.g. "/users/me") - will be prefixed with /api2/v2
|
||||
|
||||
Returns:
|
||||
Dict with sign, time, app-token, user-id headers
|
||||
"""
|
||||
user_id = self.auth_config.get('auth_id', '0')
|
||||
# Sign with full URL path (matching OF-Scraper)
|
||||
full_path = f"/api2/v2{endpoint}"
|
||||
sign_headers = await self._signer.sign(full_path, user_id)
|
||||
sign_headers['user-id'] = user_id
|
||||
return sign_headers
|
||||
|
||||
async def _api_request(self, endpoint: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """
    Make a signed API request to OnlyFans.

    Handles 401 (auth failure), 404, 429 (rate limit) and general errors.
    429 responses and timeouts are retried (up to 3 attempts in total) with
    freshly computed signing headers, because the signature is time-based.

    Args:
        endpoint: API path (e.g. "/users/me")
        params: Optional query parameters

    Returns:
        Parsed JSON response or None on failure
    """
    session = await self._get_session()
    # Include query params in the signing path (OF-Scraper does this)
    sign_endpoint = endpoint
    if params:
        sign_endpoint = f"{endpoint}?{urlencode(params)}"
    sign_headers = await self._sign_request(sign_endpoint)

    url = f"{self.BASE_URL}{endpoint}"
    max_retries = 3

    for attempt in range(max_retries):
        try:
            async with session.get(url, params=params, headers=sign_headers) as resp:
                if resp.status == 200:
                    return await resp.json()
                elif resp.status == 401:
                    self.log("OnlyFans auth failed (401) - credentials may be expired", 'error')
                    return None
                elif resp.status == 429:
                    # Retry-After may also be an HTTP-date; fall back to the
                    # 30s default instead of letting int() abort the retry.
                    try:
                        retry_after = int(resp.headers.get('Retry-After', 30))
                    except (TypeError, ValueError):
                        retry_after = 30
                    wait = min(retry_after * (attempt + 1), 120)
                    self.log(f"Rate limited (429), waiting {wait}s (attempt {attempt + 1}/{max_retries})", 'warning')
                    await asyncio.sleep(wait)
                    # Refresh signing headers for retry (timestamp changes)
                    sign_headers = await self._sign_request(sign_endpoint)
                    continue
                elif resp.status == 404:
                    self.log(f"Not found (404): {endpoint}", 'debug')
                    return None
                else:
                    text = await resp.text()
                    self.log(f"API error: HTTP {resp.status} for {endpoint}: {text[:200]}", 'warning')
                    return None
        except asyncio.TimeoutError:
            self.log(f"Request timeout for {endpoint} (attempt {attempt + 1})", 'warning')
            if attempt < max_retries - 1:
                await asyncio.sleep(5 * (attempt + 1))
                sign_headers = await self._sign_request(sign_endpoint)
                continue
            return None
        except Exception as e:
            self.log(f"Request error for {endpoint}: {e}", 'error')
            return None

    return None
|
||||
|
||||
@staticmethod
|
||||
def _strip_html(text: str) -> str:
|
||||
"""Strip HTML tags and convert common entities to plain text"""
|
||||
if not text:
|
||||
return ''
|
||||
text = re.sub(r'<br\s*/?>', '\n', text)
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
text = text.replace('&', '&').replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"')
|
||||
return text.strip()
|
||||
|
||||
async def close(self):
    """Close the aiohttp session if one is open and forget it.

    Safe to call when no session was ever created.
    """
    if self._session and not self._session.closed:
        await self._session.close()
        # Drop the reference; _get_session() creates a new session on demand.
        self._session = None
|
||||
|
||||
async def __aenter__(self):
    """Enter ``async with``: return this client unchanged."""
    return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with``: close the HTTP session via close()."""
    await self.close()
|
||||
|
||||
async def check_auth(self) -> Dict[str, Any]:
    """
    Verify the stored credentials by requesting /users/me.

    Returns:
        ``{'valid': True, 'user_id', 'username', 'name'}`` when the response
        carries an id, otherwise ``{'valid': False, 'error': <reason>}``.
    """
    self._delay_between_items()
    try:
        me = await self._api_request("/users/me")
        if not (me and me.get('id')):
            return {'valid': False, 'error': 'Invalid credentials or unexpected response'}

        verified = {'valid': True, 'user_id': str(me['id'])}
        verified['username'] = me.get('username', '')
        verified['name'] = me.get('name', '')
        return verified
    except Exception as e:
        self.log(f"Error checking auth: {e}", 'error')
        return {'valid': False, 'error': str(e)}
|
||||
|
||||
async def get_user_info(self, username: str) -> Optional[Dict[str, Any]]:
    """
    Fetch a user's profile and normalise it.

    Args:
        username: The OnlyFans username

    Returns:
        Dict with user_id, username, display_name, avatar_url, banner_url,
        bio (HTML stripped), join_date (first 10 characters of 'joinDate',
        or None) and posts_count; None when the user is not found or an
        error occurs.
    """
    self._delay_between_items()
    try:
        profile = await self._api_request(f"/users/{username}")
        if not profile or not profile.get('id'):
            self.log(f"User not found: {username}", 'warning')
            return None

        raw_bio = profile.get('rawAbout') or profile.get('about') or ''
        joined = (profile.get('joinDate') or '')[:10] or None

        return {
            'user_id': str(profile['id']),
            'username': profile.get('username', username),
            'display_name': profile.get('name', ''),
            'avatar_url': profile.get('avatar'),
            'banner_url': profile.get('header'),
            'bio': self._strip_html(raw_bio),
            'join_date': joined,
            'posts_count': profile.get('postsCount', 0),
        }
    except Exception as e:
        self.log(f"Error getting user info for {username}: {e}", 'error')
        return None
|
||||
|
||||
async def get_single_post(self, post_id: str) -> Optional[Post]:
    """
    Fetch a single post by its OnlyFans post ID.

    Args:
        post_id: The OnlyFans post ID

    Returns:
        Post object or None when the API returns nothing or parsing fails
    """
    self._delay_between_items()
    data = await self._api_request(f"/posts/{post_id}")
    if not data:
        self.log(f"Post {post_id} not found", 'warning')
        return None

    # 'author' may be missing, null, or not a dict; only read it when it is a dict
    # so a null author does not raise AttributeError.
    author = data.get('author')
    author_id = author.get('id') if isinstance(author, dict) else None
    if author_id is None:
        author_id = data.get('authorId')

    # Avoid str(None) == 'None' leaking through as a creator id.
    user_id = '' if author_id is None else str(author_id)
    return self._parse_post(data, user_id)
|
||||
|
||||
async def get_posts(
    self,
    user_id: str,
    username: str,
    since_date: Optional[str] = None,
    until_date: Optional[str] = None,
    days_back: Optional[int] = None,
    max_posts: Optional[int] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[Post]:
    """
    Fetch posts from a creator's timeline using offset-based pagination.

    Posts are requested newest-first in pages of 50. Fetching stops as soon as
    a post older than the ``since`` bound is seen or ``max_posts`` is reached;
    in both of those cases the pinned-post lookup is skipped. Otherwise pinned
    posts not already collected are appended after the timeline.

    Args:
        user_id: The OnlyFans numeric user ID
        username: The username (for logging/reference)
        since_date: Only fetch posts after this date (ISO format)
        until_date: Only fetch posts before this date (ISO format)
        days_back: Fetch posts from the last N days (overrides since_date)
        max_posts: Maximum number of posts to fetch
        progress_callback: Called with (page, total_posts) during fetching

    Returns:
        List of Post objects
    """
    from datetime import timedelta, timezone

    def _naive_utc(value: str) -> datetime:
        """Parse an ISO-8601 string; aware values are converted to UTC, then made naive."""
        parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
        if parsed.tzinfo is not None:
            # Convert before dropping tzinfo so "+03:00" timestamps compare correctly.
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed

    def _bound(value: Optional[str]) -> Optional[datetime]:
        """Parse a filter bound; an unparseable bound disables that filter."""
        if not value:
            return None
        try:
            return _naive_utc(value)
        except (ValueError, TypeError, AttributeError):
            return None

    self.log(f"Fetching posts for {username} (user_id: {user_id})", 'info')

    if days_back:
        # Use UTC "now" so the cutoff is on the same clock as the normalized post dates.
        since_date = (datetime.now(timezone.utc) - timedelta(days=days_back)).isoformat()

    # All comparisons below use naive UTC datetimes to avoid aware/naive mixing.
    since_dt = _bound(since_date)
    until_dt = _bound(until_date)

    if since_dt:
        self.log(f"Date filter: since_date={since_dt.isoformat()}", 'debug')

    all_posts: List[Post] = []
    offset = 0
    page_size = 50
    page = 0

    while True:
        self._delay_between_items()

        params = {
            'limit': str(page_size),
            'offset': str(offset),
            'order': 'publish_date_desc',
        }

        data = await self._api_request(f"/users/{user_id}/posts", params=params)
        if not data:
            break

        # OF returns a list of posts directly
        posts_list = data if isinstance(data, list) else data.get('list', [])
        if not posts_list:
            break

        for post_data in posts_list:
            post = self._parse_post(post_data, user_id)
            if not post:
                continue

            # Parse the publication date once and apply both bounds to it.
            post_dt = None
            if post.published_at and (since_dt or until_dt):
                try:
                    post_dt = _naive_utc(post.published_at)
                except (ValueError, TypeError, AttributeError) as e:
                    # A post with an unreadable date is kept rather than filtered.
                    if since_dt:
                        self.log(f"Date comparison error: {e} (post_date={post.published_at})", 'warning')

            if post_dt is not None:
                if since_dt and post_dt < since_dt:
                    # Results are newest-first, so everything after this is older too.
                    self.log(f"Reached posts older than since_date ({post.published_at}), stopping", 'debug')
                    return all_posts
                if until_dt and post_dt > until_dt:
                    continue

            all_posts.append(post)

            if max_posts and len(all_posts) >= max_posts:
                self.log(f"Reached max_posts limit: {max_posts}", 'debug')
                return all_posts

        page += 1
        if progress_callback:
            progress_callback(page, len(all_posts))

        # If we got fewer results than page_size, we've reached the end
        if len(posts_list) < page_size:
            break

        offset += page_size
        self._delay_between_batches()

    # Also fetch pinned posts (they may not appear in the timeline)
    self._delay_between_items()
    pinned_data = await self._api_request(
        f"/users/{user_id}/posts",
        params={'limit': '50', 'offset': '0', 'order': 'publish_date_desc', 'pinned': '1'},
    )
    if pinned_data:
        pinned_list = pinned_data if isinstance(pinned_data, list) else pinned_data.get('list', [])
        existing_ids = {p.post_id for p in all_posts}
        for post_data in pinned_list:
            post = self._parse_post(post_data, user_id)
            if post and post.post_id not in existing_ids:
                all_posts.append(post)

    self.log(f"Fetched {len(all_posts)} posts for {username}", 'info')
    return all_posts
|
||||
|
||||
def _parse_post(self, post_data: Dict, user_id: str) -> Optional[Post]:
|
||||
"""
|
||||
Parse an OnlyFans post into a Post model.
|
||||
|
||||
Args:
|
||||
post_data: Raw post data from API
|
||||
user_id: Creator's user ID
|
||||
|
||||
Returns:
|
||||
Post object or None if parsing fails
|
||||
"""
|
||||
try:
|
||||
post_id = str(post_data.get('id', ''))
|
||||
if not post_id:
|
||||
return None
|
||||
|
||||
# Parse timestamp - OF uses ISO format strings
|
||||
published_at = None
|
||||
raw_date = post_data.get('postedAt') or post_data.get('createdAt')
|
||||
if raw_date:
|
||||
try:
|
||||
if isinstance(raw_date, str):
|
||||
published_at = raw_date
|
||||
elif isinstance(raw_date, (int, float)):
|
||||
published_at = datetime.fromtimestamp(raw_date).isoformat()
|
||||
except (ValueError, TypeError, OSError):
|
||||
pass
|
||||
|
||||
# Content text
|
||||
content = self._strip_html(post_data.get('rawText') or post_data.get('text') or '')
|
||||
|
||||
# Parse media attachments
|
||||
attachments = []
|
||||
media_list = post_data.get('media', []) or []
|
||||
for media_item in media_list:
|
||||
attachment = self._parse_attachment(media_item)
|
||||
if attachment:
|
||||
attachments.append(attachment)
|
||||
|
||||
# Extract embed URLs from content text
|
||||
embed_urls = []
|
||||
if content:
|
||||
url_pattern = r'https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|vimeo\.com/|dailymotion\.com/video/)\S+'
|
||||
embed_urls = re.findall(url_pattern, content)
|
||||
|
||||
return Post(
|
||||
post_id=post_id,
|
||||
service_id=self.SERVICE_ID,
|
||||
platform=self.PLATFORM,
|
||||
creator_id=user_id,
|
||||
title=None,
|
||||
content=content,
|
||||
published_at=published_at,
|
||||
added_at=datetime.now().isoformat(),
|
||||
attachments=attachments,
|
||||
embed_urls=embed_urls,
|
||||
is_pinned=bool(post_data.get('isPinned')),
|
||||
pinned_at=post_data.get('pinnedAt'),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error parsing post: {e}", 'error')
|
||||
return None
|
||||
|
||||
def _parse_attachment(self, media_item: Dict) -> Optional[Attachment]:
    """
    Parse an OnlyFans media item into an Attachment.

    OF media structure:
        {
            id, type, source: {source: url, width, height, duration},
            full: {source: url, ...}, preview: {source: url, ...}
        }
    Newer responses nest the renditions under a 'files' key.

    URL preference order: 'full', then 'source', then 'preview', then a
    top-level 'src'. An attachment is flagged ``is_preview`` when the item is
    not viewable (canView false) or when the only URL found came from the
    'preview' rendition and no top-level 'src' exists.

    Args:
        media_item: Raw media dict from API

    Returns:
        Attachment object, or None when parsing fails (logged at 'error' level)
    """
    try:
        media_id = str(media_item.get('id', ''))
        # A null 'type' must not abort parsing of the whole attachment.
        media_type = (media_item.get('type') or '').lower()

        # Map OF media types to our file types
        type_map = {
            'photo': 'image',
            'video': 'video',
            'audio': 'audio',
            'gif': 'image',
        }
        file_type = type_map.get(media_type, 'unknown')

        # Current OF API nests media under 'files' key
        files = media_item.get('files') or media_item

        def _variant(key):
            """Return the named rendition dict, or {} when absent, empty or not a dict."""
            candidate = files.get(key)
            return candidate if isinstance(candidate, dict) else {}

        def _url_of(variant):
            """Return a rendition's URL, accepting both the 'url' and 'source' spellings."""
            return variant.get('url') or variant.get('source')

        download_url = None
        url_origin = None  # which rendition supplied download_url
        width = None
        height = None
        duration = None

        # Try 'full' first (higher quality); its metadata is used even without a URL
        full_data = _variant('full')
        if full_data:
            download_url = _url_of(full_data)
            width = full_data.get('width')
            height = full_data.get('height')
            duration = full_data.get('duration')
            if download_url:
                url_origin = 'full'

        # Fallback to 'source'
        if not download_url:
            source_data = _variant('source')
            if source_data:
                download_url = _url_of(source_data)
                width = width or source_data.get('width')
                height = height or source_data.get('height')
                duration = duration or source_data.get('duration')
                if download_url:
                    url_origin = 'source'

        # Videos without a direct URL (DRM per the original notes): take the
        # duration from the media item itself, then fall through to the preview frame.
        if not download_url and media_type == 'video' and not duration:
            duration = media_item.get('duration')

        # Fallback to 'preview' for any content type
        if not download_url:
            preview_data = _variant('preview')
            if preview_data:
                download_url = _url_of(preview_data)
                width = width or preview_data.get('width')
                height = height or preview_data.get('height')
                if download_url:
                    url_origin = 'preview'

        # Some OF responses have src directly
        if not download_url:
            download_url = media_item.get('src')

        # Determine extension from the URL's final path segment, so a dot in a
        # directory name is never mistaken for an extension separator.
        ext = ''
        if download_url:
            last_segment = urlparse(download_url).path.rsplit('/', 1)[-1]
            if '.' in last_segment:
                ext = last_segment.rsplit('.', 1)[-1].lower()
                if ext == 'jpeg':
                    ext = 'jpg'
        elif media_type == 'photo':
            ext = 'jpg'
        elif media_type == 'video':
            ext = 'mp4'

        filename = f"{media_id}.{ext}" if ext else str(media_id)

        # Override file_type based on actual extension (OF sometimes misreports type)
        video_exts = {'mp4', 'mov', 'webm', 'avi', 'mkv', 'flv', 'm4v', 'wmv', 'mpg', 'mpeg'}
        if ext in video_exts and file_type != 'video':
            file_type = 'video'

        # Duration may be in seconds (float or int)
        if duration is not None:
            try:
                duration = int(float(duration))
            except (ValueError, TypeError):
                duration = None

        # Locked content (canView=false) is always a preview
        can_view = media_item.get('canView', True)
        is_preview = not can_view
        if not download_url and not can_view:
            self.log(f"PPV/locked content: {filename}", 'debug')

        # Viewable, but the only URL we found was the preview rendition
        if can_view and url_origin == 'preview' and not media_item.get('src'):
            is_preview = True

        return Attachment(
            name=filename,
            server_path=f"/onlyfans/{media_id}",
            file_type=file_type,
            extension=ext if ext else None,
            download_url=download_url,
            file_size=None,
            width=width,
            height=height,
            duration=duration,
            is_preview=is_preview,
        )

    except Exception as e:
        self.log(f"Error parsing attachment: {e}", 'error')
        return None
|
||||
|
||||
# ==================== MESSAGES ====================
|
||||
|
||||
async def get_messages(self, user_id: str, max_messages: int = 500) -> List[Message]:
    """
    Fetch messages from a conversation with a creator.

    Uses GET /chats/{user_id}/messages with cursor-based pagination.
    The 'id' param is used as cursor for older messages.

    Args:
        user_id: OnlyFans numeric user ID of the creator
        max_messages: Maximum number of messages to fetch; the returned list
            never exceeds this length

    Returns:
        List of Message objects
    """
    page_size = 50
    messages: List[Message] = []
    cursor_id = None
    page = 0

    while len(messages) < max_messages:
        page += 1
        params = {'limit': page_size, 'order': 'desc'}
        if cursor_id:
            params['id'] = cursor_id

        data = await self._api_request(f"/chats/{user_id}/messages", params=params)
        if not data:
            break

        # Response is a dict with 'list' key containing messages
        msg_list = data.get('list', []) if isinstance(data, dict) else data
        if not msg_list:
            break

        for msg_data in msg_list:
            msg = self._parse_message(msg_data, user_id)
            if msg:
                messages.append(msg)
                # Stop mid-page so the cap is honoured exactly.
                if len(messages) >= max_messages:
                    break

        self.log(f"Fetched page {page}: {len(msg_list)} messages (total: {len(messages)})", 'debug')

        if len(messages) >= max_messages:
            break

        if len(msg_list) < page_size:
            break  # Last page

        # Use the last message's id as cursor for next page; bail out if the
        # cursor does not advance, to avoid looping on the same page.
        last_id = msg_list[-1].get('id')
        if last_id and str(last_id) != str(cursor_id):
            cursor_id = last_id
        else:
            break

    self.log(f"Fetched {len(messages)} messages for user {user_id}", 'info')
    return messages
|
||||
|
||||
def _parse_message(self, msg_data: Dict, creator_user_id: str) -> Optional[Message]:
|
||||
"""
|
||||
Parse an OnlyFans message into a Message model.
|
||||
|
||||
Args:
|
||||
msg_data: Raw message dict from API
|
||||
creator_user_id: Numeric user ID of the creator (to determine direction)
|
||||
|
||||
Returns:
|
||||
Message object or None
|
||||
"""
|
||||
try:
|
||||
msg_id = str(msg_data.get('id', ''))
|
||||
if not msg_id:
|
||||
return None
|
||||
|
||||
# Determine if message is from creator
|
||||
from_user = msg_data.get('fromUser', {})
|
||||
from_user_id = str(from_user.get('id', ''))
|
||||
is_from_creator = (from_user_id == str(creator_user_id))
|
||||
|
||||
# Parse text
|
||||
text = self._strip_html(msg_data.get('text') or '')
|
||||
|
||||
# Parse timestamp
|
||||
created_at = msg_data.get('createdAt')
|
||||
sent_at = None
|
||||
if created_at:
|
||||
try:
|
||||
sent_at = datetime.fromisoformat(created_at.replace('Z', '+00:00')).isoformat()
|
||||
except (ValueError, TypeError):
|
||||
sent_at = created_at
|
||||
|
||||
# PPV/price info
|
||||
price = msg_data.get('price')
|
||||
is_free = msg_data.get('isFree', True)
|
||||
is_purchased = msg_data.get('isOpened', False) or msg_data.get('canPurchase') is False
|
||||
is_tip = msg_data.get('isTip', False)
|
||||
tip_amount = msg_data.get('tipAmount')
|
||||
|
||||
# Parse media attachments (same structure as posts)
|
||||
attachments = []
|
||||
media_list = msg_data.get('media', []) or []
|
||||
for media_item in media_list:
|
||||
att = self._parse_attachment(media_item)
|
||||
if att:
|
||||
attachments.append(att)
|
||||
|
||||
return Message(
|
||||
message_id=msg_id,
|
||||
platform=self.PLATFORM,
|
||||
service_id=self.SERVICE_ID,
|
||||
creator_id=str(creator_user_id),
|
||||
text=text if text else None,
|
||||
sent_at=sent_at,
|
||||
is_from_creator=is_from_creator,
|
||||
is_tip=bool(is_tip),
|
||||
tip_amount=float(tip_amount) if tip_amount else None,
|
||||
price=float(price) if price else None,
|
||||
is_free=bool(is_free),
|
||||
is_purchased=bool(is_purchased),
|
||||
attachments=attachments,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error parsing message: {e}", 'error')
|
||||
return None
|
||||
109
modules/paid_content/onlyfans_signing.py
Normal file
109
modules/paid_content/onlyfans_signing.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
OnlyFans Request Signing Module
|
||||
|
||||
Handles the dynamic request signing required by the OnlyFans API.
|
||||
Fetches signing rules from the DATAHOARDERS/dynamic-rules GitHub repo
|
||||
and computes SHA-1 based signatures for each API request.
|
||||
|
||||
Isolated module so it's easy to update when OF changes their signing scheme.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Dict, Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
RULES_URL = "https://raw.githubusercontent.com/DATAHOARDERS/dynamic-rules/main/onlyfans.json"
|
||||
|
||||
|
||||
class OnlyFansSigner:
    """
    Computes request signatures for the OnlyFans API.

    Uses dynamic rules fetched from a public GitHub repo (same source as OF-Scraper).
    Rules are cached in memory on the instance and refreshed every 6 hours.
    """

    RULES_TTL = 6 * 3600  # 6 hours

    def __init__(self, rules_url: Optional[str] = None):
        """
        Args:
            rules_url: Override for the rules JSON location; defaults to RULES_URL.
        """
        self.rules_url = rules_url or RULES_URL
        self._rules: Optional[Dict] = None
        self._rules_fetched_at: float = 0

    @property
    def rules_stale(self) -> bool:
        """True when no rules are cached or the cache is older than RULES_TTL."""
        if self._rules is None:
            return True
        return (time.time() - self._rules_fetched_at) > self.RULES_TTL

    async def get_rules(self) -> Dict:
        """
        Fetch signing rules, using cache if fresh.

        If the refresh fails for any reason (non-200 status, network error,
        timeout, undecodable body) and older rules are cached, the stale rules
        are returned instead of failing.

        Returns:
            The rules dict. ``sign()`` reads the keys static_param, format,
            checksum_indexes, checksum_constant (optional) and app_token.

        Raises:
            RuntimeError: non-200 response and nothing cached.
            Exception: the underlying fetch/decode error when nothing is cached.
        """
        if not self.rules_stale:
            return self._rules

        import asyncio  # local: only needed for the timeout exception type

        timeout = aiohttp.ClientTimeout(total=15)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.rules_url) as resp:
                    if resp.status != 200:
                        raise RuntimeError(
                            f"Failed to fetch OF signing rules: HTTP {resp.status}"
                        )
                    fresh_rules = await resp.json(content_type=None)
        except (aiohttp.ClientError, asyncio.TimeoutError, RuntimeError, ValueError):
            if self._rules is not None:
                # Use stale cache rather than failing
                return self._rules
            raise

        self._rules = fresh_rules
        self._rules_fetched_at = time.time()
        return self._rules

    async def sign(self, endpoint_path: str, user_id: str = "0") -> Dict[str, str]:
        """
        Compute signing headers for an OnlyFans API request.

        Args:
            endpoint_path: The full URL path (e.g. "/api2/v2/users/me")
            user_id: The authenticated user's ID (from auth_id cookie)

        Returns:
            Dict with 'sign', 'time', 'app-token' headers
        """
        rules = await self.get_rules()

        # Timestamp in milliseconds (matching OF-Scraper's implementation)
        timestamp = str(round(time.time() * 1000))

        # 1. Build the message to hash
        msg = "\n".join([
            rules["static_param"],
            timestamp,
            endpoint_path,
            str(user_id),
        ])

        # 2. SHA-1 hash; the checksum is taken over the ASCII bytes of the hex digest
        sha1_hash = hashlib.sha1(msg.encode("utf-8")).hexdigest()
        sha1_bytes = sha1_hash.encode("ascii")

        # 3. Checksum from indexed byte positions + single constant
        #    (matching OF-Scraper's implementation)
        checksum_indexes = rules["checksum_indexes"]
        checksum_constant = rules.get("checksum_constant", 0)
        checksum = sum(sha1_bytes[i] for i in checksum_indexes) + checksum_constant

        # 4. Build the sign header using the format template
        #    Typical format: "53760:{}:{:x}:69723085"
        sign_value = rules["format"].format(sha1_hash, abs(checksum))

        return {
            "sign": sign_value,
            "time": timestamp,
            "app-token": rules["app_token"],
        }
|
||||
755
modules/paid_content/pornhub_client.py
Normal file
755
modules/paid_content/pornhub_client.py
Normal file
@@ -0,0 +1,755 @@
|
||||
"""
|
||||
Pornhub Client - Fetches creator info and videos using yt-dlp
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html as html_module
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class PornhubClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching Pornhub creator information and videos using yt-dlp
|
||||
|
||||
Supports:
|
||||
- Pornstar pages (pornhub.com/pornstar/name)
|
||||
- Channel pages (pornhub.com/channels/name)
|
||||
- User pages (pornhub.com/users/name)
|
||||
- Model pages (pornhub.com/model/name)
|
||||
"""
|
||||
|
||||
SERVICE_ID = 'pornhub'
|
||||
PLATFORM = 'pornhub'
|
||||
|
||||
# Quality presets for yt-dlp
|
||||
# Pornhub serves single combined streams with IDs like '1080p', '720p', etc.
|
||||
# NOT separate video+audio streams like YouTube
|
||||
QUALITY_PRESETS = {
|
||||
'best': 'bestvideo+bestaudio/best',
|
||||
'1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
|
||||
'720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
|
||||
'480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
|
||||
}
|
||||
|
||||
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None):
|
||||
self._init_logger('PaidContent', log_callback, default_module='Pornhub')
|
||||
|
||||
# Find yt-dlp executable
|
||||
self.ytdlp_path = ytdlp_path or self._find_ytdlp()
|
||||
if not self.ytdlp_path:
|
||||
self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')
|
||||
|
||||
# Store database reference for cookie access
|
||||
self.unified_db = unified_db
|
||||
self._cookies_file = None
|
||||
|
||||
# Cache for profile page HTML (avoid re-fetching for avatar/banner/bio)
|
||||
self._profile_page_cache: Dict[str, Optional[str]] = {}
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
|
||||
"""Find yt-dlp executable"""
|
||||
common_paths = [
|
||||
'/opt/media-downloader/venv/bin/yt-dlp',
|
||||
'/usr/local/bin/yt-dlp',
|
||||
'/usr/bin/yt-dlp',
|
||||
'/opt/homebrew/bin/yt-dlp',
|
||||
os.path.expanduser('~/.local/bin/yt-dlp'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether a yt-dlp executable path is set on this client."""
    has_binary = self.ytdlp_path is not None
    return has_binary
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get path to cookies file, creating it from database if needed"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
return self._cookies_file
|
||||
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
# Support both {"cookies": [...]} and [...] formats
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies_list = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies_list = data
|
||||
else:
|
||||
cookies_list = []
|
||||
|
||||
if cookies_list:
|
||||
# Write cookies to temp file in Netscape format
|
||||
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_')
|
||||
with os.fdopen(fd, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
for cookie in cookies_list:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expiry = str(int(cookie.get('expirationDate', 0)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
|
||||
self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug')
|
||||
return self._cookies_file
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _get_cookies_list(self) -> Optional[list]:
|
||||
"""Get cookies as a list of dicts for aiohttp requests"""
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
return data['cookies']
|
||||
elif isinstance(data, list):
|
||||
return data
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies list: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available"""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
def cleanup(self):
    """
    Release temporary state held by the client.

    Deletes the temporary cookies file if it exists on disk (deletion errors
    are ignored) and empties the profile page cache.
    """
    cookie_path = self._cookies_file
    if cookie_path and os.path.exists(cookie_path):
        try:
            os.unlink(cookie_path)
        except Exception:
            pass
        self._cookies_file = None

    self._profile_page_cache.clear()
|
||||
|
||||
@staticmethod
|
||||
def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
|
||||
"""
|
||||
Extract creator type and identifier from Pornhub URL
|
||||
|
||||
Returns:
|
||||
Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model'
|
||||
or None if not a valid Pornhub creator URL
|
||||
"""
|
||||
patterns = [
|
||||
(r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'),
|
||||
(r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
|
||||
(r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'),
|
||||
(r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'),
|
||||
]
|
||||
|
||||
for pattern, creator_type in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return (creator_type, match.group(1))
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str:
|
||||
"""Convert creator ID to a consistent URL format
|
||||
|
||||
Args:
|
||||
creator_id: Creator name/identifier (may be 'type/name' format)
|
||||
creator_type: Default type if not embedded in creator_id
|
||||
"""
|
||||
# Already a full URL
|
||||
if creator_id.startswith('http://') or creator_id.startswith('https://'):
|
||||
return creator_id
|
||||
|
||||
# Handle 'type/name' format from URL parser
|
||||
if '/' in creator_id:
|
||||
parts = creator_id.split('/', 1)
|
||||
creator_type = parts[0]
|
||||
creator_id = parts[1]
|
||||
|
||||
return f"https://www.pornhub.com/{creator_type}/{creator_id}"
|
||||
|
||||
def _get_listing_url(self, url: str) -> str:
|
||||
"""Get the URL to use for listing videos from a creator page.
|
||||
|
||||
For pornstars and models, append /videos to get the video listing.
|
||||
For channels and users, the base URL already lists videos.
|
||||
"""
|
||||
# Parse out the type
|
||||
parsed = self.extract_creator_id(url)
|
||||
if parsed:
|
||||
creator_type, _ = parsed
|
||||
if creator_type in ('pornstar', 'model'):
|
||||
# Strip any trailing slash and append /videos
|
||||
url = url.rstrip('/')
|
||||
if not url.endswith('/videos'):
|
||||
url = f"{url}/videos"
|
||||
return url
|
||||
|
||||
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """
    Resolve a creator's display name and identifiers.

    The name is looked up in three steps, stopping at the first hit: the
    <h1> inside the profile page's nameSubscribe div, the first yt-dlp
    flat-playlist entry (channel, uploader, or playlist title), and finally
    the URL slug turned into title case.

    Returns:
        Dict with creator_id, creator_name, creator_url and creator_type,
        or None when yt-dlp is unavailable or no name could be determined
    """
    if not self.is_available():
        return None

    type_and_slug = self.extract_creator_id(url)
    creator_type = type_and_slug[0] if type_and_slug else 'pornstar'
    creator_name = None

    # Step 1: scrape the display name from the profile page
    try:
        page_html = await self.get_profile_page(url)
        if page_html:
            heading = re.search(
                r'<div class="nameSubscribe">.*?<h1[^>]*>\s*(.+?)\s*</h1>',
                page_html,
                re.DOTALL,
            )
            if heading:
                creator_name = html_module.unescape(heading.group(1).strip())
                self.log(f"Found creator name from profile page: {creator_name}", 'debug')
    except Exception as e:
        self.log(f"Could not scrape creator name: {e}", 'debug')

    # Step 2: ask yt-dlp for the first playlist entry
    if not creator_name:
        try:
            listing_url = self._get_listing_url(url)
            cmd = self._get_base_cmd()
            cmd = cmd + ['--no-warnings', '--flat-playlist', '-j', '--playlist-items', '1', listing_url]

            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await proc.communicate()

            if proc.returncode == 0:
                output = stdout.decode('utf-8', errors='replace').strip()
                for line in output.split('\n'):
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    playlist_title = entry.get('playlist_title') or ''
                    creator_name = (
                        entry.get('channel')
                        or entry.get('uploader')
                        or playlist_title.replace(' - Videos', '')
                        or None
                    )
                    if creator_name:
                        creator_name = html_module.unescape(creator_name)
                        break
        except Exception as e:
            self.log(f"yt-dlp creator info failed: {e}", 'debug')

    # Step 3: derive a name from the URL slug
    if not creator_name and type_and_slug:
        creator_name = type_and_slug[1].replace('-', ' ').title()

    if not creator_name:
        return None

    return {
        'creator_id': type_and_slug[1] if type_and_slug else None,
        'creator_name': creator_name,
        'creator_url': url,
        'creator_type': creator_type,
    }
|
||||
|
||||
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """
    List a creator's videos using yt-dlp's --flat-playlist mode for speed.

    Args:
        url: Pornhub creator URL
        since_date: Accepted for interface compatibility. This method does
            not filter on it (the flat listing normally carries no upload
            date to compare against).
        max_videos: Maximum number of videos to return (falsy = no limit)
        progress_callback: Optional callable(count), invoked after each
            video is appended to the result

    Returns:
        List of video metadata dicts with the keys video_id, title,
        description, upload_date, duration, view_count, thumbnail and url.
        Empty list when yt-dlp is unavailable or the listing fails.
    """
    if not self.is_available():
        return []

    try:
        listing_url = self._get_listing_url(url)

        # Use --flat-playlist for fast listing (avoids per-video HTTP requests)
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--socket-timeout', '30',
            '--retries', '3',
            listing_url
        ]

        if max_videos:
            cmd.extend(['--playlist-items', f'1:{max_videos}'])

        self.log(f"Fetching videos from: {url}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get creator videos: {error}", 'warning')
            return []

        videos = []
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            # Only JSON objects describe entries; skip the playlist wrapper too
            if not isinstance(data, dict) or data.get('_type') == 'playlist':
                continue

            video_id = data.get('id')
            if not video_id:
                continue

            # Flat-playlist doesn't provide upload_date for Pornhub, but check anyway
            upload_date = data.get('upload_date')
            if upload_date:
                try:
                    upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                except (TypeError, ValueError):
                    # Keep the raw value when it is not a YYYYMMDD string
                    pass

            # Decode HTML entities in title (flat-playlist returns them encoded).
            # `or` (not a .get default) so an explicit null title falls back
            # instead of raising and discarding the whole listing.
            title = html_module.unescape(str(data.get('title') or f'Video {video_id}'))

            # Build video URL
            video_url = (data.get('webpage_url') or data.get('url')
                         or f"https://www.pornhub.com/view_video.php?viewkey={video_id}")

            videos.append({
                'video_id': video_id,
                'title': title,
                'description': data.get('description', ''),
                'upload_date': upload_date,
                'duration': data.get('duration'),
                'view_count': data.get('view_count'),
                'thumbnail': data.get('thumbnail'),
                'url': video_url,
            })

            if progress_callback:
                progress_callback(len(videos))

            if max_videos and len(videos) >= max_videos:
                break

        self.log(f"Found {len(videos)} videos", 'info')
        return videos

    except Exception as e:
        self.log(f"Error getting creator videos: {e}", 'error')
        return []
|
||||
|
||||
async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                         progress_callback=None) -> Dict:
    """
    Fetch one video with yt-dlp and report where it was saved.

    Args:
        video_url: Pornhub video URL
        output_dir: Directory to save the video (created if missing)
        quality: Key into QUALITY_PRESETS; unknown keys use the 'best' preset
        progress_callback: Accepted but not invoked by this method

    Returns:
        Dict with 'success' plus either an 'error' message or file details
        (file_path, filename, file_size and, when yt-dlp printed JSON,
        title, duration, video_id, upload_date, timestamp, thumbnail)
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}

    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        name_template = str(target_dir / '%(title).100s_%(id)s.%(ext)s')
        presets = self.QUALITY_PRESETS
        selected_format = presets.get(quality, presets['best'])

        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '-f', selected_format,
            '-o', name_template,
            '--print-json',
            '--no-playlist',
            '--user-agent', 'Mozilla/5.0',
            '--referer', 'https://www.pornhub.com/',
            '--merge-output-format', 'mp4',
            '--concurrent-fragments', '4',
            '--no-part',
            '--retries', '20',
            video_url
        ]

        self.log(f"Downloading video: {video_url}", 'debug')

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        out_bytes, err_bytes = await proc.communicate()

        if proc.returncode != 0:
            # Collapse well-known failures into short messages; truncate the rest
            message = err_bytes.decode('utf-8', errors='replace').strip()
            if 'Video unavailable' in message or 'not available' in message:
                message = 'Video unavailable or private'
            elif 'premium' in message.lower():
                message = 'Video requires premium access'
            elif len(message) > 200:
                message = message[:200] + '...'
            return {'success': False, 'error': message}

        # The first stdout line that parses as JSON is taken as the video info
        info = None
        for raw_line in out_bytes.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                info = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            break

        if not info:
            # No usable JSON: fall back to the most recently modified .mp4
            candidates = list(target_dir.glob('*.mp4'))
            if not candidates:
                return {'success': False, 'error': 'Could not find downloaded file'}
            newest = max(candidates, key=lambda f: f.stat().st_mtime)
            return {
                'success': True,
                'file_path': str(newest),
                'filename': newest.name,
                'file_size': newest.stat().st_size
            }

        reported = info.get('_filename') or info.get('filename')
        saved = Path(reported) if reported else None

        if saved and saved.exists():
            size = saved.stat().st_size
        else:
            size = info.get('filesize')

        return {
            'success': True,
            'file_path': str(saved) if saved else None,
            'filename': saved.name if saved else None,
            'file_size': size,
            'title': info.get('title'),
            'duration': info.get('duration'),
            'video_id': info.get('id'),
            'upload_date': info.get('upload_date'),
            'timestamp': info.get('timestamp'),
            'thumbnail': info.get('thumbnail'),
        }

    except Exception as e:
        self.log(f"Error downloading video: {e}", 'error')
        return {'success': False, 'error': str(e)}
|
||||
|
||||
async def get_profile_page(self, url: str) -> Optional[str]:
    """Return the profile page HTML for a creator URL, fetched via aiohttp.

    A trailing /videos segment is removed before fetching. The outcome is
    stored in self._profile_page_cache keyed by the stripped URL, including
    None when the fetch fails or the response is not HTTP 200, so later
    avatar/banner/bio lookups do not hit the network again."""
    profile_url = re.sub(r'/videos/?$', '', url)

    cache = self._profile_page_cache
    if profile_url in cache:
        return cache[profile_url]

    try:
        import aiohttp

        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        # Flatten the stored cookie records into a name -> value mapping
        session_cookies = {}
        stored_cookies = self._get_cookies_list()
        if stored_cookies:
            session_cookies = {
                entry.get('name', ''): entry.get('value', '')
                for entry in stored_cookies
                if entry.get('name', '')
            }

        request_timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(cookies=session_cookies) as session:
            async with session.get(profile_url, headers=request_headers,
                                   timeout=request_timeout) as resp:
                if resp.status == 200:
                    page = await resp.text()
                    cache[profile_url] = page
                    return page

    except Exception as e:
        self.log(f"Could not fetch profile page: {e}", 'debug')

    # Remember the failure so callers do not retry the same URL
    cache[profile_url] = None
    return None
|
||||
|
||||
async def get_profile_image(self, url: str) -> Optional[str]:
    """Return the avatar URL found on the profile page, or None.

    Looks for <img id="getAvatar" src="..."> first, then for an og:image
    meta tag in either attribute order."""
    try:
        markup = await self.get_profile_page(url)
        if not markup:
            return None

        avatar = re.search(r'<img[^>]*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', markup)
        if avatar:
            self.log("Found Pornhub profile avatar", 'debug')
            return avatar.group(1)

        # og:image fallback; the two patterns cover both attribute orders
        og_patterns = (
            r'<meta\s+property="og:image"\s+content="([^"]+)"',
            r'<meta\s+content="([^"]+)"\s+property="og:image"',
        )
        for pattern in og_patterns:
            og_image = re.search(pattern, markup)
            if og_image:
                return og_image.group(1)

    except Exception as e:
        self.log(f"Could not fetch profile image: {e}", 'debug')

    return None
|
||||
|
||||
async def get_profile_bio(self, url: str) -> Optional[str]:
    """Return the bio/about text from the profile page, or None.

    Tries the aboutMeSection markup first and the biographyAbout markup
    second; tags are stripped and HTML entities decoded before returning."""
    # (pattern, log message) pairs, tried in order.
    # Primary structure: <section class="aboutMeSection ..."><div class="title">About Name</div><div>Bio text</div></section>
    attempts = (
        (r'<section\s+class="aboutMeSection[^"]*"[^>]*>.*?<div class="title">[^<]*</div>\s*<div>\s*(.*?)\s*</div>',
         "Found Pornhub profile bio"),
        (r'class="biographyAbout[^"]*"[^>]*>.*?<div class="content[^"]*">(.*?)</div>',
         "Found Pornhub profile bio (fallback)"),
    )

    try:
        markup = await self.get_profile_page(url)
        if not markup:
            return None

        for pattern, found_message in attempts:
            hit = re.search(pattern, markup, re.DOTALL)
            if not hit:
                continue
            plain = re.sub(r'<[^>]+>', '', hit.group(1)).strip()
            if plain:
                self.log(found_message, 'debug')
                return html_module.unescape(plain)

    except Exception as e:
        self.log(f"Could not fetch profile bio: {e}", 'debug')

    return None
|
||||
|
||||
async def get_profile_banner(self, url: str) -> Optional[str]:
    """Return the banner/cover image URL from the profile page, or None.

    Prefers <img id="coverPictureDefault" src="...">, then the first img
    directly inside <div class="coverImage">."""
    # (pattern, regex flags, log message), tried in order
    attempts = (
        (r'<img[^>]*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']',
         0, "Found Pornhub profile banner"),
        (r'<div class="coverImage">\s*<img[^>]*src=["\']([^"\']+)["\']',
         re.DOTALL, "Found Pornhub profile banner (div)"),
    )

    try:
        markup = await self.get_profile_page(url)
        if not markup:
            return None

        for pattern, flags, found_message in attempts:
            cover = re.search(pattern, markup, flags)
            if cover:
                self.log(found_message, 'debug')
                return cover.group(1)

    except Exception as e:
        self.log(f"Could not fetch profile banner: {e}", 'debug')

    return None
|
||||
|
||||
async def get_profile_info(self, url: str) -> Optional[Dict]:
    """Collect the label/value pairs shown in the page's infoPiece blocks.

    Keys are the labels with tags removed, the trailing colon dropped,
    lower-cased and with spaces turned into underscores (e.g.
    'career_start_and_end'). Returns None when the page is unavailable or
    no pair was found."""
    markup = await self.get_profile_page(url)
    if not markup:
        return None

    tag_pattern = r'<[^>]+>'
    # Each infoPiece holds "<span>Label:</span> value" (Gender, Birth Place, Height, etc.)
    pieces = re.findall(
        r'<div class="infoPiece">\s*<span>\s*(.*?)\s*</span>\s*(.*?)\s*</div>',
        markup, re.DOTALL
    )

    details = {}
    for raw_label, raw_value in pieces:
        label_text = re.sub(tag_pattern, '', raw_label).strip().rstrip(':')
        value_text = re.sub(tag_pattern, '', raw_value).strip()
        if not (label_text and value_text):
            continue
        details[label_text.lower().replace(' ', '_')] = value_text

    return details or None
|
||||
|
||||
async def get_joined_date(self, url: str) -> Optional[str]:
    """Return the career start year from the profile info, or None.

    Reads the 'career_start_and_end' entry (e.g. "2011 to Present") and
    returns its leading four-digit year as a string. Profiles without that
    entry yield None."""
    try:
        details = await self.get_profile_info(url)
        if not details:
            return None

        career_span = details.get('career_start_and_end')
        if not career_span:
            # User/model pages might not have career info
            return None

        # "2011 to Present" -> "2011"
        leading_year = re.match(r'(\d{4})', career_span)
        return leading_year.group(1) if leading_year else None

    except Exception as e:
        self.log(f"Could not get joined date: {e}", 'debug')
        return None
|
||||
|
||||
async def get_creator(self, url: str) -> Optional[Creator]:
    """
    Build a Creator object for a creator URL.

    Returns None when get_creator_info() yields nothing. The creator_id is
    'type/name' when the URL can be parsed by extract_creator_id(),
    otherwise the creator_id reported by get_creator_info().
    """
    info = await self.get_creator_info(url)
    if not info:
        return None

    # Build creator_id as 'type/name' format
    parsed = self.extract_creator_id(url)
    creator_id = f"{parsed[0]}/{parsed[1]}" if parsed else info.get('creator_id', '')

    # get_profile_page() caches its result, so this reuses the page when it
    # was already fetched for this URL
    avatar_url = await self.get_profile_image(url)

    return Creator(
        creator_id=creator_id,
        service_id='pornhub',
        platform='pornhub',
        username=info.get('creator_name', 'Unknown'),
        display_name=info.get('creator_name'),
        profile_image_url=avatar_url,
    )
|
||||
|
||||
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """
    Get a creator's videos as Post objects, one video attachment per post.

    All arguments are forwarded unchanged to get_creator_videos().
    """
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)

    # creator_id is 'type/name' when the URL parses, otherwise empty
    parsed = self.extract_creator_id(url)
    creator_id = '' if not parsed else f"{parsed[0]}/{parsed[1]}"

    def _to_post(video):
        # The video page URL serves as both server path and download URL
        media = Attachment(
            name=f"{video['title']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=video['url'],
            download_url=video['url'],
            duration=video.get('duration'),
        )
        return Post(
            post_id=video['video_id'],
            service_id='pornhub',
            platform='pornhub',
            creator_id=creator_id,
            title=video['title'],
            content=video.get('description') or video['title'],
            published_at=video.get('upload_date'),
            attachments=[media],
        )

    return [_to_post(video) for video in videos]
|
||||
678
modules/paid_content/reddit_client.py
Normal file
678
modules/paid_content/reddit_client.py
Normal file
@@ -0,0 +1,678 @@
|
||||
"""
|
||||
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
|
||||
|
||||
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
|
||||
to produce Post/Attachment objects for the paid content system.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class RedditClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching Reddit subreddit content via gallery-dl.
|
||||
|
||||
gallery-dl downloads files during fetch, so attachments come with local_path
|
||||
already set. The sync handler moves files to their final location.
|
||||
"""
|
||||
|
||||
SERVICE_ID = 'reddit'
|
||||
PLATFORM = 'reddit'
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging, keep the database handle and locate gallery-dl.

    gallery-dl is resolved from PATH; when it is not found there, the
    bundled virtualenv location is used instead."""
    self._init_logger('PaidContent', log_callback, default_module='Reddit')
    self.unified_db = unified_db

    located = shutil.which('gallery-dl')
    self.gallery_dl_path = located if located else '/opt/media-downloader/venv/bin/gallery-dl'
|
||||
|
||||
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info from Reddit's public about.json endpoint.

    Args:
        subreddit: Subreddit name (without r/)

    Returns:
        Dict with creator_id, creator_name, display_name, bio, joined_date,
        profile_image_url and banner_image_url on success. On HTTP errors
        other than 404 only creator_id and creator_name are returned so a
        later sync can verify the subreddit. None on 404 or any other
        failure.
    """
    import urllib.request
    import urllib.error

    def _clean_image_url(raw_url):
        # Drop the query string, then decode HTML-escaped ampersands left in the URL
        if not raw_url:
            return None
        return raw_url.split('?')[0].replace('&amp;', '&') or None

    def _basic_info():
        # Minimal record used when the subreddit exists but details are unavailable
        return {
            'creator_id': subreddit.lower(),
            'creator_name': f'r/{subreddit}',
        }

    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())

        # `or` guards against explicit nulls, which .get defaults do not cover
        sub_data = data.get('data') or {}
        display_name = sub_data.get('display_name') or subreddit
        title = sub_data.get('title') or ''

        # Extract icon — community_icon is higher res, icon_img is fallback
        icon_url = _clean_image_url(sub_data.get('community_icon') or sub_data.get('icon_img'))

        # Extract banner — banner_background_image is the main one
        banner_url = _clean_image_url(
            sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image')
        )

        # Build bio from title + public description + subscriber count
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        bio = ' — '.join(bio_parts) if bio_parts else None

        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (TypeError, ValueError, OverflowError, OSError):
                # A malformed timestamp should not discard the rest of the info
                pass

        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'

        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url,
            'banner_image_url': banner_url,
        }

    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        if e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
        else:
            # Return basic info and let sync verify
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
        return _basic_info()
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
|
||||
|
||||
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
              progress_callback=None) -> tuple:
    """Fetch posts and download media from a subreddit using gallery-dl.

    Args:
        subreddit: Subreddit name (without r/)
        since_date: ISO date string; skip posts older than this
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Optional callable(downloaded_count, skipped_count,
            total_seen), passed through to run_gallery_dl for live progress

    Returns:
        Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
        when done moving files. Returns ([], None) on failure or when
        nothing was downloaded; the temp dir is removed in that case.
    """
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')

    try:
        # run_gallery_dl returns only counters; the downloaded file paths are
        # delivered through batch_callback, so collect them here.
        downloaded = []
        self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
                            progress_callback=progress_callback,
                            batch_callback=downloaded.extend)

        if not downloaded:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

        # Group files by post using metadata sidecars
        grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)

        if not grouped:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

        posts = []
        for post_id, post_data in grouped.items():
            attachments = []
            for file_path in post_data['files']:
                ext = file_path.suffix.lower()
                file_type = self._detect_file_type(ext)

                attachments.append(Attachment(
                    name=file_path.name,
                    file_type=file_type,
                    extension=ext,
                    server_path=str(file_path),  # temp path, will be moved
                    download_url=None,  # Already downloaded
                    file_size=file_path.stat().st_size if file_path.exists() else None,
                ))

            if not attachments:
                continue

            posts.append(Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=subreddit.lower(),
                title=post_data.get('title'),
                content=post_data.get('title'),
                published_at=post_data.get('date'),
                attachments=attachments,
            ))

        self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
        return posts, temp_dir

    except Exception as e:
        self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
        shutil.rmtree(temp_dir, ignore_errors=True)
        return [], None
|
||||
|
||||
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.
    stderr is drained on a background thread so a chatty gallery-dl cannot
    fill the pipe and stall while stdout is being read.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: ISO date string; only its YYYY-MM-DD prefix is used as a
            lower bound. Ignored (with a warning) when it is not a valid date.
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import threading
    import time

    # Use a separate download archive for paid content reddit
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]

    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])

    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])

    # Date filtering. The cutoff is validated before being embedded in the
    # filter expression, so malformed input never reaches gallery-dl.
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            datetime.strptime(cutoff, '%Y-%m-%d')
        except (TypeError, ValueError):
            self.log(f"Ignoring invalid since_date for r/{subreddit}: {since_date!r}", 'warning')
        else:
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])

    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []
    stderr_chunks = []
    timeout_secs = 7200  # 2 hours
    proc = None
    drainer = None

    def _drain_stderr(stream):
        # Read stderr to EOF so the pipe never fills; keep only the first few KB
        kept = 0
        try:
            for err_line in stream:
                if kept < 4096:
                    stderr_chunks.append(err_line)
                    kept += len(err_line)
        except (OSError, ValueError):
            pass

    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        drainer = threading.Thread(target=_drain_stderr, args=(proc.stderr,), daemon=True)
        drainer.start()

        start_time = time.time()
        timed_out = False

        while True:
            # NOTE: readline() blocks, so the deadline is only checked between lines
            if time.time() - start_time > timeout_secs:
                timed_out = True
                break

            line = proc.stdout.readline()
            if not line:
                # EOF: gallery-dl closed stdout
                break

            line = line.strip()
            if not line:
                continue

            if line.startswith('# '):
                # Skipped file (already in archive)
                skip_count += 1
            else:
                # Downloaded file — gallery-dl prints the full path
                dl_count += 1
                file_path = Path(line)
                if file_path.exists() and not file_path.name.endswith('.json'):
                    pending_files.append(file_path)

            total = dl_count + skip_count
            if progress_callback and total % 5 == 0:
                progress_callback(dl_count, skip_count, total)

            # Flush batch for processing
            if batch_callback and len(pending_files) >= batch_size:
                batch_callback(list(pending_files))
                pending_files.clear()

        if not timed_out:
            # Wait for exit, still bounded by the overall deadline
            remaining = timeout_secs - (time.time() - start_time)
            try:
                proc.wait(timeout=max(remaining, 0.1))
            except subprocess.TimeoutExpired:
                timed_out = True

        if timed_out:
            proc.kill()
            self.log(f"gallery-dl timed out for r/{subreddit}", 'error')

        proc.wait()
        drainer.join(timeout=5)

        # Final batch
        if batch_callback and pending_files:
            batch_callback(list(pending_files))
            pending_files.clear()

        if progress_callback:
            progress_callback(dl_count, skip_count, dl_count + skip_count)

        returncode = proc.returncode
        if returncode not in (None, 0, 1, 4, 5):
            stderr = ''.join(stderr_chunks)
            self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
            if stderr:
                self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')
    finally:
        # Never leave the child running or its pipes open, even when a
        # callback raised part-way through
        if proc is not None:
            if proc.poll() is None:
                proc.kill()
                proc.wait()
            if drainer is not None:
                drainer.join(timeout=5)
            for stream in (proc.stdout, proc.stderr):
                if stream is not None:
                    try:
                        stream.close()
                    except (OSError, ValueError):
                        pass

    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
def _group_files_by_post(self, files: List[Path], temp_dir: str,
                         subreddit: str) -> Dict[str, Dict]:
    """Group downloaded files by Reddit post ID using metadata JSON sidecars.

    Adapted from reddit_community_monitor.py:_group_files_by_post

    For each file the sidecar is looked up as "<name>.<ext>.json" first and
    "<name>.json" second. Unreadable or non-object sidecars are treated as
    missing metadata, in which case the post ID is derived from the file
    name and the date defaults to the current time.

    Args:
        files: Downloaded media files
        temp_dir: Download directory (not used by the grouping itself)
        subreddit: Subreddit name used when the metadata names none

    Returns:
        Dict mapping reddit_post_id -> {
            'files': [Path],
            'title': str,
            'date': str,
            'source_url': str
        }
        Title, date and source_url come from the first file seen for a post.
    """
    def _epoch_to_iso(value):
        # Convert a Unix timestamp to a UTC ISO string; None when unusable
        try:
            return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
        except (TypeError, ValueError, OverflowError, OSError):
            return None

    posts: Dict[str, Dict] = {}

    for file_path in files:
        # Look for matching metadata JSON sidecar
        json_path = file_path.with_suffix(file_path.suffix + '.json')
        if not json_path.exists():
            json_path = file_path.with_suffix('.json')

        metadata = {}
        if json_path.exists():
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    loaded = json.load(f)
            except (OSError, ValueError) as e:
                self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
            else:
                # Only a JSON object can carry the fields read below
                if isinstance(loaded, dict):
                    metadata = loaded
                else:
                    self.log(f"Failed to parse metadata for {file_path.name}: not a JSON object", 'debug')

        # Extract Reddit post ID from the first key holding a usable value
        reddit_post_id = None
        for key in ('id', 'reddit_id', 'parent_id'):
            candidate = metadata.get(key)
            if candidate not in (None, ''):
                reddit_post_id = str(candidate)
                break

        if not reddit_post_id:
            # Filename-based fallback: subreddit_postid_num.ext
            parts = file_path.stem.split('_')
            if len(parts) >= 3:
                reddit_post_id = parts[-2]
            elif len(parts) == 2:
                reddit_post_id = parts[-1]
            else:
                reddit_post_id = file_path.stem

        # Extract post date
        post_date = None
        date_val = metadata.get('date')
        if isinstance(date_val, str):
            # String dates are read as UTC and rendered as naive local time
            for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                try:
                    utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
                except ValueError:
                    continue
                post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                break
            if not post_date:
                # Unrecognised format: pass the raw string through
                post_date = date_val
        elif isinstance(date_val, (int, float)):
            post_date = _epoch_to_iso(date_val)

        if not post_date and 'created_utc' in metadata:
            post_date = _epoch_to_iso(metadata['created_utc'])

        if not post_date:
            post_date = datetime.now().isoformat()

        title = metadata.get('title', metadata.get('description', ''))
        sub = metadata.get('subreddit', subreddit)
        source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''

        if reddit_post_id not in posts:
            posts[reddit_post_id] = {
                'files': [],
                'title': title,
                'date': post_date,
                'source_url': source_url,
            }

        posts[reddit_post_id]['files'].append(file_path)

    return posts
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
    """Return the Reddit cookies JSON stored in the scrapers table, if any.

    Returns None when no database is configured, no non-empty cookies value
    exists for the 'reddit' scraper, or the lookup fails."""
    db = self.unified_db
    if not db:
        return None

    try:
        with db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(
                "SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
            )
            record = cur.fetchone()
            stored = record[0] if record else None
            if stored:
                return stored
    except Exception as e:
        self.log(f"Could not load Reddit cookies: {e}", 'debug')

    return None
|
||||
|
||||
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
    """Convert a JSON cookies array to a Netscape cookie file.

    Entries that are not JSON objects are skipped, and an expiry that cannot
    be read as a number is written as 0, so one odd cookie does not discard
    the rest. The file is written in a single pass after all lines have been
    built, so a failure never leaves a half-written file behind.

    Args:
        cookies_json: JSON text holding a list of cookie objects (domain,
            path, secure, name, value and one of expirationDate / expiry /
            expires)
        output_path: Destination file path

    Returns:
        True when the file was written; False when the input is not a JSON
        list or the file cannot be written.
    """
    try:
        cookies = json.loads(cookies_json)
        if not isinstance(cookies, list):
            return False

        lines = [
            "# Netscape HTTP Cookie File\n",
            "# https://curl.haxx.se/docs/http-cookies.html\n\n",
        ]
        for cookie in cookies:
            if not isinstance(cookie, dict):
                continue

            domain = cookie.get('domain') or ''
            include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
            path = cookie.get('path') or '/'
            secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'

            expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
            try:
                expires = str(int(float(expires))) if expires is not None else '0'
            except (TypeError, ValueError, OverflowError):
                # e.g. "Session" or a date string
                expires = '0'

            name = cookie.get('name') or ''
            value = cookie.get('value') or ''
            lines.append(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")

        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)

        return True
    except (OSError, TypeError, ValueError) as e:
        self.log(f"Failed to write Netscape cookie file: {e}", 'error')
        return False
|
||||
|
||||
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the archive in ascending ``created_utc`` order, 100
    posts per request, sleeping 2 seconds between pages. Posts whose ``id``
    was already collected are not added again, so a post returned on two
    adjacent pages appears once in the result.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...].
        On a request error, or after too many consecutive HTTP 429 responses,
        whatever was collected so far is returned.
    """
    import time
    import urllib.error
    import urllib.parse
    import urllib.request

    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    page_size = 100
    # Consecutive HTTP 429 responses tolerated before giving up; without a
    # bound a persistently rate-limited client would loop forever.
    max_rate_limit_retries = 10

    all_posts = []
    seen_ids = set()
    current_after = after_ts
    page = 0
    rate_limit_hits = 0

    while True:
        query = {
            'subreddit': subreddit,
            'size': page_size,
            'sort': 'asc',
            'sort_type': 'created_utc',
            'after': current_after,
        }
        if before_ts is not None:
            query['before'] = before_ts
        url = f'{base_url}?{urllib.parse.urlencode(query)}'

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                rate_limit_hits += 1
                if rate_limit_hits > max_rate_limit_retries:
                    self.log(f"Pullpush still rate limited after {max_rate_limit_retries} "
                             f"retries for r/{subreddit}, giving up", 'error')
                    break
                self.log("Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break

        rate_limit_hits = 0
        page += 1  # count only pages that were actually received

        posts = data.get('data') if isinstance(data, dict) else None
        if not posts:
            break

        for post in posts:
            post_id = post.get('id', '')
            # Skip posts already collected from an earlier page; entries
            # without an id cannot be de-duplicated and are always kept.
            if post_id:
                if post_id in seen_ids:
                    continue
                seen_ids.add(post_id)
            all_posts.append({
                'id': post_id,
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })

        # Coerce to int so a float/str timestamp still yields a clean cursor.
        try:
            last_ts = int(float(posts[-1].get('created_utc', 0) or 0))
        except (TypeError, ValueError):
            last_ts = 0

        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")

        # Always move the cursor forward: if the last timestamp did not
        # advance (same timestamp repeating, or missing), step by one second.
        if last_ts > current_after:
            current_after = last_ts
        else:
            current_after = current_after + 1

        # If we got fewer than a full page, we've reached the end
        if len(posts) < page_size:
            break

        # Rate limit: 2s between requests
        time.sleep(2)

    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
|
||||
|
||||
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Streams gallery-dl's stdout line by line: lines starting with "# " are
    counted as skipped, every other non-empty line is counted as a download
    and, when it names an existing non-.json file, queued for batch_callback.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total. Failures are logged, not raised
        (except for errors creating the archive directory).
    """
    import tempfile
    import threading

    # Same archive as normal Reddit paid content sync
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]

    # Convert the stored Reddit cookies (if any) into a file gallery-dl can read.
    cookie_path = None
    cookies_json = self._get_cookies_file()
    if cookies_json:
        cookie_path = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_json, cookie_path):
            cmd.extend(['--cookies', cookie_path])

    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []
    timeout_secs = 14400  # 4 hours for backfill (can be large)
    timed_out = threading.Event()
    proc = None
    watchdog = None

    try:
        # stderr goes to a temp file rather than a pipe: an undrained stderr
        # pipe can fill up and block the child while we wait on stdout.
        with tempfile.TemporaryFile() as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )

            def _kill_on_timeout():
                timed_out.set()
                proc.kill()

            # Enforce the timeout from a timer thread; a check inside the read
            # loop cannot run while the read is blocked waiting for output.
            watchdog = threading.Timer(timeout_secs, _kill_on_timeout)
            watchdog.daemon = True
            watchdog.start()

            for raw_line in proc.stdout:
                line = raw_line.strip()
                if not line:
                    continue

                if line.startswith('# '):
                    skip_count += 1
                else:
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)

                if progress_callback:
                    progress_callback(dl_count, skip_count, dl_count + skip_count)

                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()

            proc.wait()
            watchdog.cancel()

            if timed_out.is_set():
                self.log("gallery-dl backfill timed out", 'error')

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read(4096).decode('utf-8', errors='replace')
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        self.log(f"gallery-dl backfill failed: {e}", 'error')
    finally:
        if watchdog is not None:
            watchdog.cancel()
        if proc is not None:
            # Don't leave the child running if we bailed out early
            # (for example because a callback raised).
            if proc.poll() is None:
                proc.kill()
                proc.wait()
            if proc.stdout:
                proc.stdout.close()
        if cookie_path:
            # Best-effort removal of the cookie file once gallery-dl is done.
            try:
                os.remove(cookie_path)
            except OSError:
                pass

    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
@staticmethod
|
||||
def _detect_file_type(ext: str) -> str:
|
||||
"""Detect file type from extension."""
|
||||
ext = ext.lower().lstrip('.')
|
||||
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
||||
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
|
||||
|
||||
if ext in image_exts:
|
||||
return 'image'
|
||||
elif ext in video_exts:
|
||||
return 'video'
|
||||
return 'unknown'
|
||||
9843
modules/paid_content/scraper.py
Normal file
9843
modules/paid_content/scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
259
modules/paid_content/snapchat_client.py
Normal file
259
modules/paid_content/snapchat_client.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Snapchat Client for Paid Content - Wraps SnapchatClientDownloader for paid content system.
|
||||
|
||||
Maps spotlights and highlights to the Post/Attachment model used by the paid content scraper.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class SnapchatPaidContentClient(LoggingMixin):
    """
    Client for fetching Snapchat creator content via the existing SnapchatClientDownloader.

    Each story/spotlight/highlight collection maps to one Post with snaps as Attachments.
    """

    SERVICE_ID = 'snapchat'
    PLATFORM = 'snapchat'

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging and remember the database handle.

        Args:
            unified_db: Optional database object forwarded to the downloader.
            log_callback: Optional callback handed to the logger setup.
        """
        self._init_logger('PaidContent', log_callback, default_module='Snapchat')
        self.unified_db = unified_db
        self._downloader = None  # created on first use by _get_downloader()

    def _get_downloader(self):
        """Lazy-init the underlying SnapchatClientDownloader."""
        if self._downloader is None:
            # Imported here so the downloader module is only loaded when needed.
            from modules.snapchat_client_module import SnapchatClientDownloader
            self._downloader = SnapchatClientDownloader(
                show_progress=False,
                use_database=False,
                log_callback=self.log_callback,
                unified_db=self.unified_db,
            )
        return self._downloader

    @staticmethod
    def _as_dict(value) -> Dict:
        """Return ``value`` when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _parse_cutoff(since_date: Optional[str]) -> Optional[datetime]:
        """Parse an ISO date or datetime string into a naive datetime.

        A datetime carrying a UTC offset is shifted to UTC and made naive so
        it can be compared with snap timestamps.
        NOTE(review): assumes snap timestamps are naive UTC datetimes - confirm.

        Returns:
            The parsed datetime, or None when the input is empty or unparseable.
        """
        if not since_date:
            return None
        try:
            if 'T' in since_date:
                # fromisoformat() on older Pythons rejects a trailing 'Z'.
                parsed = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                offset = parsed.utcoffset()
                if offset is not None:
                    parsed = (parsed - offset).replace(tzinfo=None)
                return parsed
            return datetime.strptime(since_date[:10], '%Y-%m-%d')
        except (ValueError, IndexError):
            return None

    def get_creator_info(self, username: str) -> Optional[Dict]:
        """Get creator information from the profile page's __NEXT_DATA__ payload.

        Args:
            username: Snapchat username (without @)

        Returns:
            Dict with ``creator_id``, ``creator_name`` and ``profile_image_url``.
            The name falls back to ``username`` and the image to None when the
            page cannot be fetched or the expected fields are absent.
        """
        downloader = self._get_downloader()

        display_name = username
        avatar_url = None

        profile_url = f"https://story.snapchat.com/@{username}"
        html = downloader._fetch_page(profile_url)
        data = downloader._extract_next_data(html) if html else None

        if data:
            # JSON nulls or unexpected types at any level are treated as "absent".
            props = self._as_dict(self._as_dict(self._as_dict(data).get('props')).get('pageProps'))

            # userProfile uses a $case/userInfo wrapper
            user_profile = self._as_dict(props.get('userProfile'))
            user_info = self._as_dict(user_profile.get('userInfo'))
            if user_info:
                name = str(user_info.get('displayName') or '').strip()
                if name:
                    display_name = name

                # Bitmoji 3D avatar URL (best quality)
                bitmoji = self._as_dict(user_info.get('bitmoji3d'))
                avatar_url = bitmoji.get('avatarUrl') or bitmoji.get('url')

            # linkPreview OG images as avatar (preview/square.jpeg — good quality)
            if not avatar_url:
                link_preview = self._as_dict(props.get('linkPreview'))
                for img_key in ('facebookImage', 'twitterImage'):
                    img = link_preview.get(img_key, {})
                    if isinstance(img, dict) and img.get('url'):
                        avatar_url = img['url']
                        break

            # pageMetadata.pageTitle sometimes has the display name
            if display_name == username:
                page_meta = self._as_dict(props.get('pageMetadata'))
                page_title = page_meta.get('pageTitle', '')
                # Format: "DisplayName (@username) | Snapchat..."
                if isinstance(page_title, str) and '(@' in page_title:
                    name_part = page_title.split('(@')[0].strip()
                    if name_part:
                        display_name = name_part

        return {
            'creator_id': username,
            'creator_name': display_name,
            'profile_image_url': avatar_url,
        }

    def get_creator(self, username: str) -> Optional[Creator]:
        """Get Creator model for a Snapchat user.

        Returns:
            A Creator built from get_creator_info(), or None if no info is available.
        """
        info = self.get_creator_info(username)
        if not info:
            return None

        return Creator(
            creator_id=username,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            # NOTE(review): username is filled from the display name here - confirm intended.
            username=info.get('creator_name', username),
            display_name=info.get('creator_name'),
            profile_image_url=info.get('profile_image_url'),
        )

    def get_posts(self, username: str, since_date: str = None) -> List[Post]:
        """Fetch stories, highlights and spotlights as Post objects.

        Args:
            username: Snapchat username (without @)
            since_date: ISO date string; collections whose newest snap is older
                than this are skipped. An unparseable value disables the filter.

        Returns:
            List of Post objects (one per story/highlight/spotlight collection
            that has at least one attachment)
        """
        downloader = self._get_downloader()
        cutoff_dt = self._parse_cutoff(since_date)

        # Discover content from profile (spotlights, highlights, stories)
        profile_content = downloader.get_profile_content(username) or {}
        spotlight_urls = profile_content.get('spotlights') or []
        highlight_collections = profile_content.get('highlight_collections') or []
        story_collection = profile_content.get('story_collection')

        self.log(f"Found {len(spotlight_urls)} spotlights, "
                 f"{len(highlight_collections)} highlights, "
                 f"{'stories' if story_collection else 'no stories'} "
                 f"for @{username}", 'info')

        posts = []

        def add_collection(collection) -> None:
            post = self._collection_to_post(collection, username, cutoff_dt)
            if post and post.attachments:
                posts.append(post)

        # Process story snaps (inline from profile page — no extra HTTP requests)
        if story_collection and story_collection.snaps:
            add_collection(story_collection)

        # Process highlights (inline from profile page — no extra HTTP requests)
        for collection in highlight_collections:
            add_collection(collection)

        # Process spotlights (still requires per-URL fetch for full metadata)
        for url in spotlight_urls:
            collection = downloader.get_spotlight_metadata(url)
            if collection:
                add_collection(collection)

        self.log(f"Mapped {len(posts)} posts with attachments for @{username}", 'info')
        return posts

    def _collection_to_post(self, collection, username: str, cutoff_dt=None) -> Optional[Post]:
        """Convert a SnapCollection to a Post with Attachments.

        Args:
            collection: Object exposing snaps, title, collection_type and collection_id.
            username: Creator id stored on the resulting Post.
            cutoff_dt: Optional datetime; the collection is dropped when its
                newest snap is older than this.

        Returns:
            The Post, or None when the collection has no snaps, is entirely
            older than the cutoff, or none of its snaps has a media URL.
        """
        if not collection.snaps:
            return None

        # Use the earliest snap timestamp as the post date
        timestamps = [s.timestamp for s in collection.snaps if s.timestamp]
        published_at = min(timestamps).strftime('%Y-%m-%d') if timestamps else None

        # Skip if all snaps are older than cutoff
        if cutoff_dt and timestamps and max(timestamps) < cutoff_dt:
            return None

        attachments = []
        for snap in collection.snaps:
            if not snap.media_url:
                continue

            # Determine extension from media type
            ext = '.mp4' if snap.media_type == 'video' else '.jpg'
            name = f"{snap.media_id}{ext}" if snap.media_id else f"snap_{snap.index}{ext}"

            attachments.append(Attachment(
                name=name,
                file_type=snap.media_type,
                extension=ext,
                server_path=snap.media_url,
                download_url=snap.media_url,
                width=snap.width if snap.width else None,
                height=snap.height if snap.height else None,
                # Milliseconds to whole seconds.
                duration=snap.duration_ms // 1000 if snap.duration_ms else None,
            ))

        if not attachments:
            return None

        # The collection title doubles as both post title and content
        title = collection.title or None
        content = collection.title if collection.title else None

        # Tag with the title-cased collection type
        tag_name = collection.collection_type.title()

        return Post(
            post_id=collection.collection_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=username,
            title=title,
            content=content,
            published_at=published_at,
            attachments=attachments,
            auto_tags=[tag_name],
        )

    def download_snap(self, media_url: str, output_path: str) -> bool:
        """Download a single snap file using the downloader's HTTP session.

        Args:
            media_url: Direct URL to the media file
            output_path: Local path to save the file

        Returns:
            True if download succeeded (HTTP 200 with a non-empty body)
        """
        import os
        downloader = self._get_downloader()
        session = downloader._get_session()

        try:
            # URLs taken from HTML may carry escaped ampersands.
            url = media_url.replace('&amp;', '&')
            resp = session.get(url, timeout=60)
            if resp.status_code == 200 and len(resp.content) > 0:
                parent_dir = os.path.dirname(output_path)
                if parent_dir:  # a bare filename has no directory to create
                    os.makedirs(parent_dir, exist_ok=True)
                with open(output_path, 'wb') as f:
                    f.write(resp.content)
                return True

            self.log(f"Download failed: HTTP {resp.status_code}, size={len(resp.content)}", 'warning')
            return False
        except Exception as e:
            self.log(f"Download error: {e}", 'error')
            return False
|
||||
508
modules/paid_content/soundgasm_client.py
Normal file
508
modules/paid_content/soundgasm_client.py
Normal file
@@ -0,0 +1,508 @@
|
||||
"""
|
||||
Soundgasm + Liltsome Archive Client for Paid Content
|
||||
|
||||
Handles:
|
||||
- Soundgasm profile scraping (no auth/Cloudflare needed)
|
||||
- Liltsome archive (liltsome.yerf.org) as supplementary source
|
||||
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
|
||||
- Direct HTTP audio downloads (.m4a)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
import aiohttp
|
||||
import aiofiles
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bracket tag helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Extract [bracket] tags from a title, normalize, return (clean_title, tags).

    Tags are stripped, lowercased and de-duplicated, keeping first-seen order.
    Each run of adjacent tags (with its surrounding whitespace) is replaced by
    a single space, so removing tags from the middle of a title does not
    leave double spaces behind.
    """
    raw_tags = re.findall(r'\[([^\]]+)\]', title)
    clean_title = re.sub(r'\s*(?:\[[^\]]+\]\s*)+', ' ', title).strip()

    lowered = (raw.strip().lower() for raw in raw_tags)
    # dict.fromkeys() de-duplicates while preserving insertion order.
    normalized: List[str] = list(dict.fromkeys(tag for tag in lowered if tag))
    return clean_title, normalized
|
||||
|
||||
|
||||
def format_tag_display(tag_lower: str) -> str:
    """Format a normalized lowercase tag for display.

    Gender tags (f4m, m4f, f4a …) → uppercase.
    Everything else → each word capitalized. Unlike str.title(), a letter
    that follows an apostrophe inside a word stays lowercase
    ("girlfriend's" → "Girlfriend's", not "Girlfriend'S").
    """
    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower):
        return tag_lower.upper()
    # A "word" is a run of letters, optionally followed by an apostrophe and
    # more letters; digits, hyphens and underscores still separate words.
    return re.sub(
        r"[^\W\d_]+(?:['’][^\W\d_]+)?",
        lambda word: word.group(0).capitalize(),
        tag_lower,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SoundgasmClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class SoundgasmClient(LoggingMixin):
|
||||
"""Client for fetching audio from Soundgasm and the Liltsome archive."""
|
||||
|
||||
SERVICE_ID = 'soundgasm'
|
||||
PLATFORM = 'soundgasm'
|
||||
|
||||
SOUNDGASM_BASE = 'https://soundgasm.net'
|
||||
LILTSOME_BASE = 'https://liltsome.yerf.org'
|
||||
LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
|
||||
LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
|
||||
LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
}
|
||||
|
||||
def __init__(self, log_callback=None):
|
||||
self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
|
||||
self._liltsome_data: Optional[Dict] = None # cached in-memory per sync run
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_profile_info(self, username: str) -> Optional[Dict]:
    """Summarise what is known about a user across Soundgasm and Liltsome.

    Returns:
        Dict with ``username``, ``post_count`` (the larger of the two
        sources' counts) and ``source`` ('soundgasm' when the profile page
        was reachable, else 'liltsome'); None when neither source knows
        the user. Lookup errors are logged at debug level and ignored.
    """
    origin = None
    count = 0

    # Soundgasm first: a reachable profile, even an empty one, sets the source.
    try:
        listing = await self._fetch_soundgasm_profile(username)
        if listing is not None:
            origin = 'soundgasm'
            count = len(listing)
    except Exception as e:
        self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')

    # The archive may hold more posts than the live profile.
    try:
        archived = await self._get_liltsome_entries(username)
        if archived:
            count = max(count, len(archived))
            origin = 'liltsome' if origin is None else origin
    except Exception as e:
        self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')

    if origin is not None or count != 0:
        return {
            'username': username,
            'post_count': count,
            'source': origin,
        }
    return None
|
||||
|
||||
async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Collect new posts for a user from Soundgasm, then from the Liltsome archive.

    Posts whose ``post_id`` is in ``known_post_ids`` or was already
    collected earlier in this call are dropped. A failure of either source
    is logged as a warning and does not stop the other.

    Args:
        username: Creator username.
        known_post_ids: Post ids to treat as already seen.
        progress_callback: Optional callable(collected_count), invoked once
            after each source has been processed.

    Returns:
        The newly collected posts, Soundgasm results first.
    """
    collected: List[Post] = []
    seen_ids: Set[str] = set(known_post_ids or set())

    def keep_unseen(batch) -> None:
        for item in batch:
            if item.post_id in seen_ids:
                continue
            seen_ids.add(item.post_id)
            collected.append(item)

    def report() -> None:
        if progress_callback:
            progress_callback(len(collected))

    # 1. Soundgasm (may fail if account deleted — that's OK)
    try:
        from_soundgasm = await self._fetch_soundgasm_posts(username, seen_ids)
        keep_unseen(from_soundgasm)
        self.log(f"Soundgasm: {len(from_soundgasm)} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')
    report()

    # 2. Liltsome archive (always)
    try:
        from_archive = await self._fetch_liltsome_posts(username, seen_ids)
        keep_unseen(from_archive)
        self.log(f"Liltsome: {len(from_archive)} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')
    report()

    return collected
|
||||
|
||||
async def download_audio(self, download_url: str, output_path: Path) -> Dict:
    """Download an audio file via direct HTTP GET.

    The body is streamed into a sibling ``<name>.part`` file and only moved
    to ``output_path`` once the transfer has finished, so a failed or
    interrupted download never leaves a truncated file at the final path.

    Args:
        download_url: URL of the audio file.
        output_path: Destination path (parent directories are created).

    Returns:
        ``{'success': True, 'file_path': str, 'file_size': int}`` on success,
        otherwise ``{'success': False, 'error': str}``.
    """
    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + '.part')

    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)

        timeout = aiohttp.ClientTimeout(total=300)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(download_url, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'HTTP {resp.status}'}

                total = 0
                async with aiofiles.open(str(partial_path), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):
                        await f.write(chunk)
                        total += len(chunk)

        # Publish the finished download in one step.
        partial_path.replace(output_path)

        return {
            'success': True,
            'file_path': str(output_path),
            'file_size': total,
        }

    except Exception as e:
        # Best-effort cleanup of the incomplete temp file.
        try:
            partial_path.unlink()
        except OSError:
            pass
        self.log(f"Download failed for {download_url}: {e}", 'error')
        return {'success': False, 'error': str(e)}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Soundgasm scraping
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
    """Scrape the Soundgasm profile page for the user's audio links.

    Returns:
        List of ``{'slug', 'title'}`` dicts, one per matching link (possibly
        empty); None when the page does not answer with HTTP 200 (statuses
        other than 404 are logged as a warning).
    """
    profile_url = f'{self.SOUNDGASM_BASE}/u/{username}'
    # Links look like <a href="https://soundgasm.net/u/{username}/{slug}">title</a>;
    # the host part is optional so relative hrefs match too.
    link_pattern = (
        r'<a\s+href="(?:https?://soundgasm\.net)?/u/'
        + re.escape(username)
        + r'/([^"]+)"[^>]*>\s*([^<]+)'
    )

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
        async with session.get(profile_url, headers=self.HEADERS) as resp:
            if resp.status != 200:
                if resp.status != 404:
                    self.log(f"Soundgasm profile returned {resp.status}", 'warning')
                return None
            page = await resp.text()

    return [
        {'slug': hit.group(1).strip(), 'title': hit.group(2).strip()}
        for hit in re.finditer(link_pattern, page, re.IGNORECASE)
    ]
|
||||
|
||||
async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Build Post objects for the user's Soundgasm audios not yet in ``seen_ids``.

    Each unseen slug from the profile listing has its detail page fetched;
    entries without a detail page or audio URL are skipped. Errors on a
    single entry are logged at debug level and do not stop the rest.
    """
    listing = await self._fetch_soundgasm_profile(username)
    if not listing:
        return []

    results: List[Post] = []

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
        for item in listing:
            slug = item['slug']
            if slug in seen_ids:
                continue

            try:
                detail = await self._fetch_soundgasm_detail(session, username, slug)
                if detail is None:
                    continue

                raw_title = detail.get('title', item.get('title', slug))
                clean_title, tags = parse_bracket_tags(raw_title)
                description = detail.get('description', '')
                audio_url = detail.get('audio_url')
                if not audio_url:
                    continue

                # Take the extension from the URL's last path segment,
                # defaulting to .m4a when that segment has no dot.
                ext = '.m4a'
                last_segment = audio_url.split('?')[0].split('/')[-1]
                _, dot, suffix = last_segment.rpartition('.')
                if dot:
                    ext = '.' + suffix

                audio = Attachment(
                    name=f"{slug}{ext}",
                    file_type='audio',
                    extension=ext.lstrip('.'),
                    server_path=f'/u/{username}/{slug}',
                    download_url=audio_url,
                )

                results.append(Post(
                    post_id=slug,
                    service_id='soundgasm',
                    platform='soundgasm',
                    creator_id=username,
                    title=clean_title or None,
                    content=description or None,
                    published_at=None,  # Soundgasm has no dates
                    attachments=[audio],
                    auto_tags=tags,
                ))

            except Exception as e:
                self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')

    return results
|
||||
|
||||
async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
                                  username: str, slug: str) -> Optional[Dict]:
    """Fetch a single Soundgasm audio detail page and extract metadata.

    Title and description are taken from the page markup and have their
    HTML entities decoded (e.g. ``&amp;`` becomes ``&``).

    Args:
        session: Open aiohttp session to issue the request on.
        username: Profile owner.
        slug: Audio slug within the profile.

    Returns:
        Dict with ``title`` (falls back to the slug), ``description``
        (None when absent) and ``audio_url``; None when the page does not
        answer with HTTP 200 or contains no audio URL.
    """
    # Local import: the bare name ``html`` is avoided as a variable below.
    from html import unescape

    url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'

    async with session.get(url, headers=self.HEADERS) as resp:
        if resp.status != 200:
            return None
        page = await resp.text()

    # Audio URL: m4a: "https://..." — without it there is nothing to
    # download, so skip the rest of the parsing.
    audio_match = re.search(r'm4a:\s*"([^"]+)"', page)
    audio_url = audio_match.group(1) if audio_match else None
    if not audio_url:
        return None

    # Title: <div aria-label="title"...>Title Text</div>
    # or from the page title tag
    title = None
    title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', page)
    if title_match:
        title = unescape(title_match.group(1)).strip()
    if not title:
        title_match = re.search(r'<title>([^<]+)</title>', page, re.IGNORECASE)
        if title_match:
            title = unescape(title_match.group(1)).strip()
            # Remove " - Soundgasm" suffix if present
            title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()

    # Description: <div class="jp-description">...</div>
    description = None
    desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', page, re.DOTALL)
    if desc_match:
        text = re.sub(r'<br\s*/?>', '\n', desc_match.group(1))
        text = re.sub(r'<[^>]+>', '', text)
        # Decode entities only after tags are stripped, so an escaped
        # "&lt;b&gt;" is kept as literal text rather than removed as a tag.
        description = unescape(text).strip()

    return {
        'title': title or slug,
        'description': description,
        'audio_url': audio_url,
    }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Liltsome archive
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _ensure_liltsome_cache(self) -> bool:
    """Download/refresh the Liltsome library.json using ETag-based invalidation.

    A HEAD request fetches the remote ETag; when it matches the stored one
    and the cache file exists, nothing is downloaded. Otherwise the library
    is streamed into a sibling ``.part`` file and moved over the cache file
    only after the download has completed, so an interrupted download
    cannot replace a good cache with a truncated one.

    Returns True if cache is available (fresh or existing), False otherwise.
    """
    etag_file = self.LILTSOME_ETAG_PATH
    cache_file = self.LILTSOME_CACHE_PATH
    partial_file = cache_file.with_name(cache_file.name + '.part')

    stored_etag = None
    if etag_file.exists():
        try:
            stored_etag = etag_file.read_text().strip()
        except Exception as e:
            # An unreadable ETag only costs a re-download.
            self.log(f"Could not read stored Liltsome ETag: {e}", 'debug')

    timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # HEAD request to check ETag
            async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
                    return cache_file.exists()

                remote_etag = resp.headers.get('ETag', '').strip()

            if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
                self.log("Liltsome cache is fresh (ETag match)", 'debug')
                return True

            # Download the full library
            self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
            async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome GET returned {resp.status}", 'warning')
                    return cache_file.exists()

                cache_file.parent.mkdir(parents=True, exist_ok=True)
                async with aiofiles.open(str(partial_file), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(262144):
                        await f.write(chunk)

                new_etag = resp.headers.get('ETag', remote_etag or '').strip()

        # Swap in the complete download, then record its ETag.
        partial_file.replace(cache_file)
        if new_etag:
            etag_file.write_text(new_etag)

        self.log("Liltsome library.json downloaded successfully", 'info')
        self._liltsome_data = None  # force re-parse
        return True

    except Exception as e:
        # Best-effort cleanup of the incomplete download.
        try:
            partial_file.unlink()
        except OSError:
            pass
        self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
        return cache_file.exists()
|
||||
|
||||
async def _load_liltsome_data(self) -> Optional[Dict]:
|
||||
"""Load and cache the Liltsome library data in memory."""
|
||||
if self._liltsome_data is not None:
|
||||
return self._liltsome_data
|
||||
|
||||
cache_file = self.LILTSOME_CACHE_PATH
|
||||
if not cache_file.exists():
|
||||
return None
|
||||
|
||||
try:
|
||||
data = await asyncio.to_thread(self._read_liltsome_json, cache_file)
|
||||
self._liltsome_data = data
|
||||
return data
|
||||
except Exception as e:
|
||||
self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _read_liltsome_json(path: Path) -> Dict:
|
||||
"""Read and parse the Liltsome JSON file (blocking, run in thread)."""
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
|
||||
"""Find artist entries in Liltsome data by username (case-insensitive).
|
||||
|
||||
library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}
|
||||
"""
|
||||
await self._ensure_liltsome_cache()
|
||||
data = await self._load_liltsome_data()
|
||||
if not data:
|
||||
return None
|
||||
|
||||
username_lower = username.lower()
|
||||
|
||||
# Top-level is {"artists": [...]}
|
||||
artists = data.get('artists', []) if isinstance(data, dict) else data
|
||||
|
||||
for artist in artists:
|
||||
artist_id = str(artist.get('id', '')).lower()
|
||||
artist_name = str(artist.get('name', '')).lower()
|
||||
if artist_id == username_lower or artist_name == username_lower:
|
||||
# Audio entries are in files.audio
|
||||
files = artist.get('files', {})
|
||||
if isinstance(files, dict):
|
||||
return files.get('audio', [])
|
||||
return []
|
||||
|
||||
return None
|
||||
|
||||
async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Convert Liltsome archive entries to Post objects.

    Args:
        username: Artist to look up in the Liltsome library.
        seen_ids: Post ids already collected; entries whose generated id is
            in this set are skipped. The set is not modified here.

    Returns:
        One Post (with a single audio Attachment) per new entry, or an
        empty list when the artist has no entries.
    """
    entries = await self._get_liltsome_entries(username)
    if not entries:
        return []

    posts: List[Post] = []
    for entry in entries:
        filename = entry.get('filename', '')
        path = entry.get('path', '')
        title_raw = entry.get('title', filename)
        entry_tags = entry.get('tags') or []  # tolerate "tags": null
        duration = None
        file_size = entry.get('size')

        if isinstance(entry.get('metadata'), dict):
            duration = entry['metadata'].get('duration')

        # Build post_id: prefix with liltsome- to avoid collision
        sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
        post_id = f'liltsome-{sanitized_name}'

        if post_id in seen_ids:
            continue

        # Parse bracket tags from title for clean_title
        clean_title, title_tags = parse_bracket_tags(title_raw)

        # Merge: use Liltsome's pre-parsed tags + any extra from title,
        # de-duplicated while keeping first-seen order.
        all_tags_set: Set[str] = set()
        all_tags: List[str] = []
        for t in entry_tags:
            t_lower = t.strip().lower()
            if t_lower and t_lower not in all_tags_set:
                all_tags_set.add(t_lower)
                all_tags.append(t_lower)
        for t in title_tags:
            if t not in all_tags_set:
                all_tags_set.add(t)
                all_tags.append(t)

        # Build download URL
        download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None

        # Determine extension (lower-cased; defaults to m4a)
        ext = 'm4a'
        if filename and '.' in filename:
            ext = filename.rsplit('.', 1)[1].lower()
        elif path and '.' in path:
            ext = path.rsplit('.', 1)[1].lower()

        # Pick the attachment name without doubling the extension. The
        # comparison is case-insensitive because `ext` was lower-cased above
        # ("Foo.MP3" must not become "Foo.MP3.mp3"), and the sanitized name
        # is checked too because it already carries the extension when it
        # was derived from `path`.
        dotted_ext = f'.{ext}'
        if filename.lower().endswith(dotted_ext):
            attachment_name = filename
        elif sanitized_name.lower().endswith(dotted_ext):
            attachment_name = sanitized_name
        else:
            attachment_name = f"{sanitized_name}{dotted_ext}"

        attachment = Attachment(
            name=attachment_name,
            file_type='audio',
            extension=ext,
            server_path=path or filename,
            download_url=download_url,
            file_size=file_size,
            duration=duration,
        )

        post = Post(
            post_id=post_id,
            service_id='soundgasm',
            platform='soundgasm',
            creator_id=username,
            title=clean_title or None,
            content=None,
            published_at=None,
            attachments=[attachment],
            auto_tags=all_tags,
        )
        posts.append(post)

    return posts
|
||||
827
modules/paid_content/tiktok_client.py
Normal file
827
modules/paid_content/tiktok_client.py
Normal file
@@ -0,0 +1,827 @@
|
||||
"""
|
||||
TikTok Client for Paid Content - Uses yt-dlp for listing and gallery-dl for downloading
|
||||
|
||||
Adapts the hybrid approach from modules/tiktok_module.py into the paid content client pattern.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html as html_module
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class TikTokClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching TikTok creator information and videos.
|
||||
|
||||
Uses yt-dlp for listing (fast flat-playlist) and gallery-dl for downloading
|
||||
(handles carousels/slideshows properly).
|
||||
"""
|
||||
|
||||
SERVICE_ID = 'tiktok'
|
||||
PLATFORM = 'tiktok'
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
|
||||
self._init_logger('PaidContent', log_callback, default_module='TikTok')
|
||||
|
||||
self.ytdlp_path = self._find_executable('yt-dlp')
|
||||
self.gallery_dl_path = self._find_executable('gallery-dl')
|
||||
self.unified_db = unified_db
|
||||
self._cookies_file = None
|
||||
self._last_pinned_posts = {}
|
||||
|
||||
if not self.ytdlp_path:
|
||||
self.log("yt-dlp not found, TikTok listing will be disabled", 'warning')
|
||||
if not self.gallery_dl_path:
|
||||
self.log("gallery-dl not found, TikTok downloading will be disabled", 'warning')
|
||||
|
||||
def _find_executable(self, name: str) -> Optional[str]:
|
||||
"""Find an executable by name"""
|
||||
common_paths = [
|
||||
f'/opt/media-downloader/venv/bin/{name}',
|
||||
f'/usr/local/bin/{name}',
|
||||
f'/usr/bin/{name}',
|
||||
f'/opt/homebrew/bin/{name}',
|
||||
os.path.expanduser(f'~/.local/bin/{name}'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', name], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
    """Return True only when both yt-dlp and gallery-dl were located."""
    if self.ytdlp_path is None:
        return False
    return self.gallery_dl_path is not None
|
||||
|
||||
def cleanup(self):
    """Delete the temporary cookies file, if one currently exists."""
    cookie_path = self._cookies_file
    if not cookie_path or not os.path.exists(cookie_path):
        return
    try:
        os.unlink(cookie_path)
    except Exception:
        # Best-effort removal: a failure here is deliberately ignored.
        pass
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get path to cookies file, creating from database if needed."""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
return self._cookies_file
|
||||
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
# Check for tiktok scraper cookies
|
||||
for scraper_id in ('tiktok', 'tiktok_client'):
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies_list = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies_list = data
|
||||
else:
|
||||
cookies_list = []
|
||||
|
||||
if cookies_list:
|
||||
import tempfile
|
||||
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='tiktok_cookies_')
|
||||
with os.fdopen(fd, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
for cookie in cookies_list:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expiry = str(int(cookie.get('expirationDate', 0)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
|
||||
self.log(f"Loaded {len(cookies_list)} TikTok cookies", 'debug')
|
||||
return self._cookies_file
|
||||
except Exception as e:
|
||||
self.log(f"Could not load TikTok cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _save_cookies_back(self):
|
||||
"""Read updated cookies from temp file and save back to database.
|
||||
yt-dlp and gallery-dl update the cookies file with refreshed tokens
|
||||
from TikTok (e.g. msToken), so we need to persist those changes."""
|
||||
if not self._cookies_file or not os.path.exists(self._cookies_file):
|
||||
return
|
||||
if not self.unified_db:
|
||||
return
|
||||
|
||||
try:
|
||||
import http.cookiejar
|
||||
jar = http.cookiejar.MozillaCookieJar(self._cookies_file)
|
||||
jar.load(ignore_discard=True, ignore_expires=True)
|
||||
|
||||
updated_cookies = []
|
||||
for cookie in jar:
|
||||
updated_cookies.append({
|
||||
'name': cookie.name,
|
||||
'value': cookie.value,
|
||||
'domain': cookie.domain,
|
||||
'path': cookie.path,
|
||||
'secure': cookie.secure,
|
||||
'expirationDate': cookie.expires or 0,
|
||||
})
|
||||
|
||||
if not updated_cookies:
|
||||
return
|
||||
|
||||
# Merge updated cookies back to DB
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('tiktok',))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row and row[0]:
|
||||
existing_data = json.loads(row[0])
|
||||
existing_cookies = existing_data if isinstance(existing_data, list) else existing_data.get('cookies', [])
|
||||
# Merge: updated cookies override existing by name+domain
|
||||
cookie_map = {(c.get('name'), c.get('domain')): c for c in existing_cookies}
|
||||
for c in updated_cookies:
|
||||
cookie_map[(c['name'], c['domain'])] = c
|
||||
final_cookies = list(cookie_map.values())
|
||||
else:
|
||||
final_cookies = updated_cookies
|
||||
|
||||
self.unified_db.save_scraper_cookies('tiktok', final_cookies, merge=False)
|
||||
self.log(f"Saved {len(final_cookies)} refreshed cookies back to DB", 'debug')
|
||||
|
||||
# Clear cached file so next use gets fresh cookies from DB
|
||||
self._cookies_file = None
|
||||
except Exception as e:
|
||||
self.log(f"Failed to save cookies back: {e}", 'debug')
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available."""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
@staticmethod
|
||||
def extract_username(url: str) -> Optional[str]:
|
||||
"""Extract username from TikTok URL"""
|
||||
match = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_creator_url(username: str) -> str:
|
||||
"""Convert username to a consistent URL format"""
|
||||
if username.startswith('http://') or username.startswith('https://'):
|
||||
return username
|
||||
username = username.lstrip('@')
|
||||
return f"https://www.tiktok.com/@{username}"
|
||||
|
||||
async def _resolve_channel_id(self, username: str) -> Optional[str]:
    """Resolve a TikTok username to a channel_id (secUid).

    When yt-dlp can't extract the secondary user ID from the profile page,
    we try to find a video URL for the user (from TikTok's oEmbed response,
    then from previously stored attachments) and extract the channel_id
    (secUid) from that video's metadata via yt-dlp.

    Args:
        username: TikTok handle without the leading ``@``.

    Returns:
        The ``channel_id`` (or, failing that, ``playlist_id``) reported by
        yt-dlp for the video, or ``None`` when yt-dlp is unavailable, no
        video URL could be found, or any step failed (logged at 'debug').
    """
    if not self.ytdlp_path:
        return None

    try:
        # Step 1: Get a video URL from this user via the oembed embed HTML
        video_url = None
        async with aiohttp.ClientSession() as session:
            # The oembed HTML often contains a video ID we can use
            oembed_url = f"https://www.tiktok.com/oembed?url=https://www.tiktok.com/@{username}"
            async with session.get(oembed_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    embed_html = data.get('html', '')
                    # Extract video URL from embed iframe: prefer a full
                    # cite="" URL, else rebuild one from data-video-id.
                    match = re.search(r'cite="(https://www\.tiktok\.com/@[^"]+/video/\d+)"', embed_html)
                    if not match:
                        match = re.search(r'data-video-id="(\d+)"', embed_html)
                        if match:
                            video_url = f"https://www.tiktok.com/@{username}/video/{match.group(1)}"
                    else:
                        video_url = match.group(1)

                    if not video_url:
                        # oembed thumbnail_url sometimes contains the video ID
                        thumb = data.get('thumbnail_url', '')
                        vid_match = re.search(r'/video/(\d+)', thumb)
                        if vid_match:
                            video_url = f"https://www.tiktok.com/@{username}/video/{vid_match.group(1)}"

        if not video_url:
            # Step 1b: Check if we have any existing video URLs in the database
            if self.unified_db:
                try:
                    with self.unified_db.get_connection() as conn:
                        cursor = conn.cursor()
                        cursor.execute("""
                            SELECT a.download_url FROM paid_content_attachments a
                            JOIN paid_content_posts p ON a.post_id = p.id
                            JOIN paid_content_creators c ON p.creator_id = c.id
                            WHERE c.username = ? AND a.download_url LIKE '%tiktok.com%'
                            LIMIT 1
                        """, (username,))
                        row = cursor.fetchone()
                        if row and row[0]:
                            video_url = row[0]
                except Exception:
                    # Best-effort lookup: a DB failure just means no fallback URL.
                    pass

        if not video_url:
            self.log(f"No video URL found for @{username} to resolve channel_id", 'debug')
            return None

        # Step 2: Use yt-dlp to get the channel_id from the single video
        self.log(f"Resolving channel_id from video: {video_url}", 'debug')
        cmd = self._get_base_cmd() + [
            '-j',
            '--no-warnings',
            '--no-download',
            '--socket-timeout', '30',
            video_url
        ]

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        if result.returncode == 0:
            # yt-dlp -j prints one JSON document per line; use the first
            # line that parses and carries an id.
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line.strip():
                    continue
                try:
                    video_data = json.loads(line)
                    channel_id = video_data.get('channel_id') or video_data.get('playlist_id')
                    if channel_id:
                        self.log(f"Resolved @{username} channel_id: {channel_id[:30]}...", 'info')
                        return channel_id
                except json.JSONDecodeError:
                    continue

    except Exception as e:
        self.log(f"Failed to resolve channel_id for @{username}: {e}", 'debug')

    return None
|
||||
|
||||
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """Get creator information using yt-dlp + profile page scraping.

    The display name comes from yt-dlp metadata of the creator's first
    listed video. If yt-dlp fails with a user-ID extraction error, the
    lookup is retried through the ``tiktokuser:<channel_id>`` scheme.
    Avatar and bio come from the scraped profile page, whose display name
    is used only when yt-dlp produced nothing better than the username.

    Args:
        url: TikTok profile (or video) URL containing ``@username``.

    Returns:
        Dict with keys ``creator_id``, ``creator_name``, ``creator_url``,
        ``profile_image_url`` and ``bio``, or ``None`` when no username can
        be extracted from *url*.
    """
    username = self.extract_username(url)
    if not username:
        return None

    profile_url = self.normalize_creator_url(username)
    creator_name = username

    def name_from_output(raw: bytes) -> Optional[str]:
        """Return the display name from the first parseable JSON line, or None if no line parses."""
        for line in raw.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            return (data.get('channel') or data.get('uploader')
                    or data.get('playlist_title') or username)
        return None

    async def probe(target: str):
        """Run yt-dlp on the first playlist item of *target*; return (returncode, stdout, stderr)."""
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--playlist-items', '1',
            '--socket-timeout', '30',
            target
        ]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        out, err = await proc.communicate()
        return proc.returncode, out, err

    # Try yt-dlp for display name from video metadata
    if self.ytdlp_path:
        try:
            returncode, stdout, stderr = await probe(profile_url)
            if returncode == 0:
                creator_name = name_from_output(stdout) or creator_name
            else:
                # Fallback: try tiktokuser: scheme if secondary user ID extraction fails
                err_text = stderr.decode('utf-8', errors='replace')
                if 'secondary user ID' in err_text or 'Unable to extract' in err_text:
                    channel_id = await self._resolve_channel_id(username)
                    if channel_id:
                        fb_code, fb_stdout, _ = await probe(f"tiktokuser:{channel_id}")
                        if fb_code == 0:
                            creator_name = name_from_output(fb_stdout) or creator_name
        except Exception as e:
            self.log(f"Failed to get creator info via yt-dlp: {e}", 'debug')

    # Scrape profile page for avatar and bio
    profile_image = None
    bio = None
    try:
        profile_image, bio, page_name = await self._scrape_profile_page(profile_url)
        # Only fall back to the page's name when yt-dlp gave nothing better.
        if page_name and creator_name == username:
            creator_name = page_name
    except Exception as e:
        self.log(f"Failed to scrape profile page: {e}", 'debug')

    return {
        'creator_id': username,
        'creator_name': creator_name,
        'creator_url': profile_url,
        'profile_image_url': profile_image,
        'bio': bio,
    }
|
||||
|
||||
async def _fetch_profile_with_cookies(self, url: str) -> Optional[str]:
|
||||
"""Fetch TikTok profile page using curl_cffi with cookies from database."""
|
||||
cookies_file = self._get_cookies_file()
|
||||
if not cookies_file:
|
||||
return None
|
||||
|
||||
try:
|
||||
from curl_cffi import requests as cf_requests
|
||||
import http.cookiejar
|
||||
|
||||
# Load cookies from the Netscape file
|
||||
jar = http.cookiejar.MozillaCookieJar(cookies_file)
|
||||
jar.load(ignore_discard=True, ignore_expires=True)
|
||||
|
||||
# Try multiple browser versions for curl_cffi compatibility
|
||||
for _browser in ("chrome136", "chrome131", "chrome"):
|
||||
try:
|
||||
session = cf_requests.Session(impersonate=_browser)
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
else:
|
||||
session = cf_requests.Session()
|
||||
for cookie in jar:
|
||||
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain)
|
||||
|
||||
resp = session.get(url, timeout=15)
|
||||
if resp.status_code == 200 and 'avatarLarger' in resp.text:
|
||||
self.log("Fetched TikTok profile with cookies (curl_cffi)", 'debug')
|
||||
return resp.text
|
||||
elif 'captcha' in resp.text.lower():
|
||||
self.log("TikTok profile still returned captcha with cookies", 'debug')
|
||||
session.close()
|
||||
except Exception as e:
|
||||
self.log(f"curl_cffi profile fetch failed: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
async def _scrape_profile_page(self, url: str) -> tuple:
    """
    Scrape TikTok profile page for avatar and bio from embedded JSON data.
    TikTok embeds user data in __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag.
    Returns (profile_image_url, bio, display_name).

    Each element is None when it could not be found. The page is fetched
    with aiohttp first; if that yields nothing or what looks like a captcha
    page, a cookie-backed fetch is tried. As a side effect, pinned post IDs
    found in the embedded JSON replace ``self._last_pinned_posts``.
    Timeouts and other errors are logged at 'debug' and whatever was
    extracted so far is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    profile_image = None
    bio = None
    display_name = None

    try:
        page_html = None
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status == 200:
                    page_html = await resp.text()

        # If we got a captcha page, try curl_cffi with cookies
        if not page_html or ('captcha' in page_html.lower() and 'avatarLarger' not in page_html):
            page_html = await self._fetch_profile_with_cookies(url)
            if not page_html:
                return (None, None, None)

        # Try structured JSON first (__UNIVERSAL_DATA_FOR_REHYDRATION__)
        rehydration_match = re.search(
            r'<script[^>]*id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
            page_html, re.DOTALL
        )
        if rehydration_match:
            try:
                rdata = json.loads(rehydration_match.group(1))
                user_detail = (rdata.get('__DEFAULT_SCOPE__', {})
                               .get('webapp.user-detail', {}))
                user = user_detail.get('userInfo', {}).get('user', {})
                if user:
                    avatar_val = user.get('avatarLarger') or user.get('avatarMedium')
                    # Avatars ending in .mp4 are not accepted as a profile image.
                    if avatar_val and not avatar_val.endswith('.mp4'):
                        profile_image = avatar_val
                        self.log("Found TikTok profile avatar (rehydration)", 'debug')
                    sig_val = user.get('signature', '')
                    if sig_val and sig_val.strip():
                        bio = sig_val.strip()
                        self.log("Found TikTok bio (rehydration)", 'debug')
                    nick_val = user.get('nickname')
                    if nick_val:
                        display_name = nick_val
                        self.log(f"Found TikTok display name (rehydration): {display_name}", 'debug')

                # Extract pinned post IDs
                # The stored mapping is only replaced when this page lists
                # pinned posts; otherwise the previous value is left as is.
                pinned_list = user_detail.get('pinnedList', [])
                if pinned_list:
                    self._last_pinned_posts = {}
                    for item in pinned_list:
                        vid = str(item.get('id', ''))
                        if vid:
                            self._last_pinned_posts[vid] = {'pinned_at': None}
                    if self._last_pinned_posts:
                        self.log(f"Found {len(self._last_pinned_posts)} pinned TikTok posts", 'debug')
            except (json.JSONDecodeError, KeyError):
                # Unparseable embedded JSON: fall through to the regex fallbacks.
                pass

        # Fallback: regex extraction from raw HTML
        # Use json.loads to decode values (handles \uXXXX, surrogate pairs, and raw UTF-8)
        if not profile_image:
            avatar_match = re.search(r'"avatarLarger":"([^"]+)"', page_html)
            if not avatar_match:
                avatar_match = re.search(r'"avatarMedium":"([^"]+)"', page_html)
            if avatar_match:
                try:
                    avatar_url = json.loads(f'"{avatar_match.group(1)}"')
                except (json.JSONDecodeError, ValueError):
                    avatar_url = avatar_match.group(1)
                if avatar_url and not avatar_url.endswith('.mp4'):
                    profile_image = avatar_url
                    self.log("Found TikTok profile avatar", 'debug')

        if not bio:
            sig_match = re.search(r'"signature":"([^"]*)"', page_html)
            if sig_match:
                try:
                    raw_bio = json.loads(f'"{sig_match.group(1)}"')
                except (json.JSONDecodeError, ValueError):
                    raw_bio = sig_match.group(1)
                if raw_bio and raw_bio.strip():
                    bio = raw_bio.strip()
                    self.log("Found TikTok bio", 'debug')

        if not display_name:
            nick_match = re.search(r'"nickname":"([^"]+)"', page_html)
            if nick_match:
                try:
                    display_name = json.loads(f'"{nick_match.group(1)}"')
                except (json.JSONDecodeError, ValueError):
                    display_name = nick_match.group(1)
                self.log(f"Found TikTok display name: {display_name}", 'debug')

        # Extract banner/cover from "coverLarger" field
        # (stored separately, not returned here but could be used later)

    except asyncio.TimeoutError:
        self.log("TikTok profile page request timed out", 'debug')
    except Exception as e:
        self.log(f"Error scraping TikTok profile: {e}", 'debug')

    return (profile_image, bio, display_name)
|
||||
|
||||
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """
    Get all videos from a TikTok profile using yt-dlp --flat-playlist -j.

    Uses JSON output to properly handle multi-line descriptions/titles.
    Returns list of video metadata dicts with video_id and upload_date.

    Args:
        url: Profile URL containing ``@username``.
        since_date: Optional cutoff, either an ISO timestamp (contains 'T')
            or a string starting with ``YYYY-MM-DD``; entries with an
            earlier upload date are dropped. An unparseable value disables
            the filter.
        max_videos: Optional cap on the number of returned entries.
        progress_callback: Optional callable invoked with the running count
            after each accepted entry.

    Returns:
        List of dicts with keys ``video_id``, ``title``, ``description``,
        ``upload_date`` (``YYYY-MM-DD`` or None), ``url`` and ``username``.
        An empty list is returned when yt-dlp is missing, the username
        cannot be extracted, listing fails, or an error occurs.
    """
    if not self.ytdlp_path:
        return []

    username = self.extract_username(url)
    if not username:
        return []

    profile_url = self.normalize_creator_url(username)

    try:
        # Use yt-dlp flat-playlist with JSON output for full metadata
        cmd = self._get_base_cmd() + [
            '--flat-playlist',
            '-j',
            '--no-warnings',
            '--socket-timeout', '30',
            profile_url
        ]

        self.log(f"Fetching TikTok videos for @{username}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')

            # Fallback: if yt-dlp can't extract secondary user ID, try tiktokuser: scheme
            if 'secondary user ID' in error or 'Unable to extract' in error:
                self.log(f"yt-dlp can't extract user ID for @{username}, trying channel_id fallback", 'info')
                channel_id = await self._resolve_channel_id(username)
                if channel_id:
                    fallback_cmd = self._get_base_cmd() + [
                        '--flat-playlist',
                        '-j',
                        '--no-warnings',
                        '--socket-timeout', '30',
                        f"tiktokuser:{channel_id}"
                    ]
                    fb_result = await asyncio.create_subprocess_exec(
                        *fallback_cmd,
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE
                    )
                    # Rebinds stdout so the parsing below consumes the
                    # fallback's output.
                    stdout, stderr = await fb_result.communicate()
                    if fb_result.returncode == 0:
                        self.log(f"Fallback tiktokuser: succeeded for @{username}", 'info')
                    else:
                        fb_error = stderr.decode('utf-8', errors='replace')
                        self.log(f"Fallback also failed for @{username}: {fb_error}", 'warning')
                        return []
                else:
                    self.log(f"Could not resolve channel_id for @{username}", 'warning')
                    return []
            else:
                self.log(f"Failed to list TikTok videos: {error}", 'warning')
                return []

        lines = stdout.decode('utf-8', errors='replace').strip().split('\n')

        # Parse since_date for filtering
        # The cutoff is kept as a YYYYMMDD string so it can be compared
        # directly with yt-dlp's upload_date strings.
        cutoff_str = None
        if since_date:
            try:
                if 'T' in since_date:
                    cutoff_dt = datetime.fromisoformat(since_date.replace('Z', '+00:00').replace('+00:00', ''))
                else:
                    cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
                cutoff_str = cutoff_dt.strftime('%Y%m%d')
            except (ValueError, IndexError):
                pass

        videos = []
        for line in lines:
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            video_id = str(data.get('id', ''))
            if not video_id:
                continue

            upload_date = data.get('upload_date', '')
            title = data.get('title', '')
            description = data.get('description', '')

            # Skip posts where yt-dlp returned no metadata at all
            # When cookies are expired, yt-dlp returns no date, no title,
            # and no description. Real posts with empty captions still have
            # upload_date, so we use that as the key signal.
            if not upload_date and not title and not description:
                self.log(f"Skipping TikTok {video_id}: no metadata (cookies may be expired)", 'debug')
                continue

            title = title or description or f"TikTok video #{video_id}"
            description = description or title

            # Filter by date if cutoff specified
            # (entries without an upload_date are never filtered out)
            if cutoff_str and upload_date and upload_date < cutoff_str:
                continue

            # Format upload_date to ISO
            formatted_date = None
            if upload_date and len(upload_date) == 8 and upload_date.isdigit():
                formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"

            video_url = data.get('url') or f"https://www.tiktok.com/@{username}/video/{video_id}"

            videos.append({
                'video_id': video_id,
                'title': title,
                'description': description,
                'upload_date': formatted_date,
                'url': video_url,
                'username': username,
            })

            if progress_callback:
                progress_callback(len(videos))

            if max_videos and len(videos) >= max_videos:
                break

        self.log(f"Found {len(videos)} TikTok videos for @{username}", 'info')
        # Persist any cookies yt-dlp refreshed during the listing.
        self._save_cookies_back()
        return videos

    except Exception as e:
        self.log(f"Error getting TikTok videos: {e}", 'error')
        self._save_cookies_back()
        return []
|
||||
|
||||
async def download_video(self, video_url: str, output_dir: Path, username: str = '') -> Dict:
    """
    Download a TikTok video/carousel using gallery-dl.

    gallery-dl handles both regular videos and carousel/slideshow posts.
    New files are detected by comparing the directory contents before and
    after the run; ``.json`` metadata and audio-only files are ignored.

    Args:
        video_url: URL of the TikTok post.
        output_dir: Directory to download into (created if missing).
        username: Accepted for interface compatibility; not used here.

    Returns:
        On success: dict with ``success=True``, ``file_path``, ``filename``
        and ``file_size`` of the first file by name, ``all_files``,
        ``file_count`` and ``is_carousel``. On failure:
        ``{'success': False, 'error': <message>}``.
    """
    if not self.gallery_dl_path:
        return {'success': False, 'error': 'gallery-dl not available'}

    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        cmd = [
            self.gallery_dl_path,
            '--write-metadata',
            '-D', str(output_dir),
            '-f', '{id}_{num}.{extension}',
        ]

        # Add cookies for age-restricted / login-required content
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])

        cmd.append(video_url)

        self.log(f"Downloading TikTok: {video_url}", 'debug')

        # Snapshot existing files before download so we only pick up new ones
        existing_files = {f.name for f in output_dir.iterdir() if f.is_file()}

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        # Find newly downloaded files (exclude .json metadata and audio-only files)
        skipped_suffixes = ('.json', '.mp3', '.m4a', '.aac', '.wav', '.ogg')
        downloaded_files = [
            f for f in output_dir.iterdir()
            if f.is_file()
            and f.name not in existing_files
            and f.suffix.lower() not in skipped_suffixes
        ]

        if result.returncode != 0:
            # gallery-dl exit code 4 = partial failure (e.g. slideshow images OK but audio failed)
            # If we got media files, treat as success
            if downloaded_files:
                self.log(f"gallery-dl partial failure (code {result.returncode}) but {len(downloaded_files)} files downloaded", 'debug')
            else:
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'not available' in error_msg.lower() or '404' in error_msg:
                    error_msg = 'Video not available (deleted or private)'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'
                return {'success': False, 'error': error_msg}

        if not downloaded_files:
            return {'success': False, 'error': 'No files downloaded'}

        # Sort by name to maintain carousel order (e.g. id_1.jpg, id_2.jpg)
        downloaded_files.sort(key=lambda f: f.name)
        primary_file = downloaded_files[0]

        # Determine if this is a photo carousel (multiple images)
        image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
        is_carousel = len(downloaded_files) > 1 and all(
            f.suffix.lower() in image_exts for f in downloaded_files
        )

        return {
            'success': True,
            'file_path': str(primary_file),
            'filename': primary_file.name,
            'file_size': primary_file.stat().st_size,
            'all_files': [str(f) for f in downloaded_files],
            'file_count': len(downloaded_files),
            'is_carousel': is_carousel,
        }

    except Exception as e:
        self.log(f"Error downloading TikTok video: {e}", 'error')
        return {'success': False, 'error': str(e)}
    finally:
        # Persist cookies refreshed by gallery-dl on EVERY exit path. The
        # failure returns above previously skipped this step, unlike the
        # success and exception paths.
        self._save_cookies_back()
|
||||
|
||||
async def get_creator(self, url: str) -> Optional[Creator]:
    """Build a Creator record for the TikTok profile at ``url``.

    Returns None when ``get_creator_info`` yields nothing for the URL.
    """
    profile = await self.get_creator_info(url)
    if profile:
        handle = profile.get('creator_id', '')
        # creator_name doubles as username and display name; the username
        # falls back to the creator_id when no name was reported.
        creator_fields = {
            'creator_id': handle,
            'service_id': 'tiktok',
            'platform': 'tiktok',
            'username': profile.get('creator_name', handle),
            'display_name': profile.get('creator_name'),
            'profile_image_url': profile.get('profile_image_url'),
            'bio': profile.get('bio'),
        }
        return Creator(**creator_fields)
    return None
|
||||
|
||||
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """Return the creator's TikTok videos wrapped as Post objects.

    Every video becomes one Post carrying a single placeholder ``.mp4``
    attachment that points at the video URL; whether the item is really a
    video or a photo carousel is only known once it is downloaded.
    """
    entries = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
    owner = self.extract_username(url) or ''

    def build_post(entry):
        media = Attachment(
            name=f"{entry['video_id']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=entry['url'],
            download_url=entry['url'],
        )
        return Post(
            post_id=entry['video_id'],
            service_id='tiktok',
            platform='tiktok',
            creator_id=owner,
            title=None,
            content=entry.get('description') or entry.get('title', ''),
            published_at=entry.get('upload_date'),
            attachments=[media],
        )

    return [build_post(entry) for entry in entries]
|
||||
751
modules/paid_content/twitch_client.py
Normal file
751
modules/paid_content/twitch_client.py
Normal file
@@ -0,0 +1,751 @@
|
||||
"""
|
||||
Twitch Clips Client - Fetches channel clips using yt-dlp
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class TwitchThumbnailCache:
    """Cache for Twitch clip thumbnails.

    Thumbnails are stored as files inside ``cache_dir``. The file name is the
    MD5 hex digest of the thumbnail URL plus an extension guessed from the
    URL text (``.png`` / ``.webp``, otherwise ``.jpg``).
    """

    def __init__(self, cache_dir: str = None):
        """Create the cache directory (and any missing parents) if needed.

        Args:
            cache_dir: Directory for cached files; a fixed default path is
                used when omitted or empty.
        """
        self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, thumbnail_url: str) -> Path:
        """Get local cache path for a thumbnail URL."""
        # The hash is only used to derive a stable file name from the URL.
        url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest()
        lowered = thumbnail_url.lower()
        # Extract extension from URL or default to jpg
        ext = '.jpg'
        if '.png' in lowered:
            ext = '.png'
        elif '.webp' in lowered:
            ext = '.webp'
        return self.cache_dir / f"{url_hash}{ext}"

    def get_cached(self, thumbnail_url: str) -> Optional[str]:
        """Get cached thumbnail path if it exists, else None."""
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        return None

    async def cache_thumbnail(self, thumbnail_url: str,
                              session: Optional["aiohttp.ClientSession"] = None) -> Optional[str]:
        """Download and cache a thumbnail, return local path.

        Args:
            thumbnail_url: URL of the thumbnail; falsy values yield None.
            session: Optional aiohttp session to reuse. When omitted a
                temporary session is created and closed again.

        Returns:
            Path of the cached file as a string, or None when the URL is
            empty, the response is not HTTP 200, the body is empty, or any
            error occurs (the cache is best-effort and never raises).
        """
        if not thumbnail_url:
            return None

        # Check if already cached
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)

        owns_session = session is None
        try:
            if owns_session:
                session = aiohttp.ClientSession()

            try:
                async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                    if resp.status != 200:
                        return None
                    content = await resp.read()
            finally:
                if owns_session:
                    await session.close()

            # An empty body would otherwise be cached as a valid thumbnail.
            if not content:
                return None

            # Write to a sibling temp file and rename it into place, so an
            # interrupted write can never leave a truncated file that later
            # calls would treat as a cache hit.
            tmp_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.part")
            try:
                tmp_path.write_bytes(content)
                tmp_path.replace(cache_path)
            finally:
                # After a successful rename the temp file no longer exists.
                if tmp_path.exists():
                    tmp_path.unlink()
            return str(cache_path)
        except Exception:
            # Deliberate best-effort: a thumbnail that cannot be fetched or
            # stored is reported as "not cached" rather than failing the caller.
            return None

    async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]:
        """Cache multiple thumbnails in parallel, return url->local_path mapping.

        Args:
            thumbnail_urls: URLs to cache; falsy entries and repeated URLs
                are ignored.
            max_concurrent: Upper bound on simultaneous downloads; values
                below 1 are treated as 1.

        Returns:
            Mapping of every URL that is cached (already or newly) to its
            local file path. URLs that could not be cached are absent.
        """
        result = {}

        # Filter out already cached entries. Repeated URLs are skipped so the
        # same file is never downloaded twice concurrently.
        to_download = []
        seen = set()
        for url in thumbnail_urls:
            if not url or url in seen:
                continue
            seen.add(url)
            cached = self.get_cached(url)
            if cached:
                result[url] = cached
            else:
                to_download.append(url)

        if not to_download:
            return result

        async with aiohttp.ClientSession() as session:
            # Semaphore(0) would block forever and negative values raise.
            semaphore = asyncio.Semaphore(max(1, max_concurrent))

            async def download_one(url: str):
                async with semaphore:
                    path = await self.cache_thumbnail(url, session)
                    if path:
                        result[url] = path

            await asyncio.gather(*[download_one(url) for url in to_download])

        return result
|
||||
|
||||
|
||||
class TwitchClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching Twitch channel clips using yt-dlp
|
||||
|
||||
Supports:
|
||||
- Channel clips URLs (twitch.tv/username/clips)
|
||||
- Fetching channel metadata
|
||||
- Listing all clips from a channel
|
||||
- Downloading clips
|
||||
"""
|
||||
|
||||
# Quality presets for yt-dlp
|
||||
QUALITY_PRESETS = {
|
||||
'best': 'best',
|
||||
'1080p': 'best[height<=1080]',
|
||||
'720p': 'best[height<=720]',
|
||||
'480p': 'best[height<=480]',
|
||||
}
|
||||
|
||||
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None, cache_dir: str = None):
    """Set up logging, locate yt-dlp and prepare the thumbnail cache.

    Args:
        ytdlp_path: Explicit yt-dlp executable; auto-detected when falsy.
        unified_db: Database handle later used to read stored cookies.
        log_callback: Passed through to the logging mixin.
        cache_dir: Directory handed to TwitchThumbnailCache.
    """
    self._init_logger('PaidContent', log_callback, default_module='Twitch')

    # An explicit path wins; otherwise search the usual install locations.
    resolved = ytdlp_path if ytdlp_path else self._find_ytdlp()
    self.ytdlp_path = resolved
    if not resolved:
        self.log("yt-dlp not found, Twitch support will be disabled", 'warning')

    # Cookie file is materialized lazily from the database on first use.
    self.unified_db = unified_db
    self._cookies_file = None

    self.thumbnail_cache = TwitchThumbnailCache(cache_dir)
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
|
||||
"""Find yt-dlp executable"""
|
||||
common_paths = [
|
||||
'/opt/media-downloader/venv/bin/yt-dlp', # Prefer venv version (kept up to date)
|
||||
'/usr/local/bin/yt-dlp',
|
||||
'/usr/bin/yt-dlp',
|
||||
'/opt/homebrew/bin/yt-dlp',
|
||||
os.path.expanduser('~/.local/bin/yt-dlp'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether a yt-dlp executable path is set on this client."""
    executable = self.ytdlp_path
    return executable is not None
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get path to cookies file, creating it from database if needed"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
return self._cookies_file
|
||||
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
# Try twitch-specific cookies first, then fall back to ytdlp
|
||||
for scraper_id in ['twitch', 'ytdlp']:
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
# Support both {"cookies": [...]} and [...] formats
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies_list = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies_list = data
|
||||
else:
|
||||
cookies_list = []
|
||||
|
||||
if cookies_list:
|
||||
# Write cookies to temp file in Netscape format
|
||||
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_')
|
||||
with os.fdopen(fd, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
for cookie in cookies_list:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expiry = str(int(cookie.get('expirationDate', 0)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
|
||||
self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug')
|
||||
return self._cookies_file
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available"""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
def cleanup(self):
    """Remove the temporary cookie file, if any, and forget its path."""
    stale = self._cookies_file
    if stale and os.path.exists(stale):
        try:
            os.unlink(stale)
        except Exception:
            # Best-effort removal; a leftover temp file is not fatal.
            pass
    self._cookies_file = None
|
||||
|
||||
@staticmethod
|
||||
def extract_channel_name(url: str) -> Optional[str]:
|
||||
"""
|
||||
Extract channel name from Twitch URL
|
||||
|
||||
Supports:
|
||||
- twitch.tv/username
|
||||
- twitch.tv/username/clips
|
||||
- m.twitch.tv/username/clips
|
||||
"""
|
||||
patterns = [
|
||||
r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return match.group(1).lower()
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_clips_url(channel_name: str) -> str:
|
||||
"""Convert channel name to clips URL with all-time filter"""
|
||||
return f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all"
|
||||
|
||||
async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]:
    """
    Get channel information and optionally count all clips

    yt-dlp is run in flat-playlist mode against the channel's clips listing.
    The first playlist entry supplies the thumbnail; when ``count_clips`` is
    true a second run prints every clip id so they can be counted.

    Args:
        channel_url: Twitch channel URL
        count_clips: Whether to run the (potentially slow) counting pass

    Returns:
        Dict with channel_id, channel_name, channel_url, clips_url,
        thumbnail and clip_count, or None when yt-dlp is unavailable, the
        URL has no channel name, yt-dlp fails, or no clip metadata is
        produced. clip_count stays 0 when counting is skipped or fails.
    """
    if not self.is_available():
        return None

    channel_name = self.extract_channel_name(channel_url)
    if not channel_name:
        return None

    try:
        clips_url = self.normalize_clips_url(channel_name)

        # First get basic info from first clip
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--playlist-items', '1',
            clips_url
        ]

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            # Decode leniently, like every other decode in this client: a
            # strict decode raises on non-UTF-8 stderr and would replace
            # this warning with the generic error path below.
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get channel info: {error}", 'warning')
            return None

        # The first line that parses as JSON is taken as the clip metadata.
        first_clip_data = None
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                first_clip_data = json.loads(line)
                break
            except json.JSONDecodeError:
                continue

        if not first_clip_data:
            return None

        # Count all clips if requested (this can take a while for channels with many clips)
        clip_count = 0
        if count_clips:
            self.log(f"Counting clips for {channel_name}...", 'debug')
            count_cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '--print', 'id',
                clips_url
            ]

            count_result = await asyncio.create_subprocess_exec(
                *count_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            count_stdout, _ = await count_result.communicate()
            if count_result.returncode == 0:
                # One printed id per non-empty output line.
                id_lines = count_stdout.decode('utf-8', errors='replace').strip().split('\n')
                clip_count = len([entry for entry in id_lines if entry])
                self.log(f"Found {clip_count} clips for {channel_name}", 'info')

        return {
            'channel_id': channel_name,
            'channel_name': channel_name,
            'channel_url': f"https://www.twitch.tv/{channel_name}",
            'clips_url': clips_url,
            'thumbnail': first_clip_data.get('thumbnail'),
            'clip_count': clip_count,
        }

    except Exception as e:
        self.log(f"Error getting channel info: {e}", 'error')
        return None
|
||||
|
||||
async def get_channel_clips(self, channel_url: str, since_date: str = None,
                            max_clips: int = None, progress_callback=None,
                            cache_thumbnails: bool = True) -> List[Dict]:
    """
    Get all clips from a channel

    Args:
        channel_url: Twitch channel URL
        since_date: Only fetch clips created after this date (ISO format)
        max_clips: Maximum number of clips to fetch
        progress_callback: Callback function(count) for progress updates
        cache_thumbnails: Whether to download and cache thumbnails locally

    Returns:
        List of clip metadata dicts; each gains a ``thumbnail_cached`` path
        when its thumbnail was cached. An empty list is returned when
        yt-dlp is unavailable, the URL has no channel name, yt-dlp fails,
        or any error occurs (errors are logged, not raised).
    """
    if not self.is_available():
        return []

    channel_name = self.extract_channel_name(channel_url)
    if not channel_name:
        self.log(f"Could not extract channel name from URL: {channel_url}", 'error')
        return []

    try:
        clips_url = self.normalize_clips_url(channel_name)

        # Use flat-playlist for faster extraction; the fields read below are
        # taken straight from the flat entries.
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            clips_url
        ]

        # Add date filter at yt-dlp level for efficiency
        if since_date:
            try:
                # Uses the module-level ``datetime`` import on purpose.
                # Importing it again inside this function would make the
                # name function-local, and the timestamp parsing further
                # down would raise UnboundLocalError whenever since_date
                # is not given.
                # Convert ISO date to YYYYMMDD format for yt-dlp
                date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                dateafter = date_obj.strftime('%Y%m%d')
                cmd.extend(['--dateafter', dateafter])
                self.log(f"Filtering clips after {dateafter}", 'debug')
            except (ValueError, AttributeError):
                # Unparseable since_date: skip the yt-dlp-level filter; the
                # per-clip comparison below still applies.
                pass

        if max_clips:
            cmd.extend(['--playlist-items', f'1:{max_clips}'])

        self.log(f"Fetching clips from channel: {channel_name}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get channel clips: {error}", 'warning')
            return []

        clips = []
        # yt-dlp prints one JSON object per line; non-JSON lines are skipped.
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                data = json.loads(line)

                clip_id = data.get('id')
                if not clip_id:
                    continue

                # Parse timestamp to ISO format
                timestamp = data.get('timestamp')
                upload_date = data.get('upload_date')
                if timestamp:
                    try:
                        upload_date = datetime.fromtimestamp(timestamp).isoformat()
                    except (ValueError, OSError):
                        pass
                elif upload_date:
                    # Convert YYYYMMDD to ISO format
                    try:
                        upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                    except ValueError:
                        pass

                # Check if clip is newer than since_date (string comparison
                # of the two ISO texts).
                # NOTE(review): stopping here assumes the listing is ordered
                # newest-first - confirm for the range=all clips listing.
                if since_date and upload_date and upload_date <= since_date:
                    self.log(f"Reached clip from {upload_date}, stopping", 'debug')
                    break

                # Extract clip slug from URL
                clip_url = data.get('url') or data.get('webpage_url', '')
                clip_slug = clip_url.split('/')[-1] if clip_url else clip_id

                clips.append({
                    'clip_id': clip_id,
                    'clip_slug': clip_slug,
                    'title': data.get('title', f'Clip {clip_id}'),
                    'upload_date': upload_date,
                    'timestamp': timestamp,
                    'duration': data.get('duration'),
                    'view_count': data.get('view_count'),
                    'thumbnail': data.get('thumbnail'),
                    'url': clip_url,
                    'language': data.get('language'),
                    'channel_name': channel_name,
                })

                if progress_callback:
                    progress_callback(len(clips))

                if max_clips and len(clips) >= max_clips:
                    break

            except json.JSONDecodeError:
                continue

        self.log(f"Found {len(clips)} clips", 'info')

        # Cache thumbnails if requested
        if cache_thumbnails and clips:
            thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')]
            if thumbnail_urls:
                self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug')
                cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls)

                # Update clips with cached thumbnail paths
                for clip in clips:
                    thumb_url = clip.get('thumbnail')
                    if thumb_url and thumb_url in cached_paths:
                        clip['thumbnail_cached'] = cached_paths[thumb_url]

                self.log(f"Cached {len(cached_paths)} thumbnails", 'debug')

        return clips

    except Exception as e:
        self.log(f"Error getting channel clips: {e}", 'error')
        return []
|
||||
|
||||
async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best',
                        progress_callback=None) -> Dict:
    """
    Download one clip with yt-dlp

    Args:
        clip_url: Twitch clip URL
        output_dir: Directory to save the clip (created if missing)
        quality: Key into QUALITY_PRESETS; unknown keys use the 'best' preset
        progress_callback: Accepted for interface compatibility; not used here

    Returns:
        Dict with 'success' plus file details, or 'success': False and an
        'error' message (yt-dlp stderr is cut to 200 characters)
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}

    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # File name keeps the (length-limited) title and the clip id.
        name_template = str(target_dir / '%(title).100s_%(id)s.%(ext)s')

        presets = self.QUALITY_PRESETS
        selector = presets.get(quality, presets['best'])

        argv = self._get_base_cmd() + [
            '--no-warnings',
            '-f', selector,
            '-o', name_template,
            '--print-json',
            clip_url
        ]

        self.log(f"Downloading clip: {clip_url}", 'debug')

        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        out_bytes, err_bytes = await proc.communicate()

        if proc.returncode != 0:
            reason = err_bytes.decode('utf-8', errors='replace').strip()
            if len(reason) > 200:
                reason = reason[:200] + '...'
            return {'success': False, 'error': reason}

        # The first stdout line that parses as JSON is the clip metadata.
        metadata = None
        for candidate in out_bytes.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                metadata = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            break

        if not metadata:
            # No metadata printed: fall back to the newest .mp4 in the folder.
            mp4_files = list(target_dir.glob('*.mp4'))
            if not mp4_files:
                return {'success': False, 'error': 'Could not find downloaded file'}
            newest = max(mp4_files, key=lambda f: f.stat().st_mtime)
            return {
                'success': True,
                'file_path': str(newest),
                'filename': newest.name,
                'file_size': newest.stat().st_size
            }

        reported = metadata.get('_filename') or metadata.get('filename')
        saved = Path(reported) if reported else None

        # Prefer the real size on disk; otherwise use yt-dlp's reported size.
        if saved and saved.exists():
            size = saved.stat().st_size
        else:
            size = metadata.get('filesize')

        return {
            'success': True,
            'file_path': str(saved) if saved else None,
            'filename': saved.name if saved else None,
            'file_size': size,
            'title': metadata.get('title'),
            'duration': metadata.get('duration'),
            'clip_id': metadata.get('id'),
            'upload_date': metadata.get('upload_date'),
            'thumbnail': metadata.get('thumbnail'),
        }

    except Exception as e:
        self.log(f"Error downloading clip: {e}", 'error')
        return {'success': False, 'error': str(e)}
|
||||
|
||||
async def get_channel_avatar(self, channel_name: str) -> Optional[str]:
    """
    Return the channel's avatar URL taken from its profile

    Delegates to get_channel_profile(); returns None when no profile could
    be fetched or the profile carries no 'avatar' entry.
    """
    profile = await self.get_channel_profile(channel_name)
    if not profile:
        return None
    return profile.get('avatar')
|
||||
|
||||
async def get_channel_profile(self, channel_name: str) -> Optional[Dict]:
    """
    Fetch channel profile info using Twitch's GQL API.

    Args:
        channel_name: Twitch login name of the channel

    Returns:
        Dict containing whichever of avatar, banner, display_name, bio,
        joined_date and external_links (a JSON-encoded list of
        {'title', 'url'} dicts) were present, or None when the user is not
        found, nothing usable is returned, the response is not HTTP 200,
        or the request fails (failures are logged at debug level).
    """
    # The login is sent as a GraphQL variable instead of being spliced into
    # the query text, so quotes or braces in channel_name cannot alter the
    # query.
    query = '''
    query($login: String!) {
        user(login: $login) {
            id
            login
            displayName
            description
            createdAt
            profileImageURL(width: 300)
            bannerImageURL
            offlineImageURL
            channel {
                socialMedias {
                    name
                    url
                }
            }
        }
    }
    '''

    try:
        async with aiohttp.ClientSession() as session:
            headers = {
                'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko',  # Public Twitch web client ID
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            }

            async with session.post(
                'https://gql.twitch.tv/gql',
                headers=headers,
                json={'query': query, 'variables': {'login': channel_name}},
                timeout=aiohttp.ClientTimeout(total=15)
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    # "data" may be JSON null (e.g. on a GraphQL error).
                    user = (data.get('data') or {}).get('user')

                    if not user:
                        self.log(f"Twitch user not found: {channel_name}", 'warning')
                        return None

                    result = {}

                    # Avatar
                    if user.get('profileImageURL'):
                        result['avatar'] = user['profileImageURL']

                    # Banner - prefer offlineImageURL (larger), fall back to bannerImageURL
                    if user.get('offlineImageURL'):
                        result['banner'] = user['offlineImageURL']
                    elif user.get('bannerImageURL'):
                        result['banner'] = user['bannerImageURL']

                    # Display name
                    if user.get('displayName'):
                        result['display_name'] = user['displayName']

                    # Bio/description
                    if user.get('description'):
                        result['bio'] = user['description']

                    # Joined date (format: "Jun 10, 2016")
                    if user.get('createdAt'):
                        try:
                            created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00'))
                            result['joined_date'] = created_dt.strftime('%b %d, %Y')
                            self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug')
                        except (ValueError, TypeError):
                            pass

                    # Social links. "channel" and "socialMedias" may be JSON
                    # null; treat that as "no links" instead of letting an
                    # AttributeError discard the whole profile.
                    social_medias = (user.get('channel') or {}).get('socialMedias') or []
                    links = []
                    for social in social_medias:
                        name = social.get('name', 'Link')
                        url = social.get('url', '')
                        if url:
                            # Capitalize first letter of name
                            title = name.capitalize() if name else 'Link'
                            links.append({'title': title, 'url': url})
                    if links:
                        result['external_links'] = json.dumps(links)
                        self.log(f"Found {len(links)} Twitch external links", 'debug')

                    if result:
                        self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug')
                        return result

    except Exception as e:
        # Best-effort lookup: any failure is reported as "no profile".
        self.log(f"Could not fetch Twitch profile: {e}", 'debug')

    return None
|
||||
|
||||
async def get_creator(self, channel_url: str) -> Optional[Creator]:
    """
    Build a Creator record for the Twitch channel at ``channel_url``

    Returns None when get_channel_info() yields nothing for the URL.
    """
    info = await self.get_channel_info(channel_url)
    if not info:
        return None

    name = info.get('channel_name')
    if not name:
        name = self.extract_channel_name(channel_url)

    # The thumbnail in ``info`` comes from a clip, so the profile image is
    # looked up separately.
    avatar = await self.get_channel_avatar(name)

    creator_fields = {
        'creator_id': info.get('channel_id') or name,
        'service_id': 'twitch',
        'platform': 'twitch',
        'username': name or 'Unknown',
        'display_name': name,
        'profile_image_url': avatar,
        'post_count': info.get('clip_count', 0),
    }
    return Creator(**creator_fields)
|
||||
|
||||
async def get_posts(self, channel_url: str, since_date: str = None,
                    max_clips: int = None, progress_callback=None) -> List[Post]:
    """
    Return the channel's clips wrapped as Post objects

    Each clip becomes one Post with a single ``.mp4`` attachment whose
    server_path and download_url are both the clip URL.
    """
    clips = await self.get_channel_clips(channel_url, since_date, max_clips, progress_callback)

    def build_post(clip):
        media = Attachment(
            name=f"{clip['title']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=clip['url'],  # the clip URL stands in for a server path
            download_url=clip['url'],
            duration=clip.get('duration'),
        )
        return Post(
            post_id=clip['clip_id'],
            service_id='twitch',
            platform='twitch',
            creator_id=clip.get('channel_name', ''),
            title=clip['title'],
            content='',  # no description is collected for clips
            published_at=clip.get('upload_date'),
            attachments=[media],
        )

    return [build_post(clip) for clip in clips]
|
||||
484
modules/paid_content/utils.py
Normal file
484
modules/paid_content/utils.py
Normal file
@@ -0,0 +1,484 @@
|
||||
"""
|
||||
Utility functions for Paid Content feature
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def _extract_xenforo_search_query(parsed) -> Optional[str]:
|
||||
"""Extract the 'q' search parameter from a XenForo search URL."""
|
||||
from urllib.parse import parse_qs, unquote_plus
|
||||
qs = parse_qs(parsed.query)
|
||||
query = qs.get('q', [''])[0]
|
||||
if not query:
|
||||
m = re.search(r'[&?]q=([^&]+)', parsed.query)
|
||||
if m:
|
||||
query = unquote_plus(m.group(1))
|
||||
return query or None
|
||||
|
||||
|
||||
def parse_creator_url(url: str) -> Optional[Tuple[str, str, str]]:
    """
    Parse a creator URL from any supported site

    Handlers keyed on the host name are tried first. The Coppermine gallery
    check only looks at the URL path, so it runs last and cannot claim URLs
    that belong to a recognized host (e.g. reddit.com/gallery/...).

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid
             or https://www.youtube.com/@channelhandle
             or https://www.youtube.com/channel/UCxxxxx
             or https://www.twitch.tv/username/clips
             or https://fansly.com/username

    Returns:
        Tuple of (service_id, platform, creator_id) or None if the URL is
        invalid, unsupported, or parsing raises
    """
    try:
        parsed = urlparse(url)
        host = parsed.netloc.lower()

        # Handle YouTube URLs
        if 'youtube.com' in host or 'youtu.be' in host:
            channel_id = _extract_youtube_channel_id(url)
            if channel_id:
                return ('youtube', 'youtube', channel_id)
            return None

        # Handle Twitch URLs
        if 'twitch.tv' in host:
            channel_name = _extract_twitch_channel_name(url)
            if channel_name:
                return ('twitch', 'twitch', channel_name)
            return None

        # Handle Fansly URLs (direct API)
        if 'fansly.com' in host:
            username = _extract_fansly_username(url)
            if username:
                return ('fansly_direct', 'fansly', username)
            return None

        # Handle OnlyFans URLs (direct API)
        if 'onlyfans.com' in host:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if path_parts:
                username = path_parts[0]
                # First path segments that are site pages, not creators.
                if username.lower() not in ('my', 'api2', 'settings', 'search', 'notifications', 'chats', 'vault', 'lists', 'bookmarks', 'statements', 'help', 'terms', 'privacy', 'dmca', 'contact'):
                    return ('onlyfans_direct', 'onlyfans', username)
            return None

        # Handle Pornhub URLs
        if 'pornhub.com' in host:
            creator_id = _extract_pornhub_creator_id(url)
            if creator_id:
                return ('pornhub', 'pornhub', creator_id)
            return None

        # Handle XHamster URLs
        if 'xhamster' in host:
            creator_id = _extract_xhamster_creator_id(url)
            if creator_id:
                return ('xhamster', 'xhamster', creator_id)
            return None

        # Handle TikTok URLs
        if 'tiktok.com' in host:
            username = _extract_tiktok_username(url)
            if username:
                return ('tiktok', 'tiktok', username)
            return None

        # Handle Instagram URLs
        if 'instagram.com' in host:
            username = _extract_instagram_username(url)
            if username:
                return ('instagram', 'instagram', username)
            return None

        # Handle BestEyeCandy URLs
        if 'besteyecandy.com' in host:
            cid_match = re.search(r'cid-(\d+)', parsed.path)
            slug_match = re.search(r'/([^/]+)\.html$', parsed.path)
            if cid_match and slug_match:
                slug = slug_match.group(1)
                return ('besteyecandy', 'besteyecandy', f"{cid_match.group(1)}/{slug}")
            elif cid_match:
                return ('besteyecandy', 'besteyecandy', cid_match.group(1))
            return None

        # Handle Bellazon URLs (forum threads as creators)
        if 'bellazon' in host:
            match = re.search(r'/topic/(\d+)-([^/]+)', parsed.path)
            if match:
                topic_id = match.group(1)
                return ('bellazon', 'bellazon', topic_id)
            return None

        # Handle Reddit URLs
        if 'reddit.com' in host:
            # Handle reddit.com/r/subreddit, old.reddit.com/r/subreddit, etc.
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 2 and path_parts[0] == 'r':
                subreddit = path_parts[1].lower()
                return ('reddit', 'reddit', subreddit)
            return None

        # Handle Snapchat URLs
        if 'snapchat.com' in host:
            # Handle snapchat.com/@username and story.snapchat.com/@username
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if path_parts:
                username = path_parts[0].lstrip('@')
                if username:
                    return ('snapchat', 'snapchat', username)
            return None

        # Handle HQCelebCorner URLs
        if 'hqcelebcorner' in host:
            query = _extract_xenforo_search_query(parsed)
            if query:
                return ('hqcelebcorner', 'hqcelebcorner', query)
            return None

        # Handle PicturePub URLs
        if 'picturepub' in host:
            query = _extract_xenforo_search_query(parsed)
            if query:
                return ('picturepub', 'picturepub', query)
            return None

        # Handle Soundgasm URLs
        if 'soundgasm.net' in host:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 2 and path_parts[0] in ('u', 'user'):
                return ('soundgasm', 'soundgasm', path_parts[1])
            return None

        # Handle Liltsome URLs (archive, maps to soundgasm platform)
        if 'liltsome.yerf.org' in host:
            # Hash-based routing: /#/artist/{name}
            fragment = parsed.fragment  # e.g. "/artist/kinkyshibby"
            if fragment:
                parts = [p for p in fragment.strip('/').split('/') if p]
                if len(parts) >= 2 and parts[0] == 'artist':
                    return ('soundgasm', 'soundgasm', parts[1])
            return None

        # Handle Coomer/Kemono URLs
        service_id = None
        if 'coomer' in host:
            service_id = 'coomer'
        elif 'kemono' in host:
            service_id = 'kemono'

        if service_id:
            # Parse path: /platform/user/creatorid
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 3 and path_parts[1] == 'user':
                platform = path_parts[0]
                creator_id = path_parts[2]
                return (service_id, platform, creator_id)
            return None

        # Handle Coppermine gallery URLs (any other host)
        # Match: domain.com/gallery/, domain.com/cpg/, domain.com/coppermine/
        # Also match direct index.php/thumbnails.php/displayimage.php pages
        if any(p in parsed.path.lower() for p in ['/gallery/', '/cpg/', '/coppermine/']) or \
                re.search(r'(?:index|thumbnails|displayimage)\.php', parsed.path):
            # Normalize to gallery root
            base_path = re.sub(
                r'(?:index|thumbnails|displayimage)\.php.*$', '', parsed.path
            )
            base_path = base_path.rstrip('/')
            if base_path:
                # Use domain + path as creator_id (e.g. kylie-jenner.org/gallery).
                # Only a leading "www." is dropped; "www." elsewhere in the
                # host name is part of the domain.
                bare_host = host[4:] if host.startswith('www.') else host
                return ('coppermine', 'coppermine', bare_host + base_path)

        return None

    except Exception:
        return None
|
||||
|
||||
|
||||
def _extract_youtube_channel_id(url: str) -> Optional[str]:
|
||||
"""
|
||||
Extract channel identifier from various YouTube URL formats
|
||||
|
||||
Supports:
|
||||
- youtube.com/channel/UC...
|
||||
- youtube.com/@handle
|
||||
- youtube.com/c/channelname
|
||||
- youtube.com/user/username
|
||||
"""
|
||||
patterns = [
|
||||
r'youtube\.com/channel/([a-zA-Z0-9_-]+)',
|
||||
r'youtube\.com/@([a-zA-Z0-9_.-]+)',
|
||||
r'youtube\.com/c/([a-zA-Z0-9_-]+)',
|
||||
r'youtube\.com/user/([a-zA-Z0-9_-]+)',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_twitch_channel_name(url: str) -> Optional[str]:
|
||||
"""
|
||||
Extract channel name from Twitch URL
|
||||
|
||||
Supports:
|
||||
- twitch.tv/username
|
||||
- twitch.tv/username/clips
|
||||
- m.twitch.tv/username/clips
|
||||
"""
|
||||
patterns = [
|
||||
r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return match.group(1).lower()
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_fansly_username(url: str) -> Optional[str]:
|
||||
"""
|
||||
Extract username from Fansly URL
|
||||
|
||||
Supports:
|
||||
- fansly.com/username
|
||||
- fansly.com/username/posts
|
||||
- fansly.com/username/media
|
||||
"""
|
||||
patterns = [
|
||||
r'fansly\.com/([a-zA-Z0-9_.-]+)(?:/(?:posts|media))?',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
username = match.group(1)
|
||||
# Filter out known non-username paths
|
||||
if username.lower() not in ('explore', 'search', 'settings', 'notifications', 'messages', 'live'):
|
||||
return username
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_pornhub_creator_id(url: str) -> Optional[str]:
|
||||
"""Extract creator identifier from Pornhub URL, returns 'type/name' format"""
|
||||
patterns = [
|
||||
r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)',
|
||||
r'pornhub\.com/channels/([a-zA-Z0-9_-]+)',
|
||||
r'pornhub\.com/users/([a-zA-Z0-9_-]+)',
|
||||
r'pornhub\.com/model/([a-zA-Z0-9_-]+)',
|
||||
]
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
# Store as "type/name" to preserve the URL type
|
||||
type_match = re.search(r'pornhub\.com/(pornstar|channels|users|model)/', url)
|
||||
return f"{type_match.group(1)}/{match.group(1)}" if type_match else match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def _extract_xhamster_creator_id(url: str) -> Optional[str]:
|
||||
"""Extract creator identifier from XHamster URL, returns 'type/name' format"""
|
||||
patterns = [
|
||||
r'xhamster\d*\.com/creators/([a-zA-Z0-9_-]+)',
|
||||
r'xhamster\d*\.com/channels/([a-zA-Z0-9_-]+)',
|
||||
]
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
type_match = re.search(r'xhamster\d*\.com/(creators|channels)/', url)
|
||||
return f"{type_match.group(1)}/{match.group(1)}" if type_match else match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def _extract_tiktok_username(url: str) -> Optional[str]:
|
||||
"""Extract username from TikTok URL"""
|
||||
match = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def _extract_instagram_username(url: str) -> Optional[str]:
|
||||
"""Extract username from Instagram URL"""
|
||||
match = re.search(r'instagram\.com/([a-zA-Z0-9_.]+)/?', url)
|
||||
if match:
|
||||
username = match.group(1).lower()
|
||||
non_usernames = {
|
||||
'explore', 'reels', 'stories', 'p', 'tv', 'accounts',
|
||||
'direct', 'about', 'legal', 'developer', 'privacy',
|
||||
'terms', 'help', 'api', 'reel', 'tags'
|
||||
}
|
||||
if username not in non_usernames:
|
||||
return username
|
||||
return None
|
||||
|
||||
|
||||
def parse_post_url(url: str) -> Optional[Tuple[str, str, str, str]]:
    """
    Split a Coomer/Kemono post URL into its components.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid/post/postid

    Returns:
        Tuple of (service_id, platform, creator_id, post_id), or None when
        the host is neither service, the path has the wrong shape, or
        parsing fails for any reason.
    """
    try:
        pieces = urlparse(url)
        hostname = pieces.netloc.lower()

        # 'coomer' is checked before 'kemono', mirroring the precedence
        # used when a host name happens to contain both.
        service_id = next(
            (name for name in ('coomer', 'kemono') if name in hostname),
            None,
        )
        if service_id is None:
            return None

        # Expected layout: /platform/user/creatorid/post/postid
        segments = [seg for seg in pieces.path.strip('/').split('/') if seg]
        if len(segments) < 5 or segments[1] != 'user' or segments[3] != 'post':
            return None

        return (service_id, segments[0], segments[2], segments[4])

    except Exception:
        return None
|
||||
|
||||
|
||||
def format_file_size(size_bytes: int) -> str:
    """
    Render a byte count as a human-readable string with one decimal place.

    Units step by 1024 from B up to PB. ``None`` yields ``'Unknown'``.
    """
    if size_bytes is None:
        return 'Unknown'

    amount = size_bytes
    for suffix in ('B', 'KB', 'MB', 'GB', 'TB'):
        if abs(amount) < 1024.0:
            return f"{amount:.1f} {suffix}"
        amount = amount / 1024.0

    # Anything that survived five divisions is reported in petabytes.
    return f"{amount:.1f} PB"
|
||||
|
||||
|
||||
def sanitize_filename(name: str, max_length: int = 200) -> str:
    """
    Sanitize a string for use in a filename.

    Characters that are invalid in file names (``<>:"/\\|?*`` and ASCII
    control characters) are removed, whitespace runs become a single ``-``,
    and leading/trailing dots and dashes are stripped.

    Args:
        name: String to sanitize.
        max_length: Maximum length of the result.

    Returns:
        The sanitized name, or ``'unnamed'`` if nothing usable remains.
    """
    if not name:
        return 'unnamed'

    # Remove/replace invalid characters
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
    name = re.sub(r'\s+', '-', name.strip())
    name = name.strip('.-')

    if len(name) > max_length:
        # Cutting mid-string can expose a new trailing '.' or '-', which the
        # strip above was meant to rule out, so strip the tail again.
        name = name[:max_length].rstrip('.-')

    return name or 'unnamed'
|
||||
|
||||
|
||||
def extract_platform_from_domain(domain: str) -> Optional[str]:
    """
    Map a creator-platform domain to its platform name.

    Args:
        domain: Host name such as ``www.patreon.com`` (case-insensitive).

    Returns:
        The platform key (e.g. ``'patreon'``), or None for unknown domains.
    """
    domain = domain.lower()
    # Drop only a leading "www." label. Removing the substring anywhere
    # would turn unrelated hosts such as "fansly.www.com" into "fansly.com".
    if domain.startswith('www.'):
        domain = domain[len('www.'):]

    platform_domains = {
        'onlyfans.com': 'onlyfans',
        'fansly.com': 'fansly',
        'patreon.com': 'patreon',
        'fanbox.cc': 'fanbox',
        'gumroad.com': 'gumroad',
        'subscribestar.com': 'subscribestar',
        'subscribestar.adult': 'subscribestar',
        'discord.com': 'discord',
        'discord.gg': 'discord',
        'candfans.jp': 'candfans',
    }

    return platform_domains.get(domain)
|
||||
|
||||
|
||||
def detect_content_type(filename: str) -> str:
    """
    Classify a file by the extension after its last dot.

    Returns:
        One of 'image', 'video', 'audio', 'archive', 'document', or
        'unknown' (empty name, no dot, or unrecognised extension).
    """
    if not filename:
        return 'unknown'

    _, dot, tail = filename.rpartition('.')
    ext = tail.lower() if dot else ''

    # Checked in order; the extension sets do not overlap.
    categories = (
        ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}),
        ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg', '3gp'}),
        ('audio', {'mp3', 'wav', 'flac', 'aac', 'm4a', 'ogg', 'wma'}),
        ('archive', {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'}),
        ('document', {'pdf', 'doc', 'docx', 'txt', 'rtf', 'odt'}),
    )

    for label, extensions in categories:
        if ext in extensions:
            return label
    return 'unknown'
|
||||
|
||||
|
||||
def get_service_platforms(service_id: str) -> list:
    """
    Return the platform names served by a given service.

    Unknown service ids yield an empty list.
    """
    service_map = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord'],
        'youtube': ['youtube'],
        'twitch': ['twitch'],
        'fansly_direct': ['fansly'],
        'onlyfans_direct': ['onlyfans'],
        'pornhub': ['pornhub'],
        'xhamster': ['xhamster'],
        'tiktok': ['tiktok'],
        'instagram': ['instagram'],
        'soundgasm': ['soundgasm'],
        'bellazon': ['bellazon'],
        'besteyecandy': ['besteyecandy'],
        'snapchat': ['snapchat'],
        'reddit': ['reddit'],
        'coppermine': ['coppermine'],
        'hqcelebcorner': ['hqcelebcorner'],
        'picturepub': ['picturepub'],
    }
    if service_id in service_map:
        return service_map[service_id]
    return []
|
||||
|
||||
|
||||
def get_service_base_url(service_id: str) -> Optional[str]:
    """
    Look up the built-in default base URL for a service.

    Note: these are fallback defaults only. For dynamic URLs, use the
    database (paid_content_services table).

    Returns:
        The default URL, or None when the service has no default.
    """
    # Deferred import: importing at module load would create a cycle.
    from .api_client import PaidContentAPIClient

    fallback_urls = PaidContentAPIClient.DEFAULT_SERVICE_URLS
    return fallback_urls.get(service_id)
|
||||
744
modules/paid_content/xenforo_forum_client.py
Normal file
744
modules/paid_content/xenforo_forum_client.py
Normal file
@@ -0,0 +1,744 @@
|
||||
"""
|
||||
Generic XenForo Forum Client for Paid Content
|
||||
|
||||
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
|
||||
treating each celebrity name as a "creator" and each matching thread as a post.
|
||||
|
||||
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
|
||||
and resolved via ImageHostHandler from forum_downloader.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse, unquote_plus
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class XenForoForumClient(LoggingMixin):
|
||||
"""Generic client for scraping XenForo-based forum threads."""
|
||||
|
||||
FLARESOLVERR_URL = 'http://localhost:8191/v1'
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
}
|
||||
|
||||
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
|
||||
|
||||
# External image host domains to look for in post links
|
||||
IMAGE_HOST_DOMAINS = [
|
||||
'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
|
||||
'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
|
||||
'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
|
||||
'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
|
||||
]
|
||||
|
||||
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
|
||||
self.SERVICE_ID = service_id
|
||||
self.BASE_URL = base_url.rstrip('/')
|
||||
self.COOKIE_PATH = cookie_path
|
||||
self._init_logger('PaidContent', log_callback, default_module=service_id)
|
||||
self._cookies: Optional[Dict[str, str]] = None
|
||||
self._image_host_handler = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _load_cookies(self) -> Dict[str, str]:
|
||||
"""Load Playwright-format cookies and convert to {name: value} dict."""
|
||||
if self._cookies is not None:
|
||||
return self._cookies
|
||||
|
||||
try:
|
||||
cookie_path = Path(self.COOKIE_PATH)
|
||||
if cookie_path.exists():
|
||||
with open(cookie_path, 'r') as f:
|
||||
raw_cookies = json.load(f)
|
||||
self._cookies = {c['name']: c['value'] for c in raw_cookies}
|
||||
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
|
||||
else:
|
||||
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
|
||||
self._cookies = {}
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies: {e}", 'warning')
|
||||
self._cookies = {}
|
||||
|
||||
return self._cookies
|
||||
|
||||
def _get_cookie_header(self) -> str:
|
||||
"""Build Cookie header string from loaded cookies."""
|
||||
cookies = self._load_cookies()
|
||||
return '; '.join(f'{k}={v}' for k, v in cookies.items())
|
||||
|
||||
def _get_request_headers(self) -> Dict[str, str]:
|
||||
"""Get headers with cookies for authenticated requests."""
|
||||
headers = dict(self.HEADERS)
|
||||
cookie_str = self._get_cookie_header()
|
||||
if cookie_str:
|
||||
headers['Cookie'] = cookie_str
|
||||
return headers
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Image host handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_image_host_handler(self):
|
||||
"""Get or create ImageHostHandler instance."""
|
||||
if self._image_host_handler is None:
|
||||
try:
|
||||
from modules.forum_downloader import ImageHostHandler
|
||||
self._image_host_handler = ImageHostHandler
|
||||
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
|
||||
except ImportError:
|
||||
self.log("ImageHostHandler not available", 'warning')
|
||||
self._image_host_handler = False # sentinel to avoid retrying
|
||||
return self._image_host_handler if self._image_host_handler is not False else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """GET a page with the forum cookies and return its text.

    A 403 response, or any exception raised by the request, falls back to
    FlareSolverr. Any other non-200 status is logged and yields None.
    """
    request_headers = self._get_request_headers()
    try:
        async with session.get(url, headers=request_headers, allow_redirects=True) as response:
            status = response.status
            if status == 200:
                return await response.text()
            if status != 403:
                self.log(f"HTTP {status} for {url}", 'warning')
                return None
            self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
            return await self._fetch_via_flaresolverr(url)
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return await self._fetch_via_flaresolverr(url)
|
||||
|
||||
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch a page through FlareSolverr to bypass Cloudflare.

    A throw-away FlareSolverr session is created, the page is requested
    with the forum cookies, and the session is destroyed again.

    The ``requests`` calls are blocking (the page request may take up to
    70 seconds), so each one runs in the event loop's default executor
    instead of directly inside this coroutine; other tasks keep running
    while FlareSolverr works.

    Args:
        url: Page URL to fetch.

    Returns:
        The response body reported by FlareSolverr, or None when
        ``requests`` is unavailable or any step fails.
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None

    loop = asyncio.get_running_loop()

    def _post(payload, timeout):
        # Runs in a worker thread; performs the blocking HTTP call only.
        return std_requests.post(self.FLARESOLVERR_URL, json=payload, timeout=timeout)

    async def _call(payload, timeout):
        return await loop.run_in_executor(None, _post, payload, timeout)

    fs_session_id = None
    try:
        # Create session
        resp = await _call({'cmd': 'sessions.create'}, 30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session", 'warning')
            return None
        fs_session_id = data.get('session')

        # Fetch page
        cookies = self._load_cookies()
        resp = await _call({
            'cmd': 'request.get',
            'url': url,
            'session': fs_session_id,
            'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
            'maxTimeout': 60000,
        }, 70)
        page_data = resp.json()
        if page_data.get('status') == 'ok':
            return page_data.get('solution', {}).get('response', '')
        self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
        return None

    except Exception as e:
        self.log(f"FlareSolverr error for {url}: {e}", 'warning')
        return None
    finally:
        if fs_session_id:
            # Best-effort cleanup: a failed destroy must not mask the result,
            # but leave a trace so leaked sessions can be diagnosed.
            try:
                await _call({
                    'cmd': 'sessions.destroy',
                    'session': fs_session_id,
                }, 10)
            except Exception as e:
                self.log(f"Failed to destroy FlareSolverr session {fs_session_id}: {e}", 'debug')
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def search_threads(self, query: str) -> List[Dict]:
    """Search the forum for threads whose title matches *query*.

    The search page is fetched first to obtain the ``_xfToken`` CSRF
    token, the search form is POSTed, and "Next" links are followed until
    no further page is found, a page cannot be fetched, a page yields no
    results, or a page URL repeats.

    Args:
        query: Search keywords (typically a celebrity name).

    Returns:
        Thread dicts as produced by ``_parse_search_results``, unique by
        ``thread_id`` across all result pages. Empty on failure.
    """
    threads = []
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        # XenForo search: POST form to /search/search
        search_url = f'{self.BASE_URL}/search/search'
        headers = self._get_request_headers()
        headers['Content-Type'] = 'application/x-www-form-urlencoded'

        # Need CSRF token - fetch search page first
        search_page_url = f'{self.BASE_URL}/search/'
        page_html = await self._fetch_page(session, search_page_url)
        if not page_html:
            self.log("Failed to fetch search page", 'warning')
            return threads

        # Extract CSRF token (an empty token is still submitted if absent)
        csrf_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', page_html)
        xf_token = csrf_match.group(1) if csrf_match else ''

        form_data = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',
            'order': 'date',
            '_xfToken': xf_token,
        }

        try:
            async with session.post(search_url, headers=headers, data=form_data,
                                    allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return threads
                result_html = await resp.text()
                result_url = str(resp.url)
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return threads

        threads = self._parse_search_results(result_html)

        # _parse_search_results only de-duplicates within one page, so track
        # ids across pages here; also remember visited page URLs so a
        # repeated "Next" link cannot make the loop spin forever.
        seen_ids = {t['thread_id'] for t in threads}
        visited_urls = {result_url}

        # Handle search result pagination
        page = 2
        while True:
            next_url = self._find_next_search_page(result_html, result_url, page)
            if not next_url or next_url in visited_urls:
                break
            visited_urls.add(next_url)
            await asyncio.sleep(0.3)
            result_html = await self._fetch_page(session, next_url)
            if not result_html:
                break
            result_url = next_url
            more = self._parse_search_results(result_html)
            if not more:
                break
            for thread in more:
                if thread['thread_id'] not in seen_ids:
                    seen_ids.add(thread['thread_id'])
                    threads.append(thread)
            page += 1

    self.log(f"Search for '{query}' found {len(threads)} threads", 'info')
    return threads
|
||||
|
||||
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """Fetch the first page of a thread and collect its metadata.

    Returns:
        ``{thread_id, title, reply_count, page_count, url}`` where ``url``
        has any fragment and trailing slash removed and a missing title
        becomes ``'Untitled'``; None when the page cannot be fetched or
        any error occurs.
    """
    client_timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            first_page = await self._fetch_page(session, thread_url)
            if not first_page:
                return None

            title = self._extract_title(first_page)
            page_count = self._extract_page_count(first_page)
            reply_count = self._extract_reply_count(first_page)
            thread_id = self._extract_thread_id(thread_url)
            without_fragment = thread_url.partition('#')[0]

            return dict(
                thread_id=thread_id,
                title=title or 'Untitled',
                reply_count=reply_count,
                page_count=page_count,
                url=without_fragment.rstrip('/'),
            )
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
|
||||
|
||||
async def get_thread_images(self, thread_url: str, page_count: int = None,
                            start_page: int = 1) -> List[Dict]:
    """Walk the pages of a thread and gather its image links.

    When ``page_count`` is None, page 1 is fetched to determine it, its
    links are collected, and scanning continues from page 2 regardless of
    ``start_page``. Scanning stops at the first page that fails to load.

    Returns:
        Image dicts as produced by ``_extract_image_links``, unique by
        their ``url`` value, in discovery order.
    """
    images = []
    seen_urls: Set[str] = set()

    def absorb(candidates) -> int:
        """Append unseen candidates to ``images``; return how many were new."""
        added = 0
        for item in candidates:
            link = item['url']
            if link in seen_urls:
                continue
            seen_urls.add(link)
            images.append(item)
            added += 1
        return added

    client_timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # If page_count not provided, fetch page 1 to determine it
        if page_count is None:
            first_html = await self._fetch_page(session, thread_url)
            if not first_html:
                return images
            page_count = self._extract_page_count(first_html)
            absorb(self._extract_image_links(first_html))
            start_page = 2

        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # Rate limit

            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break

            new_count = absorb(self._extract_image_links(page_html))
            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

    self.log(f"Total: {len(images)} unique image links from {page_count} pages", 'info')
    return images
|
||||
|
||||
async def resolve_image_url(self, host_page_url: str, session: aiohttp.ClientSession = None) -> Optional[str]:
    """Turn an image-host page URL into a direct image URL.

    Resolution order:
    1. ``ImageHostHandler.extract_direct_url`` on the URL alone.
    2. imgbox thumbnail URL rewritten to the full-size URL.
    3. Fetch the host page, then try the handler with the page content.
    4. Manual HTML extraction via ``_extract_direct_image_from_html``.

    A session is created (and closed afterwards) when none is supplied.

    Returns:
        The direct URL, or None if the page cannot be fetched or nothing
        could be extracted.
    """
    handler = self._get_image_host_handler()

    # Try direct extraction without fetching the page
    if handler:
        from_url = handler.extract_direct_url(host_page_url)
        if from_url:
            return from_url

    # imgbox thumbnail → full image conversion (thumbs2 → images2)
    thumb = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
    if thumb:
        shard, folder, image_id = thumb.groups()
        return f"https://images{shard}.imgbox.com/{folder}{image_id}_o.jpg"

    # For hosts that need page content, fetch and parse
    own_session = session is None
    if own_session:
        session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))

    try:
        # ImageBam requires sfw_inter=1 cookie to bypass consent page
        request_headers = dict(self.HEADERS)
        if 'imagebam' in host_page_url:
            request_headers['Cookie'] = 'sfw_inter=1'

        try:
            async with session.get(host_page_url, headers=request_headers,
                                   allow_redirects=True) as response:
                if response.status != 200:
                    return None
                page_content = await response.text()
                final_url = str(response.url)
        except Exception as e:
            self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
            return None

        # Try handler with page content
        if handler:
            from_page = handler.extract_direct_url(host_page_url, page_content=page_content)
            if from_page:
                return from_page

        # Manual extraction fallbacks
        return self._extract_direct_image_from_html(host_page_url, page_content, final_url)

    finally:
        if own_session:
            await session.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _parse_search_results(self, html_content: str) -> List[Dict]:
|
||||
"""Parse XenForo search results page for thread links."""
|
||||
threads = []
|
||||
|
||||
# Parse each contentRow block to extract title, URL, and date
|
||||
for block_match in re.finditer(
|
||||
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
block = block_match.group(1)
|
||||
|
||||
# Extract thread URL and title
|
||||
title_match = re.search(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
block, re.DOTALL
|
||||
)
|
||||
if not title_match:
|
||||
continue
|
||||
|
||||
url = title_match.group(1)
|
||||
title_raw = title_match.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
|
||||
if not title:
|
||||
continue
|
||||
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
|
||||
# Extract date from <time datetime="..."> tag
|
||||
published_at = None
|
||||
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
|
||||
if time_match:
|
||||
published_at = time_match.group(1)
|
||||
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': published_at,
|
||||
})
|
||||
|
||||
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
|
||||
if not threads:
|
||||
for m in re.finditer(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
url = m.group(1)
|
||||
title_raw = m.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
if not title:
|
||||
continue
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': None,
|
||||
})
|
||||
|
||||
# Deduplicate by thread_id
|
||||
seen = set()
|
||||
unique = []
|
||||
for t in threads:
|
||||
if t['thread_id'] not in seen:
|
||||
seen.add(t['thread_id'])
|
||||
unique.append(t)
|
||||
|
||||
return unique
|
||||
|
||||
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
|
||||
"""Find URL for the next page of search results."""
|
||||
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
|
||||
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
|
||||
m = re.search(pattern, html_content, re.IGNORECASE)
|
||||
if m:
|
||||
url = m.group(1)
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + html.unescape(url)
|
||||
return url
|
||||
return None
|
||||
|
||||
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
|
||||
JUNK_URL_PATTERNS = [
|
||||
'giphy.com', 'tenor.com', 'gfycat.com', # reaction GIFs
|
||||
'jsdelivr.net', 'joypixels', 'twemoji', # emoji CDNs
|
||||
'wp-content/', # WordPress media (blog graphics, profile pics)
|
||||
'/unicode/', '/emoji/', # emoji paths
|
||||
'haboodadi.com', # forum signature images
|
||||
]
|
||||
|
||||
# Image hosts that are permanently dead (DNS gone / domain expired)
|
||||
DEAD_HOSTS = [
|
||||
'someimage.com',
|
||||
]
|
||||
|
||||
def _extract_image_links(self, page_html: str) -> List[Dict]:
|
||||
"""Extract image host links from all posts on a page."""
|
||||
images = []
|
||||
|
||||
# Find all message bodies: XenForo uses <article class="message ..."> and
|
||||
# <div class="bbWrapper"> for post content
|
||||
for content_match in re.finditer(
|
||||
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
|
||||
page_html, re.DOTALL
|
||||
):
|
||||
content = content_match.group(1)
|
||||
|
||||
# Extract links to known image hosts
|
||||
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
|
||||
link_url = html.unescape(link_match.group(1))
|
||||
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
|
||||
images.append({'url': link_url, 'host': self._identify_host(link_url)})
|
||||
|
||||
# Also catch direct image URLs (full-size, not thumbnails)
|
||||
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
|
||||
# — legitimate gallery images are posted as <a href> links to host pages
|
||||
# (handled above), while inline <img> from these hosts are signatures.
|
||||
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
|
||||
img_url = html.unescape(img_match.group(1))
|
||||
# Skip thumbnails, avatars, smilies, and junk
|
||||
if any(skip in img_url.lower() for skip in [
|
||||
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
|
||||
'data/assets', '/styles/', 'xenforo'
|
||||
]):
|
||||
continue
|
||||
if self._is_junk_url(img_url):
|
||||
continue
|
||||
# Skip inline images from known image hosts — these are signatures,
|
||||
# not gallery content (gallery images come through as <a> links above)
|
||||
if self._is_image_host_url(img_url):
|
||||
continue
|
||||
if self._is_direct_image_url(img_url):
|
||||
images.append({'url': img_url, 'host': 'direct'})
|
||||
|
||||
return images
|
||||
|
||||
def _is_junk_url(self, url: str) -> bool:
|
||||
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
|
||||
url_lower = url.lower()
|
||||
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
|
||||
return True
|
||||
if any(host in url_lower for host in self.DEAD_HOSTS):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_image_host_url(self, url: str) -> bool:
|
||||
"""Check if a URL belongs to a known image hosting service."""
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _is_direct_image_url(self, url: str) -> bool:
|
||||
"""Check if a URL points directly to an image file."""
|
||||
try:
|
||||
path = urlparse(url).path.lower()
|
||||
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _identify_host(self, url: str) -> str:
|
||||
"""Identify which image host a URL belongs to."""
|
||||
handler = self._get_image_host_handler()
|
||||
if handler:
|
||||
host = handler.identify_host(url)
|
||||
if host:
|
||||
return host
|
||||
# Fallback
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
for host_domain in self.IMAGE_HOST_DOMAINS:
|
||||
if host_domain in domain:
|
||||
return host_domain.split('.')[0]
|
||||
except Exception:
|
||||
pass
|
||||
return 'unknown'
|
||||
|
||||
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
|
||||
"""Manually extract direct image URL from host page HTML."""
|
||||
domain = urlparse(url).netloc.lower()
|
||||
|
||||
# imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
|
||||
if 'imagebam' in domain:
|
||||
m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
# Alternative: og:image meta tag
|
||||
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# pixhost: <img id="image" src="..."> or img.pixhost.to URL
|
||||
if 'pixhost' in domain:
|
||||
m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
# Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
|
||||
m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
|
||||
if m:
|
||||
return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"
|
||||
|
||||
# imagetwist: <img class="pic" src="...">
|
||||
if 'imagetwist' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
|
||||
page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# imgbox: <img id="img" src="..."> or src before id
|
||||
if 'imgbox' in domain:
|
||||
m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
# Direct image URL pattern
|
||||
m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# turboimagehost: <img class="uImage" src="...">
|
||||
if 'turboimagehost' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# acidimg: <img class="centred" src="...">
|
||||
if 'acidimg' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# pixxxels: same pattern as acidimg
|
||||
if 'pixxxels' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# imx.to: <img class="image-show" src="...">
|
||||
if 'imx.to' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# Generic: try og:image meta tag
|
||||
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
|
||||
if m:
|
||||
img_url = html.unescape(m.group(1))
|
||||
if self._is_direct_image_url(img_url):
|
||||
return img_url
|
||||
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from XenForo <h1 class="p-title-value">."""
|
||||
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
|
||||
if m:
|
||||
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
|
||||
title = re.sub(r'<[^>]+>', '', m.group(1))
|
||||
# Clean up non-breaking spaces and extra whitespace
|
||||
title = title.replace('\xa0', ' ')
|
||||
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
|
||||
title = re.sub(r'\s+', ' ', title).strip()
|
||||
return html.unescape(title)
|
||||
# Fallback: <title> — strip common XenForo site name suffixes
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from XenForo pagination."""
|
||||
# <li class="pageNav-page"><a href="...">42</a></li>
|
||||
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
|
||||
if pages:
|
||||
return max(int(p) for p in pages)
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _extract_reply_count(page_html: str) -> int:
|
||||
"""Extract reply count from XenForo thread info."""
|
||||
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
|
||||
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
|
||||
if m:
|
||||
return int(m.group(1).replace(',', ''))
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _extract_thread_id(url: str) -> Optional[str]:
|
||||
"""Extract thread ID from XenForo URL.
|
||||
|
||||
Handles both formats:
|
||||
- /threads/title.12345/
|
||||
- /index.php?threads/title.12345/
|
||||
"""
|
||||
m = re.search(r'threads/[^/]*?\.(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
# Fallback: just /threads/{id}/
|
||||
m = re.search(r'threads/(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _build_page_url(thread_url: str, page_num: int) -> str:
|
||||
"""Build paginated thread URL for XenForo.
|
||||
|
||||
Handles: /index.php?threads/slug.12345/page-2
|
||||
"""
|
||||
# Remove existing page- suffix and fragment
|
||||
base = thread_url.split('#')[0].rstrip('/')
|
||||
base = re.sub(r'/page-\d+$', '', base)
|
||||
if page_num == 1:
|
||||
return base + '/'
|
||||
return f'{base}/page-{page_num}'
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
|
||||
"""Get lowercase file extension."""
|
||||
clean = filename_or_url.split('?')[0].split('#')[0]
|
||||
if '.' in clean.split('/')[-1]:
|
||||
return clean.rsplit('.', 1)[-1].lower()
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
|
||||
"""Extract filename from URL path."""
|
||||
path = urlparse(url).path
|
||||
name = path.rstrip('/').split('/')[-1]
|
||||
return name if name else 'unnamed.jpg'
|
||||
1414
modules/paid_content/xhamster_client.py
Normal file
1414
modules/paid_content/xhamster_client.py
Normal file
File diff suppressed because it is too large
Load Diff
1087
modules/paid_content/youtube_client.py
Normal file
1087
modules/paid_content/youtube_client.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user