Files
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

9844 lines
451 KiB
Python

"""
Main scraper orchestrating content download from Coomer/Kemono
"""
import asyncio
import hashlib
import json
import re
import os
import shutil
import subprocess
import sys
import tempfile
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import aiohttp
import aiofiles
try:
from PIL import Image
HAS_PIL = True
except ImportError:
HAS_PIL = False
from modules.base_module import LoggingMixin, DeferredDownloadsMixin
from modules.activity_status import get_activity_manager
from .api_client import PaidContentAPIClient
from .db_adapter import PaidContentDBAdapter
from .youtube_client import YouTubeClient
from .twitch_client import TwitchClient
from .fansly_direct_client import FanslyDirectClient
from .onlyfans_client import OnlyFansClient
from .pornhub_client import PornhubClient
from .xhamster_client import XHamsterClient
from .tiktok_client import TikTokClient
from .instagram_adapter import InstagramAdapter
from .soundgasm_client import SoundgasmClient, format_tag_display
from .bellazon_client import BellazonClient
from .besteyecandy_client import BestEyeCandyClient
from .snapchat_client import SnapchatPaidContentClient
from .reddit_client import RedditClient
from .xenforo_forum_client import XenForoForumClient
from .coppermine_client import CoppermineClient
from .models import Post, Attachment, SyncResult, DownloadResult
class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
    """
    Main scraper for Paid Content feature
    Responsibilities:
    - Sync creators (fetch new posts)
    - Download attachments
    - Download embedded videos
    - File organization
    - Duplicate detection
    - Progress reporting
    """
    # Regex patterns for extracting embedded URLs from post content.
    # Each entry is (pattern, platform); the single capture group is the video id.
    EMBED_PATTERNS = [
        (r'https?://(?:www\.)?youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})', 'youtube'),
        (r'https?://youtu\.be/([a-zA-Z0-9_-]{11})', 'youtube'),
        (r'https?://(?:www\.)?vimeo\.com/(\d+)', 'vimeo'),
        (r'https?://(?:www\.)?dailymotion\.com/video/([a-zA-Z0-9]+)', 'dailymotion'),
        (r'https?://(?:www\.)?twitch\.tv/videos/(\d+)', 'twitch'),
    ]
    # XenForo-based forum configs (service_id → settings).
    # Used both for dispatch in sync_creator and by _get_xenforo_client.
    XENFORO_FORUMS = {
        'hqcelebcorner': {
            'base_url': 'https://www.hqcelebcorner.net',
            'cookie_path': '/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
        },
        'picturepub': {
            'base_url': 'https://picturepub.net',
            'cookie_path': '/opt/media-downloader/cookies/forum_cookies_PicturePub.json',
        },
    }
def __init__(self, unified_db, event_emitter=None, notifier=None, log_callback=None, websocket_manager=None, app_state=None):
    """Wire up logging, DB adapter, lazy per-platform clients, and progress tracking.

    Args:
        unified_db: Shared application database handle.
        event_emitter: Optional event bus (stored, not used in this chunk).
        notifier: Optional push-notification sender.
        log_callback: Optional callable that receives log lines.
        websocket_manager: Optional manager used to broadcast live sync events.
        app_state: Optional shared API-process state (legacy sync tracking).
    """
    self._init_logger('PaidContent', log_callback, default_module='Scraper')
    self._init_deferred_downloads()
    self.unified_db = unified_db
    self.db = PaidContentDBAdapter(unified_db)
    self.event_emitter = event_emitter
    self.notifier = notifier
    self.websocket_manager = websocket_manager
    self.app_state = app_state
    # API clients (initialized lazily with session cookies)
    self._clients: Dict[str, PaidContentAPIClient] = {}
    # YouTube client (uses yt-dlp)
    self._youtube_client: Optional[YouTubeClient] = None
    # Twitch client (uses yt-dlp)
    self._twitch_client: Optional[TwitchClient] = None
    # Fansly Direct client
    self._fansly_direct_client = None
    # OnlyFans Direct client
    self._onlyfans_direct_client = None
    # Pornhub client (uses yt-dlp)
    self._pornhub_client: Optional[PornhubClient] = None
    # XHamster client (uses yt-dlp)
    self._xhamster_client: Optional[XHamsterClient] = None
    # TikTok client (uses yt-dlp + gallery-dl)
    self._tiktok_client: Optional[TikTokClient] = None
    # Soundgasm + Liltsome client
    self._soundgasm_client: Optional[SoundgasmClient] = None
    # Bellazon forum client
    self._bellazon_client: Optional[BellazonClient] = None
    # BestEyeCandy client
    self._besteyecandy_client: Optional[BestEyeCandyClient] = None
    # Snapchat client
    self._snapchat_client: Optional[SnapchatPaidContentClient] = None
    # Reddit client (uses gallery-dl)
    self._reddit_client: Optional[RedditClient] = None
    # XenForo forum clients (hqcelebcorner, picturepub, etc.)
    self._xenforo_clients: Dict[str, XenForoForumClient] = {}
    # Coppermine gallery client
    self._coppermine_client: Optional[CoppermineClient] = None
    # Load config
    self.config = self.db.get_config()
    # Download settings
    self.max_concurrent_downloads = self.config.get('max_concurrent_downloads', 3)
    self.log(f"Max concurrent downloads: {self.max_concurrent_downloads}", 'info')
    # Initialize activity manager for database-backed progress tracking
    # This works across processes (scheduler, API, etc.)
    self.activity_manager = get_activity_manager(unified_db)
    # Keep app_state for backwards compatibility, but prefer activity_manager
    if self.app_state and not hasattr(self.app_state, 'active_paid_content_syncs'):
        self.app_state.active_paid_content_syncs = {}
    self.log(f"Scraper initialized", 'info')
def _is_creator_syncing(self, creator_id: int) -> bool:
    """Return True when an active sync task already exists for this creator."""
    expected_id = f"paid_content_sync_{creator_id}"
    try:
        for task in self.activity_manager.get_active_background_tasks():
            if task.get('task_id') == expected_id and task.get('task_type') == 'paid_content_sync':
                return True
    except Exception:
        # Treat lookup failures as "not syncing" — the caller will just proceed
        pass
    return False
def _register_active_sync(self, creator_id: int, data: Dict):
    """Register an active sync task for polling-based status updates.

    Written to activity_manager (database-backed, visible across processes)
    and mirrored into app_state for legacy API-process consumers.
    """
    task_id = f"paid_content_sync_{creator_id}"
    creator_name = data.get('username') or data.get('creator', 'Unknown')
    self.log(f"Registering active sync for creator {creator_id} ({creator_name})", 'info')
    # Persistent task payload: caller-supplied fields plus bookkeeping
    extra = dict(data)
    extra['creator_id'] = creator_id
    extra['started_at'] = datetime.now().isoformat()
    # Use activity_manager for database-backed tracking (works across processes)
    self.activity_manager.start_background_task(
        task_id=task_id,
        task_type="paid_content_sync",
        display_name=f"Sync: {creator_name}",
        status="Starting",
        extra_data=extra,
    )
    # Mirror into app_state for backwards compatibility (API process only)
    if self.app_state:
        if not hasattr(self.app_state, 'active_paid_content_syncs'):
            self.app_state.active_paid_content_syncs = {}
        self.app_state.active_paid_content_syncs[creator_id] = dict(extra)
    self.log(f"Registered sync for creator {creator_id}", 'info')
def _is_permanent_error(self, error: str) -> bool:
"""
Check if an error is permanent (should not be retried) vs retriable.
Permanent errors (don't retry):
- HTTP 500, 502, 503 server errors
- HTTP 404 not found
- HTTP 403 forbidden
- HTTP 410 gone
Retriable errors (auto-retry later):
- Timeouts / stalls
- Partial downloads
- Connection resets
- Network errors
"""
error_lower = error.lower()
# Permanent HTTP errors
permanent_patterns = [
'http 500', 'http 502', 'http 503', 'http 504',
'http 404', 'http 403', 'http 410',
'500 internal', '502 bad gateway', '503 service',
'404 not found', '403 forbidden', '410 gone',
'no video url', 'invalid url'
]
for pattern in permanent_patterns:
if pattern in error_lower:
return True
return False
def _update_active_sync(self, creator_id: int, updates: Dict):
    """Merge *updates* into the creator's active sync task.

    Existing extra_data is read back first so persistent fields set at
    registration time (username, platform, service, ...) survive partial
    updates. Also mirrors into app_state for backwards compatibility.
    """
    task_id = f"paid_content_sync_{creator_id}"
    # Fields the activity manager tracks explicitly
    status = updates.get('status', 'Running')
    phase = updates.get('phase', '')  # currently informational only
    progress_current = updates.get('progress')
    progress_total = updates.get('total_files')
    # Pull current extra_data so partial updates don't wipe persistent keys
    existing_extra = {}
    try:
        for task in self.activity_manager.get_active_background_tasks():
            if task.get('task_id') == task_id:
                existing_extra = task.get('extra_data', {}) or {}
                break
    except Exception:
        pass
    merged = dict(existing_extra)
    merged.update(updates)
    merged['updated_at'] = datetime.now().isoformat()
    # Database-backed update (visible across processes)
    self.activity_manager.update_background_task(
        task_id=task_id,
        detailed_status=status,
        progress_current=progress_current,
        progress_total=progress_total,
        extra_data=merged,
    )
    # Legacy in-process mirror (API process only)
    if self.app_state and hasattr(self.app_state, 'active_paid_content_syncs'):
        sync_map = self.app_state.active_paid_content_syncs
        if creator_id in sync_map:
            sync_map[creator_id].update(updates)
            sync_map[creator_id]['updated_at'] = datetime.now().isoformat()
def _unregister_active_sync(self, creator_id: int):
    """Mark the creator's sync task finished and drop the legacy app_state entry."""
    task_id = f"paid_content_sync_{creator_id}"
    self.log(f"Unregistering active sync for creator {creator_id}", 'info')
    # Database-backed stop (visible across processes)
    self.activity_manager.stop_background_task(task_id)
    # Clear the in-process mirror if it exists
    state = self.app_state
    if state and hasattr(state, 'active_paid_content_syncs'):
        state.active_paid_content_syncs.pop(creator_id, None)
    self.log(f"Unregistered sync for creator {creator_id}", 'info')
def _update_download_status(self, creator_id: int, total_files: int):
    """Push current download progress into the creator's active sync status."""
    downloads = self._active_downloads
    active_count = len(downloads)
    completed = self._download_progress['completed']
    if active_count == 0:
        status = f"Downloaded {completed}/{total_files} files"
    elif active_count == 1:
        # One file in flight — show detailed per-file byte progress
        dl = next(iter(downloads.values()))
        progress_str = self._format_bytes(dl['progress'])
        size = dl['size']
        if size:
            pct = int(dl['progress'] / size * 100)
            status = f"Downloading: {dl['name'][:40]} ({progress_str}/{self._format_bytes(size)} - {pct}%)"
        else:
            status = f"Downloading: {dl['name'][:40]} ({progress_str})"
    else:
        # Several files in flight — aggregate byte totals into a summary
        total_progress = sum(d['progress'] for d in downloads.values())
        total_size = sum(d['size'] or 0 for d in downloads.values())
        progress_str = self._format_bytes(total_progress)
        if total_size > 0:
            status = f"Downloading {active_count} files: {progress_str}/{self._format_bytes(total_size)}"
        else:
            status = f"Downloading {active_count} files: {progress_str}"
    # Snapshot of active downloads for the UI
    active_list = [
        {'name': d['name'], 'size': d['size'], 'progress': d['progress']}
        for d in downloads.values()
    ]
    self._update_active_sync(creator_id, {
        'phase': 'downloading',
        'status': status,
        'active_downloads': active_list,
        'active_count': active_count,
        'downloaded': self._download_progress['success'],
        'failed': self._download_progress['failed'],
        'total_files': total_files,
        'progress': completed
    })
def _emit_event(self, event_type: str, data: Dict):
    """Broadcast a WebSocket event for real-time UI updates.

    When websocket_manager is not available (e.g. running from the
    scheduler) this is a no-op; status remains reachable by polling
    /dashboard/active-syncs, which reads activity_manager state.
    """
    if not self.websocket_manager:
        # Status updates are still tracked via activity_manager and
        # available through polling endpoints.
        return
    try:
        self.log(f"Emitting WebSocket event: {event_type}", 'debug')
        payload = dict(data)
        payload['timestamp'] = datetime.now().isoformat()
        self.websocket_manager.broadcast_sync({
            'type': event_type,
            'data': payload
        })
    except Exception as e:
        self.log(f"Failed to emit event {event_type}: {e}", 'warning')
def _get_client(self, service_id: str) -> PaidContentAPIClient:
    """Return a cached API client for *service_id*, creating it on first use."""
    client = self._clients.get(service_id)
    if client is None:
        # Pull session cookie / base URL from the service's DB record, if any
        service = self.db.get_service(service_id) or {}
        session_cookie = service.get('session_cookie')
        base_url = service.get('base_url')
        self.log(f"Creating API client for {service_id}: base_url={base_url}, has_cookie={session_cookie is not None}", 'debug')
        client = PaidContentAPIClient(
            service_id,
            session_cookie=session_cookie,
            base_url=base_url,
            log_callback=self.log_callback,
        )
        self._clients[service_id] = client
    return client
def _get_youtube_client(self) -> YouTubeClient:
    """Return the shared YouTube client, creating it on first use.

    Tries to read a YouTube Data API key from the 'ytdlp' scraper's stored
    settings; the client is still created without one.
    """
    if self._youtube_client is not None:
        return self._youtube_client
    api_key = None
    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT settings_json FROM scrapers WHERE id = ?", ('ytdlp',))
            row = cursor.fetchone()
            if row and row[0]:
                api_key = json.loads(row[0]).get('youtube_api_key') or None
    except Exception as e:
        # Best-effort: missing settings just mean no API key
        self.log(f"Could not load YouTube API key from settings: {e}", 'debug')
    client = YouTubeClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
        api_key=api_key
    )
    if not client.is_available():
        self.log("yt-dlp not found, YouTube support will be disabled", 'warning')
    self._youtube_client = client
    return client
def _get_twitch_client(self) -> TwitchClient:
    """Return the shared Twitch client (yt-dlp backed), creating it lazily."""
    if self._twitch_client is not None:
        return self._twitch_client
    client = TwitchClient(unified_db=self.unified_db, log_callback=self.log_callback)
    if not client.is_available():
        self.log("yt-dlp not found, Twitch support will be disabled", 'warning')
    self._twitch_client = client
    return client
def _get_pornhub_client(self) -> PornhubClient:
    """Return the shared Pornhub client (yt-dlp backed), creating it lazily."""
    if self._pornhub_client is not None:
        return self._pornhub_client
    client = PornhubClient(unified_db=self.unified_db, log_callback=self.log_callback)
    if not client.is_available():
        self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')
    self._pornhub_client = client
    return client
def _get_xhamster_client(self) -> XHamsterClient:
    """Return the shared XHamster client (yt-dlp backed), creating it lazily."""
    if self._xhamster_client is not None:
        return self._xhamster_client
    client = XHamsterClient(unified_db=self.unified_db, log_callback=self.log_callback)
    if not client.is_available():
        self.log("yt-dlp not found, XHamster support will be disabled", 'warning')
    self._xhamster_client = client
    return client
def _get_tiktok_client(self) -> TikTokClient:
    """Return the shared TikTok client (yt-dlp + gallery-dl), creating it lazily."""
    if self._tiktok_client is not None:
        return self._tiktok_client
    client = TikTokClient(unified_db=self.unified_db, log_callback=self.log_callback)
    if not client.is_available():
        self.log("yt-dlp/gallery-dl not found, TikTok support will be disabled", 'warning')
    self._tiktok_client = client
    return client
def _get_soundgasm_client(self) -> SoundgasmClient:
    """Return the shared Soundgasm + Liltsome client, creating it on first use."""
    client = self._soundgasm_client
    if client is None:
        client = SoundgasmClient(log_callback=self.log_callback)
        self._soundgasm_client = client
    return client
def _get_bellazon_client(self) -> BellazonClient:
    """Return the shared Bellazon forum client, creating it on first use."""
    client = self._bellazon_client
    if client is None:
        client = BellazonClient(log_callback=self.log_callback)
        self._bellazon_client = client
    return client
def _get_besteyecandy_client(self) -> BestEyeCandyClient:
    """Return the shared BestEyeCandy client, creating it on first use."""
    client = self._besteyecandy_client
    if client is None:
        client = BestEyeCandyClient(
            unified_db=self.unified_db, log_callback=self.log_callback)
        self._besteyecandy_client = client
    return client
def _get_snapchat_client(self) -> SnapchatPaidContentClient:
    """Return the shared Snapchat client, creating it on first use."""
    client = self._snapchat_client
    if client is None:
        client = SnapchatPaidContentClient(
            unified_db=self.unified_db, log_callback=self.log_callback)
        self._snapchat_client = client
    return client
def _get_reddit_client(self) -> RedditClient:
    """Return the shared Reddit client (gallery-dl backed), creating it lazily."""
    client = self._reddit_client
    if client is None:
        client = RedditClient(
            unified_db=self.unified_db, log_callback=self.log_callback)
        self._reddit_client = client
    return client
def _get_xenforo_client(self, service_id: str) -> XenForoForumClient:
    """Return the cached XenForo forum client for *service_id*.

    Built on first use from the XENFORO_FORUMS class config; raises KeyError
    for unknown service ids (same as the original lookup).
    """
    client = self._xenforo_clients.get(service_id)
    if client is None:
        cfg = self.XENFORO_FORUMS[service_id]
        client = XenForoForumClient(
            service_id=service_id,
            base_url=cfg['base_url'],
            cookie_path=cfg['cookie_path'],
            log_callback=self.log_callback,
        )
        self._xenforo_clients[service_id] = client
    return client
def _get_coppermine_client(self) -> CoppermineClient:
    """Return the shared Coppermine gallery client, creating it on first use."""
    client = self._coppermine_client
    if client is None:
        client = CoppermineClient(log_callback=self.log_callback)
        self._coppermine_client = client
    return client
async def close(self):
    """Close all service API clients and clean up helper-client temp files."""
    for client in self._clients.values():
        await client.close()
    self._clients.clear()
    # yt-dlp/gallery-dl based helpers may hold temp files; clean any that
    # were actually created (same order as before: YouTube, Twitch,
    # Pornhub, XHamster, TikTok).
    for helper in (
        self._youtube_client,
        self._twitch_client,
        self._pornhub_client,
        self._xhamster_client,
        self._tiktok_client,
    ):
        if helper:
            helper.cleanup()
async def _cache_profile_image(self, url: str, platform: str, creator_id: str, image_type: str) -> Optional[str]:
    """Download a profile image and cache it locally.

    Args:
        url: Remote image URL to download
        platform: Platform name (instagram, onlyfans, etc.)
        creator_id: Creator's ID on the platform
        image_type: 'avatar' or 'banner'

    Returns:
        Local serving URL like
        /api/paid-content/cache/profile-image/instagram_username_avatar.jpg
        or None on failure.
    """
    if not url:
        return None
    cache_dir = Path(__file__).parent.parent.parent / 'data' / 'cache' / 'profile_images'
    cache_dir.mkdir(parents=True, exist_ok=True)
    # Determine extension from the URL path; default to .jpg
    ext = '.jpg'
    parsed_path = urlparse(url).path.lower()
    if '.png' in parsed_path:
        ext = '.png'
    elif '.webp' in parsed_path:
        ext = '.webp'
    elif '.gif' in parsed_path:
        ext = '.gif'
    # Sanitize creator_id so it is safe to embed in a filename
    safe_id = re.sub(r'[^\w.-]', '_', str(creator_id))
    filename = f"{platform}_{safe_id}_{image_type}{ext}"
    filepath = cache_dir / filename
    try:
        # Instagram needs curl_cffi due to CDN restrictions
        if platform == 'instagram':
            downloaded = await asyncio.to_thread(
                self._download_with_instagram_session, url, filepath
            )
            if downloaded:
                # FIX: interpolate the cached filename (was a garbled literal)
                return f"/api/paid-content/cache/profile-image/{filename}"
            return None
        # All other platforms: plain aiohttp GET with browser-like headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                if resp.status == 200:
                    data = await resp.read()
                    filepath.write_bytes(data)
                    # FIX: interpolate the cached filename (was a garbled literal)
                    return f"/api/paid-content/cache/profile-image/{filename}"
    except Exception as e:
        # Caching is best-effort; callers fall back to the remote URL
        self.log(f"Failed to cache {image_type} for {platform}/{creator_id}: {e}", 'debug')
    # Non-200 response or any failure above lands here
    return None
def _download_with_instagram_session(self, url: str, filepath: Path) -> bool:
    """Fetch *url* through the ImgInn session and write it to *filepath*.

    Synchronous by design — it is invoked via asyncio.to_thread. Returns
    True only when the image body was written successfully.
    """
    try:
        adapter = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
        client = adapter._get_client()
        response = client.session.get(url, headers=client._default_headers, timeout=30)
        if response.status_code == 200:
            filepath.write_bytes(response.content)
            return True
        self.log(f"Instagram avatar download HTTP {response.status_code}", 'debug')
    except Exception as e:
        self.log(f"Instagram session download failed: {e}", 'debug')
    return False
async def sync_creator(self, creator_id: int, download: bool = True, scheduled: bool = False,
                       force_backfill: bool = False) -> SyncResult:
    """Sync a single creator - fetch new posts and optionally queue downloads
    Args:
        creator_id: The creator's database ID
        download: Whether to download files after syncing
        scheduled: If True, create notifications (for scheduled syncs only)
        force_backfill: If True, run Pullpush historical backfill even if not first sync (Reddit only)
    Returns:
        SyncResult summarizing new posts/attachments and download counts.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return SyncResult(success=False, error="Creator not found")
    # Non-Coomer/Kemono services each have a dedicated sync path;
    # dispatch by service_id before the generic flow below.
    # Handle YouTube channels separately
    if creator['service_id'] == 'youtube':
        return await self._sync_youtube_creator(creator, download, scheduled=scheduled)
    # Handle Twitch channels separately
    if creator['service_id'] == 'twitch':
        return await self._sync_twitch_creator(creator, download, scheduled=scheduled)
    # Handle Fansly Direct separately
    if creator['service_id'] == 'fansly_direct':
        return await self._sync_fansly_direct_creator(creator, download, scheduled=scheduled)
    # Handle OnlyFans Direct separately
    if creator['service_id'] == 'onlyfans_direct':
        return await self._sync_onlyfans_direct_creator(creator, download, scheduled=scheduled)
    # Handle Pornhub creators
    if creator['service_id'] == 'pornhub':
        return await self._sync_pornhub_creator(creator, download, scheduled=scheduled)
    # Handle XHamster creators
    if creator['service_id'] == 'xhamster':
        return await self._sync_xhamster_creator(creator, download, scheduled=scheduled)
    # Handle TikTok creators
    if creator['service_id'] == 'tiktok':
        return await self._sync_tiktok_creator(creator, download, scheduled=scheduled)
    # Handle Instagram creators
    if creator['service_id'] == 'instagram':
        return await self._sync_instagram_creator(creator, download, scheduled=scheduled, force_backfill=force_backfill)
    # Handle Soundgasm creators
    if creator['service_id'] == 'soundgasm':
        return await self._sync_soundgasm_creator(creator, download, scheduled=scheduled)
    # Handle BestEyeCandy creators
    if creator['service_id'] == 'besteyecandy':
        return await self._sync_besteyecandy_creator(creator, download, scheduled=scheduled)
    # Handle Bellazon creators
    if creator['service_id'] == 'bellazon':
        return await self._sync_bellazon_creator(creator, download, scheduled=scheduled)
    # Handle Snapchat creators
    if creator['service_id'] == 'snapchat':
        return await self._sync_snapchat_creator(creator, download, scheduled=scheduled)
    # Handle Reddit subreddits
    if creator['service_id'] == 'reddit':
        return await self._sync_reddit_creator(creator, download, scheduled=scheduled,
                                               force_backfill=force_backfill)
    # Handle Coppermine gallery creators
    if creator['service_id'] == 'coppermine':
        return await self._sync_coppermine_creator(creator, download, scheduled=scheduled)
    # Handle XenForo forum creators (HQCelebCorner, PicturePub, etc.)
    if creator['service_id'] in self.XENFORO_FORUMS:
        return await self._sync_xenforo_creator(creator, download, scheduled=scheduled)
    self.log(f"Syncing creator: {creator['username']} ({creator['platform']})", 'info')
    # Register active sync for polling-based updates
    sync_data = {
        'username': creator['username'],
        'platform': creator['platform'],
        'service': creator['service_id'],
        'status': 'Fetching posts...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    # Also emit WebSocket event (for clients that support it)
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data
    })
    try:
        client = self._get_client(creator['service_id'])
        # Fetch and update creator profile (display name, avatar, banner)
        # Best-effort: a failure here must not abort the whole sync.
        try:
            creator_info = await client.get_creator(creator['platform'], creator['creator_id'])
            if creator_info:
                profile_updates = {}
                if creator_info.display_name:
                    profile_updates['display_name'] = creator_info.display_name
                if creator_info.profile_image_url:
                    cached = await self._cache_profile_image(creator_info.profile_image_url, creator['platform'], creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or creator_info.profile_image_url
                if creator_info.banner_image_url:
                    cached = await self._cache_profile_image(creator_info.banner_image_url, creator['platform'], creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or creator_info.banner_image_url
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update creator profile: {e}", 'warning')
        # Fetch posts since last check with progress callback
        since_date = creator.get('last_post_date')
        def progress_callback(page: int, total_posts: int):
            # Update polling-based status
            self._update_active_sync(creator_id, {
                'status': f'Fetched {total_posts} posts (page {page})...',
                'posts_fetched': total_posts,
                'page': page
            })
            # Also emit WebSocket event
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {total_posts} posts (page {page})...',
                'phase': 'fetching',
                'posts_fetched': total_posts,
                'page': page
            })
        posts = await client.get_all_creator_posts(
            creator['platform'],
            creator['creator_id'],
            since_date=since_date,
            progress_callback=progress_callback
        )
        if not posts:
            self.log(f"No new posts for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            # Still download any pending attachments even if no new posts
            downloaded = 0
            failed = 0
            if download and creator.get('auto_download', True):
                pending_count = self.db.get_pending_attachment_count(creator_id)
                if pending_count > 0:
                    self.log(f"Downloading {pending_count} pending attachments for {creator['username']}", 'info')
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {pending_count} pending files...',
                        'phase': 'downloading',
                        'total_files': pending_count
                    })
                    result = await self.download_pending_for_creator(creator_id)
                    downloaded = result.get('downloaded', 0)
                    failed = result.get('failed', 0)
            # Unregister from active syncs
            self._unregister_active_sync(creator_id)
            # Emit completed event
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': downloaded,
                'failed': failed
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0, downloaded_files=downloaded, failed_files=failed)
        self.log(f"Found {len(posts)} new posts for {creator['username']}", 'info')
        # Update polling status and emit processing event
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing',
            'total_posts': len(posts)
        })
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id,
            'username': creator['username'],
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing',
            'total_posts': len(posts)
        })
        new_posts = 0
        new_attachments = 0
        for i, post in enumerate(posts):
            # Fetch full post content (list endpoint only returns truncated 'substring')
            full_post = await client.get_post(creator['platform'], creator['creator_id'], post.post_id)
            if full_post:
                # Use full content from individual post endpoint
                post.content = full_post.content
                # Also update attachments if the full post has more details
                if full_post.attachments:
                    post.attachments = full_post.attachments
            # Update progress every 10 posts to limit status-write volume
            if (i + 1) % 10 == 0:
                self._update_active_sync(creator_id, {
                    'status': f'Processing post {i + 1}/{len(posts)}...',
                    'phase': 'processing'
                })
            # Insert/update post in database
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                # Insert attachments
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                # Extract and store embedded URLs
                embeds = self._extract_embeds(post.content)
                for embed_url, platform, video_id in embeds:
                    self.db.upsert_embed(post_db_id, {
                        'url': embed_url,
                        'platform': platform,
                        'video_id': video_id
                    })
                self._apply_auto_tag_rules(post_db_id, is_new_post)
        # Update creator stats - find the actual newest post date (posts may not be sorted by date)
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })
        # Download if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self.download_pending_for_creator(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])
        # Unregister from active syncs
        self._unregister_active_sync(creator_id)
        # Emit completed event
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed
        })
        # Send push notification for new downloads
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )
    except Exception as e:
        self.log(f"Error syncing {creator['username']}: {e}", 'error')
        # Unregister from active syncs
        self._unregister_active_sync(creator_id)
        # Emit error event
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
async def sync_all_creators(self, enabled_only: bool = True, scheduled: bool = False, download: bool = True) -> Dict[int, SyncResult]:
    """Sync all enabled creators - YouTube runs in parallel with Coomer/Kemono
    Args:
        enabled_only: Only sync enabled creators
        scheduled: If True, create notifications (for scheduled syncs only)
        download: If True, download files after syncing. If False, only sync metadata.
    Returns:
        Mapping of creator DB id -> SyncResult for every creator attempted.
    """
    creators = self.db.get_creators(enabled_only=enabled_only)
    # Separate creators by service type for parallel execution
    youtube_creators = [c for c in creators if c['service_id'] == 'youtube']
    pornhub_creators = [c for c in creators if c['service_id'] == 'pornhub']
    reddit_creators = [c for c in creators if c['service_id'] == 'reddit']
    other_creators = [c for c in creators if c['service_id'] not in ('youtube', 'pornhub', 'reddit')]
    self.log(f"Syncing {len(creators)} creators ({len(youtube_creators)} YouTube, {len(pornhub_creators)} Pornhub, {len(reddit_creators)} Reddit, {len(other_creators)} other)", 'info')
    # Run each service group in parallel
    tasks = []
    if youtube_creators:
        tasks.append(self._sync_creator_group(youtube_creators, 'YouTube', scheduled=scheduled, download=download))
    if pornhub_creators:
        tasks.append(self._sync_creator_group(pornhub_creators, 'Pornhub', scheduled=scheduled, download=download))
    if reddit_creators:
        tasks.append(self._sync_creator_group(reddit_creators, 'Reddit', scheduled=scheduled, download=download))
    if other_creators:
        tasks.append(self._sync_creator_group(other_creators, 'Coomer/Kemono', scheduled=scheduled, download=download))
    if not tasks:
        return {}
    # Wait for all service groups to complete (exceptions are returned, not raised)
    group_results = await asyncio.gather(*tasks, return_exceptions=True)
    # Merge results
    results = {}
    total_new_posts = 0
    total_new_files = 0
    all_downloaded_files = []
    for group_result in group_results:
        if isinstance(group_result, Exception):
            self.log(f"Error in sync group: {group_result}", 'error')
            continue
        for creator_id, result in group_result.items():
            results[creator_id] = result
            total_new_posts += result.new_posts
            total_new_files += result.new_attachments
            # Collect downloaded file info for notifications
            if result.downloaded_file_info:
                all_downloaded_files.extend(result.downloaded_file_info)
    # Note: Per-creator notifications are sent in individual sync methods
    # Skip aggregate notification to avoid duplicates
    # self._send_sync_notification(total_new_posts, total_new_files, all_downloaded_files)
    self.log(f"Sync complete: {total_new_posts} new posts, {total_new_files} new files", 'info')
    return results
async def _sync_creator_group(self, creators: List[Dict], group_name: str, scheduled: bool = False, download: bool = True) -> Dict[int, SyncResult]:
"""Sync a group of creators sequentially (used for parallel service groups)"""
results = {}
self.log(f"Starting {group_name} sync group ({len(creators)} creators)", 'info')
# Crash recovery checkpoint
from modules.task_checkpoint import TaskCheckpoint
checkpoint = TaskCheckpoint(f'paid_content:{group_name}', 'background')
checkpoint.start(total_items=len(creators))
if checkpoint.is_recovering():
self.log(f"{group_name}: recovering — skipping already-synced creators", 'info')
for creator in creators:
creator_id = str(creator['id'])
if checkpoint.is_completed(creator_id):
continue
# Skip creators that already have an active sync running
if self._is_creator_syncing(creator['id']):
self.log(f"Skipping {creator['username']} — already syncing", 'info')
checkpoint.mark_completed(creator_id)
continue
checkpoint.set_current(creator_id)
try:
result = await self.sync_creator(creator['id'], download=download, scheduled=scheduled)
results[creator['id']] = result
except Exception as e:
self.log(f"Error syncing {creator['username']}: {e}", 'error')
results[creator['id']] = SyncResult(success=False, error=str(e))
checkpoint.mark_completed(creator_id)
checkpoint.finish()
self.log(f"Completed {group_name} sync group", 'info')
return results
    async def _sync_youtube_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
        """Sync a YouTube channel - fetch new videos and optionally download them.

        Args:
            creator: Creator row dict; reads 'id', 'username', 'creator_id'
                (stores the channel ID or handle), 'last_post_date' and
                'auto_download'.
            download: When True, drain the pending-download queue after the
                metadata sync (also gated by the creator's auto_download flag).
            scheduled: When True this is a periodic sync: only the last 3 days
                are scanned and at most 20 videos are fetched.

        Returns:
            SyncResult with new post/attachment counts and download totals,
            or success=False with an error message on failure.
        """
        creator_id = creator['id']
        self.log(f"Syncing YouTube channel: {creator['username']}", 'info')
        # Register active sync for polling-based updates
        sync_data = {
            'username': creator['username'],
            'platform': 'youtube',
            'service': 'youtube',
            'status': 'Fetching videos...',
            'phase': 'fetching',
            'failed': 0
        }
        self._register_active_sync(creator_id, sync_data)
        # Emit WebSocket event
        self._emit_event('paid_content_sync_started', {
            'creator_id': creator_id,
            **sync_data
        })
        try:
            youtube = self._get_youtube_client()
            if not youtube.is_available():
                error = "yt-dlp not available"
                self._unregister_active_sync(creator_id)
                return SyncResult(success=False, error=error)
            # Build channel URL from creator_id (which stores the channel ID or handle)
            channel_url = youtube.normalize_channel_url(creator['creator_id'])
            # Fetch and update creator profile (display name, avatar, banner)
            # Best-effort: any profile failure is logged but never aborts the sync.
            try:
                profile_updates = {}
                # Get channel name from yt-dlp
                channel_info = await youtube.get_channel_info(channel_url)
                if channel_info and channel_info.get('channel_name'):
                    profile_updates['display_name'] = channel_info['channel_name']
                # Get avatar by scraping the page (yt-dlp doesn't provide it)
                avatar_url = await youtube.get_channel_avatar(channel_url)
                if avatar_url:
                    # Prefer the locally cached copy; fall back to the remote URL.
                    cached = await self._cache_profile_image(avatar_url, 'youtube', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or avatar_url
                # Get banner by scraping the page
                banner_url = await youtube.get_channel_banner(channel_url)
                if banner_url:
                    cached = await self._cache_profile_image(banner_url, 'youtube', creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or banner_url
                # Get metadata (bio, joined date, location, external links) by scraping the page
                metadata = await youtube.get_channel_metadata(channel_url)
                if metadata:
                    if metadata.get('bio'):
                        profile_updates['bio'] = metadata['bio']
                    if metadata.get('joined_date'):
                        profile_updates['joined_date'] = metadata['joined_date']
                    if metadata.get('location'):
                        profile_updates['location'] = metadata['location']
                    if metadata.get('external_links'):
                        profile_updates['external_links'] = metadata['external_links']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated YouTube creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
            except Exception as e:
                self.log(f"Failed to update YouTube creator profile: {e}", 'warning')
            # Fetch videos since last check with progress callback
            # Scheduled syncs only check last 3 days for efficiency
            from datetime import timedelta
            if scheduled:
                since_date = (datetime.now() - timedelta(days=3)).isoformat()
            else:
                since_date = creator.get('last_post_date')

            def progress_callback(count: int):
                # Mirror fetch progress to both the polled sync state and WebSocket.
                self._update_active_sync(creator_id, {
                    'status': f'Fetched {count} videos...',
                    'posts_fetched': count
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'status': f'Fetched {count} videos...',
                    'phase': 'fetching',
                    'posts_fetched': count
                })
            # Get known video IDs from DB so the members-only scan
            # doesn't re-fetch videos we already have
            known_video_ids = set()
            try:
                with self.unified_db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                        (creator_id,)
                    )
                    known_video_ids = {row[0] for row in cursor.fetchall()}
            except Exception:
                # Non-fatal: an empty set just means known videos may be re-fetched.
                pass
            # Get videos as Post objects
            # For scheduled syncs, limit to 20 videos max (recent content only)
            max_videos = 20 if scheduled else None
            posts = await youtube.get_posts(
                channel_url,
                since_date=since_date,
                max_videos=max_videos,
                progress_callback=progress_callback,
                known_video_ids=known_video_ids
            )
            if not posts:
                # Nothing new: record the check time and report a clean, empty sync.
                self.log(f"No new videos for {creator['username']}", 'debug')
                self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
                self._unregister_active_sync(creator_id)
                self._emit_event('paid_content_sync_completed', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'new_posts': 0,
                    'new_attachments': 0,
                    'downloaded': 0,
                    'failed': 0
                })
                return SyncResult(success=True, new_posts=0, new_attachments=0)
            self.log(f"Found {len(posts)} new videos for {creator['username']}", 'info')
            # Update status
            self._update_active_sync(creator_id, {
                'status': f'Processing {len(posts)} videos...',
                'phase': 'processing',
                'total_posts': len(posts)
            })
            new_posts = 0
            new_attachments = 0
            for post in posts:
                # Insert/update post in database
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
                if post_db_id:
                    if is_new_post:
                        new_posts += 1
                    # Insert video attachment
                    for idx, attachment in enumerate(post.attachments):
                        att_data = attachment.to_dict()
                        att_data['attachment_index'] = idx
                        if self.db.upsert_attachment(post_db_id, att_data):
                            new_attachments += 1
                    # Apply auto tags from source (e.g. "Members Only" for subscriber-only videos)
                    for tag_name in getattr(post, 'auto_tags', []):
                        # Look up by slugified name; create the tag on first use.
                        tag = self.db.get_tag_by_slug(tag_name.lower().replace(' ', '-'))
                        if not tag:
                            tag_id = self.db.create_tag(tag_name, color='#8b5cf6', description='Auto-applied by YouTube sync')
                        else:
                            tag_id = tag['id']
                        if tag_id:
                            self.db.add_tag_to_post(post_db_id, tag_id)
                    self._apply_auto_tag_rules(post_db_id, is_new_post)
            # Update creator stats - find the actual newest post date (posts may not be sorted by date)
            latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
            self.db.update_creator(creator_id, {
                'last_checked': datetime.now().isoformat(),
                'last_post_date': latest_post_date or creator.get('last_post_date'),
                'post_count': self.db.get_creator_post_count(creator_id)
            })
            # Download videos if enabled
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            if download and creator.get('auto_download', True):
                result = await self._download_youtube_videos(creator_id)
                downloaded = result.get('downloaded', 0)
                failed = result.get('failed', 0)
                downloaded_file_info = result.get('downloaded_file_info', [])
            # Unregister from active syncs
            self._unregister_active_sync(creator_id)
            # Emit completed event
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': new_posts,
                'new_attachments': new_attachments,
                'downloaded': downloaded,
                'failed': failed
            })
            # Send push notification for new downloads
            self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
            return SyncResult(
                success=True,
                new_posts=new_posts,
                new_attachments=new_attachments,
                downloaded_files=downloaded,
                failed_files=failed,
                downloaded_file_info=downloaded_file_info
            )
        except Exception as e:
            # Any unexpected failure: tear down the active-sync entry and surface
            # the error both over WebSocket and in the returned SyncResult.
            self.log(f"Error syncing YouTube channel {creator['username']}: {e}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id,
                'username': creator['username'],
                'error': str(e)
            })
            return SyncResult(success=False, error=str(e))
async def _download_youtube_videos(self, creator_id: int) -> Dict:
"""Download pending YouTube videos using yt-dlp"""
creator = self.db.get_creator(creator_id)
if not creator:
return {'downloaded': 0, 'failed': 0}
pending = self.db.get_pending_attachments(creator_id=creator_id)
if not pending:
return {'downloaded': 0, 'failed': 0}
youtube = self._get_youtube_client()
if not youtube.is_available():
return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp not available'}
self.log(f"Downloading {len(pending)} YouTube videos for {creator['username']}", 'info')
# Update status
self._update_active_sync(creator_id, {
'phase': 'downloading',
'status': f'Downloading {len(pending)} videos...',
'total_files': len(pending),
'downloaded': 0
})
base_path = Path(self.config.get('base_download_path', '/paid-content'))
quality = self.config.get('embed_quality', 'best')
downloaded = 0
failed = 0
downloaded_file_info = []
for i, att in enumerate(pending):
try:
post = self.db.get_post(att['post_id'])
if not post:
self.log(f"Post not found for attachment {att.get('id')}", 'warning')
failed += 1
continue
# Build output directory
published_at = post.get('published_at') or ''
post_date = published_at[:10] if published_at else 'unknown-date'
output_dir = base_path / 'youtube' / self._sanitize_filename(creator['username']) / post_date
# Video URL is stored in download_url
video_url = att.get('download_url')
if not video_url:
self.log(f"No download URL for attachment {att.get('id')}, {att.get('name')}", 'warning')
self.db.update_attachment_status(att['id'], 'failed',
error_message='No video URL'
)
failed += 1
continue
# Update status
self._update_active_sync(creator_id, {
'status': f'Downloading video {i + 1}/{len(pending)}: {att.get("name", "")[:40]}...',
'downloaded': downloaded
})
self.db.update_attachment_status(att['id'], 'downloading')
# Download using yt-dlp to temp dir first (avoids mergerfs .part rename issues)
import tempfile, shutil
tmp_dir = tempfile.mkdtemp(prefix='ytdlp_')
try:
result = await youtube.download_video(video_url, Path(tmp_dir), quality=quality)
# Move completed file to final destination
if result and result.get('success') and result.get('file_path'):
tmp_file = Path(result['file_path'])
if tmp_file.exists():
output_dir.mkdir(parents=True, exist_ok=True)
final_path = output_dir / tmp_file.name
shutil.move(str(tmp_file), str(final_path))
result['file_path'] = str(final_path)
result['file_size'] = final_path.stat().st_size
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)
if not result:
self.log(f"No result from yt-dlp for {att.get('name')}", 'warning')
self.db.update_attachment_status(att['id'], 'failed',
error_message='yt-dlp returned no result',
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
continue
if result.get('success'):
file_path = result.get('file_path')
file_size = result.get('file_size', 0)
# Download YouTube thumbnail instead of generating from video
thumbnail_data = None
video_id = post.get('post_id') # post_id is the YouTube video ID
if video_id:
thumbnail_data = await self._download_youtube_thumbnail(video_id)
if thumbnail_data:
self.log(f"Downloaded YouTube thumbnail for {att.get('name', 'video')} ({len(thumbnail_data)} bytes)", 'debug')
# Extract video dimensions
width, height, duration = None, None, None
if file_path:
width, height, duration = self._extract_dimensions(Path(file_path), 'video')
if width and height:
self.log(f"Extracted dimensions for {att.get('name', 'video')}: {width}x{height}, {duration}s", 'debug')
self.db.update_attachment_status(att['id'], 'completed',
local_path=file_path,
local_filename=Path(file_path).name if file_path else None,
file_size=file_size,
width=width,
height=height,
duration=duration,
thumbnail_data=thumbnail_data,
downloaded_at=datetime.now().isoformat()
)
# Update post as downloaded if all attachments are done
self.db.mark_post_downloaded(att['post_id'])
# Update creator stats
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
downloaded += 1
self.log(f"Downloaded: {att.get('name', 'video')}", 'debug')
# Collect file info for notifications
if file_path:
downloaded_file_info.append({
'file_path': file_path,
'filename': Path(file_path).name if file_path else None,
'source': creator['username'],
'content_type': att.get('file_type', 'video')
})
else:
error = result.get('error', 'Unknown error')
self.db.update_attachment_status(att['id'], 'failed',
error_message=error,
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
self.log(f"Failed to download {att.get('name', 'video')}: {error}", 'warning')
except Exception as e:
self.log(f"Error downloading YouTube video: {e}", 'error')
self.db.update_attachment_status(att['id'], 'failed',
error_message=str(e),
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
    async def _sync_pornhub_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
        """Sync a Pornhub creator - fetch new videos and optionally download them.

        Args:
            creator: Creator row dict; reads 'id', 'username', 'creator_id'
                (stores type/name used to build the profile URL),
                'last_post_date' and 'auto_download'.
            download: When True, drain the pending-download queue after the
                metadata sync (also gated by the creator's auto_download flag).
            scheduled: When True this is a periodic sync: only the last 3 days
                are scanned and at most 20 videos are fetched.

        Returns:
            SyncResult with new post/attachment counts and download totals,
            or success=False with an error message on failure.
        """
        creator_id = creator['id']
        self.log(f"Syncing Pornhub creator: {creator['username']}", 'info')
        # Register active sync for polling-based updates
        sync_data = {
            'username': creator['username'],
            'platform': 'pornhub',
            'service': 'pornhub',
            'status': 'Fetching videos...',
            'phase': 'fetching',
            'failed': 0
        }
        self._register_active_sync(creator_id, sync_data)
        # Emit WebSocket event
        self._emit_event('paid_content_sync_started', {
            'creator_id': creator_id,
            **sync_data
        })
        try:
            pornhub = self._get_pornhub_client()
            if not pornhub.is_available():
                error = "yt-dlp not available"
                self._unregister_active_sync(creator_id)
                return SyncResult(success=False, error=error)
            # Build creator URL from creator_id (which stores type/name)
            creator_url = pornhub.normalize_creator_url(creator['creator_id'])
            # Fetch and update creator profile
            # Best-effort: any profile failure is logged but never aborts the sync.
            try:
                profile_updates = {}
                # Get profile image
                avatar_url = await pornhub.get_profile_image(creator_url)
                if avatar_url:
                    # Prefer the locally cached copy; fall back to the remote URL.
                    cached = await self._cache_profile_image(avatar_url, 'pornhub', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or avatar_url
                # Get banner
                banner_url = await pornhub.get_profile_banner(creator_url)
                if banner_url:
                    cached = await self._cache_profile_image(banner_url, 'pornhub', creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or banner_url
                # Get bio
                bio = await pornhub.get_profile_bio(creator_url)
                if bio:
                    profile_updates['bio'] = bio
                # Get joined/career start date
                joined_date = await pornhub.get_joined_date(creator_url)
                if joined_date:
                    profile_updates['joined_date'] = joined_date
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated Pornhub creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
            except Exception as e:
                self.log(f"Failed to update Pornhub creator profile: {e}", 'warning')
            # Fetch videos since last check with progress callback
            from datetime import timedelta
            if scheduled:
                since_date = (datetime.now() - timedelta(days=3)).isoformat()
            else:
                since_date = creator.get('last_post_date')

            def progress_callback(count: int):
                # Mirror fetch progress to both the polled sync state and WebSocket.
                self._update_active_sync(creator_id, {
                    'status': f'Fetched {count} videos...',
                    'posts_fetched': count
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'status': f'Fetched {count} videos...',
                    'phase': 'fetching',
                    'posts_fetched': count
                })
            # Get videos as Post objects
            max_videos = 20 if scheduled else None
            posts = await pornhub.get_posts(
                creator_url,
                since_date=since_date,
                max_videos=max_videos,
                progress_callback=progress_callback
            )
            if not posts:
                # Nothing new: record the check time and report a clean, empty sync.
                self.log(f"No new videos for {creator['username']}", 'debug')
                self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
                self._unregister_active_sync(creator_id)
                self._emit_event('paid_content_sync_completed', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'new_posts': 0,
                    'new_attachments': 0,
                    'downloaded': 0,
                    'failed': 0
                })
                return SyncResult(success=True, new_posts=0, new_attachments=0)
            self.log(f"Found {len(posts)} new videos for {creator['username']}", 'info')
            # Update status
            self._update_active_sync(creator_id, {
                'status': f'Processing {len(posts)} videos...',
                'phase': 'processing',
                'total_posts': len(posts)
            })
            new_posts = 0
            new_attachments = 0
            for post in posts:
                # Insert/update post in database
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
                if post_db_id:
                    if is_new_post:
                        new_posts += 1
                    # Insert video attachment
                    for idx, attachment in enumerate(post.attachments):
                        att_data = attachment.to_dict()
                        att_data['attachment_index'] = idx
                        if self.db.upsert_attachment(post_db_id, att_data):
                            new_attachments += 1
                    self._apply_auto_tag_rules(post_db_id, is_new_post)
            # Update creator stats
            latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
            self.db.update_creator(creator_id, {
                'last_checked': datetime.now().isoformat(),
                'last_post_date': latest_post_date or creator.get('last_post_date'),
                'post_count': self.db.get_creator_post_count(creator_id)
            })
            # Download videos if enabled
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            if download and creator.get('auto_download', True):
                result = await self._download_pornhub_videos(creator_id)
                downloaded = result.get('downloaded', 0)
                failed = result.get('failed', 0)
                downloaded_file_info = result.get('downloaded_file_info', [])
                # Post dates are backfilled during download (from yt-dlp metadata)
                # Update creator's last_post_date now that we have actual dates
                if downloaded > 0:
                    try:
                        latest = result.get('latest_post_date')
                        if latest:
                            self.db.update_creator(creator_id, {'last_post_date': latest})
                    except Exception:
                        # Best-effort stat update; never fail the sync over it.
                        pass
            # Unregister from active syncs
            self._unregister_active_sync(creator_id)
            # Emit completed event
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': new_posts,
                'new_attachments': new_attachments,
                'downloaded': downloaded,
                'failed': failed
            })
            # Send push notification for new downloads
            self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
            return SyncResult(
                success=True,
                new_posts=new_posts,
                new_attachments=new_attachments,
                downloaded_files=downloaded,
                failed_files=failed,
                downloaded_file_info=downloaded_file_info
            )
        except Exception as e:
            # Any unexpected failure: tear down the active-sync entry and surface
            # the error both over WebSocket and in the returned SyncResult.
            self.log(f"Error syncing Pornhub creator {creator['username']}: {e}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id,
                'username': creator['username'],
                'error': str(e)
            })
            return SyncResult(success=False, error=str(e))
    async def _download_pornhub_videos(self, creator_id: int) -> Dict:
        """Download pending Pornhub videos using yt-dlp.

        Drains every 'pending' attachment for the creator, writing files to
        <base>/pornhub/<username>/<post-date>/. Unlike the YouTube path,
        thumbnails are generated locally from the downloaded file (CDN
        thumbnails are just logos) and missing post dates are backfilled
        from yt-dlp metadata.

        Args:
            creator_id: Database id of the creator whose queue to drain.

        Returns:
            Dict with 'downloaded' / 'failed' counts, 'downloaded_file_info'
            (per-file dicts for notifications) and 'latest_post_date' (the
            newest backfilled date, or None); early exits return just the
            counts, with an 'error' key when yt-dlp is unavailable.
        """
        creator = self.db.get_creator(creator_id)
        if not creator:
            return {'downloaded': 0, 'failed': 0}
        pending = self.db.get_pending_attachments(creator_id=creator_id)
        if not pending:
            return {'downloaded': 0, 'failed': 0}
        pornhub = self._get_pornhub_client()
        if not pornhub.is_available():
            return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp not available'}
        self.log(f"Downloading {len(pending)} Pornhub videos for {creator['username']}", 'info')
        # Update status
        self._update_active_sync(creator_id, {
            'phase': 'downloading',
            'status': f'Downloading {len(pending)} videos...',
            'total_files': len(pending),
            'downloaded': 0
        })
        base_path = Path(self.config.get('base_download_path', '/paid-content'))
        quality = self.config.get('embed_quality', 'best')
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        latest_post_date = None  # newest backfilled publish date seen this run
        for i, att in enumerate(pending):
            try:
                post = self.db.get_post(att['post_id'])
                if not post:
                    self.log(f"Post not found for attachment {att.get('id')}", 'warning')
                    failed += 1
                    continue
                # Build output directory keyed on the post's publish date (YYYY-MM-DD)
                published_at = post.get('published_at') or ''
                post_date = published_at[:10] if published_at else 'unknown-date'
                output_dir = base_path / 'pornhub' / self._sanitize_filename(creator['username']) / post_date
                # Video URL is stored in download_url
                video_url = att.get('download_url')
                if not video_url:
                    self.log(f"No download URL for attachment {att.get('id')}, {att.get('name')}", 'warning')
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message='No video URL'
                    )
                    failed += 1
                    continue
                # Update status
                self._update_active_sync(creator_id, {
                    'status': f'Downloading video {i + 1}/{len(pending)}: {att.get("name", "")[:40]}...',
                    'downloaded': downloaded
                })
                self.db.update_attachment_status(att['id'], 'downloading')
                # Download using yt-dlp
                result = await pornhub.download_video(video_url, output_dir, quality=quality)
                if not result:
                    self.log(f"No result from yt-dlp for {att.get('name')}", 'warning')
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message='yt-dlp returned no result',
                        download_attempts=att.get('download_attempts', 0) + 1,
                        last_attempt=datetime.now().isoformat()
                    )
                    failed += 1
                    continue
                if result.get('success'):
                    file_path = result.get('file_path')
                    file_size = result.get('file_size', 0)
                    # Extract video dimensions first (need duration for thumbnail seek)
                    width, height, duration = None, None, None
                    if file_path:
                        width, height, duration = self._extract_dimensions(Path(file_path), 'video')
                        if width and height:
                            self.log(f"Extracted dimensions for {att.get('name', 'video')}: {width}x{height}, {duration}s", 'debug')
                    # Generate thumbnail from video file (Pornhub CDN thumbnails are just logos)
                    # Seek to 10 seconds to skip intro branding
                    thumbnail_data = None
                    if file_path and Path(file_path).exists():
                        seek_secs = 10
                        # Format as HH:MM:SS for the ffmpeg-style seek argument
                        seek_time = f'{seek_secs // 3600:02d}:{(seek_secs % 3600) // 60:02d}:{seek_secs % 60:02d}'
                        thumbnail_data = self._generate_video_thumbnail(Path(file_path), seek_time=seek_time)
                        if thumbnail_data:
                            self.log(f"Generated video thumbnail at {seek_time} for {att.get('name', 'video')}", 'debug')
                    self.db.update_attachment_status(att['id'], 'completed',
                        local_path=file_path,
                        local_filename=Path(file_path).name if file_path else None,
                        file_size=file_size,
                        width=width,
                        height=height,
                        duration=duration,
                        thumbnail_data=thumbnail_data,
                        downloaded_at=datetime.now().isoformat()
                    )
                    # Update post date from yt-dlp metadata if it was missing
                    # Prefer 'timestamp' (epoch) for full date+time, fallback to 'upload_date' (YYYYMMDD)
                    timestamp = result.get('timestamp')
                    upload_date = result.get('upload_date')
                    if (timestamp or upload_date) and post and not post.get('published_at'):
                        try:
                            if timestamp:
                                from datetime import timezone
                                formatted_date = datetime.fromtimestamp(int(timestamp), tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')
                            elif len(upload_date) == 8 and upload_date.isdigit():
                                # Convert compact YYYYMMDD to ISO YYYY-MM-DD
                                formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
                            else:
                                formatted_date = upload_date
                            self.db.update_post(att['post_id'], {'published_at': formatted_date})
                            self.log(f"Updated post date: {formatted_date}", 'debug')
                            # Track latest date for creator update
                            if not latest_post_date or formatted_date > latest_post_date:
                                latest_post_date = formatted_date
                        except Exception as e:
                            self.log(f"Could not update post date: {e}", 'debug')
                    # Update post as downloaded if all attachments are done
                    self.db.mark_post_downloaded(att['post_id'])
                    # Update creator stats
                    self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                    downloaded += 1
                    self.log(f"Downloaded: {att.get('name', 'video')}", 'debug')
                    # Collect file info for notifications
                    if file_path:
                        downloaded_file_info.append({
                            'file_path': file_path,
                            'filename': Path(file_path).name if file_path else None,
                            'source': creator['username'],
                            'content_type': att.get('file_type', 'video')
                        })
                else:
                    error = result.get('error', 'Unknown error')
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message=error,
                        download_attempts=att.get('download_attempts', 0) + 1,
                        last_attempt=datetime.now().isoformat()
                    )
                    failed += 1
                    self.log(f"Failed to download {att.get('name', 'video')}: {error}", 'warning')
            except Exception as e:
                # Record the failure on the attachment and keep draining the queue.
                self.log(f"Error downloading Pornhub video: {e}", 'error')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=str(e),
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
        return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info, 'latest_post_date': latest_post_date}
# =========================================================================
# XHamster sync/download/add
# =========================================================================
    async def _sync_xhamster_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
        """Sync an XHamster creator - fetch new videos and optionally download them.

        Posts whose attachments come from a '/moments/' URL are auto-tagged
        with a "Short" tag (created on first use).

        Args:
            creator: Creator row dict; reads 'id', 'username', 'creator_id',
                'last_post_date' and 'auto_download'.
            download: When True, drain the pending-download queue after the
                metadata sync (also gated by the creator's auto_download flag).
            scheduled: When True this is a periodic sync: only the last 3 days
                are scanned and at most 20 items are fetched.

        Returns:
            SyncResult with new post/attachment counts and download totals,
            or success=False with an error message on failure.
        """
        creator_id = creator['id']
        self.log(f"Syncing XHamster creator: {creator['username']}", 'info')
        # Register active sync for polling-based updates and announce over WebSocket.
        sync_data = {
            'username': creator['username'],
            'platform': 'xhamster',
            'service': 'xhamster',
            'status': 'Fetching media...',
            'phase': 'fetching',
            'failed': 0
        }
        self._register_active_sync(creator_id, sync_data)
        self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
        try:
            xhamster = self._get_xhamster_client()
            if not xhamster.is_available():
                self._unregister_active_sync(creator_id)
                return SyncResult(success=False, error="yt-dlp not available")
            creator_url = xhamster.normalize_creator_url(creator['creator_id'])
            # Update profile
            # Best-effort: any profile failure is logged but never aborts the sync.
            try:
                profile_updates = {}
                avatar_url = await xhamster.get_profile_image(creator_url)
                if avatar_url:
                    # Prefer the locally cached copy; fall back to the remote URL.
                    cached = await self._cache_profile_image(avatar_url, 'xhamster', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or avatar_url
                bio = await xhamster.get_profile_bio(creator_url)
                if bio:
                    profile_updates['bio'] = bio
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated XHamster creator profile: {list(profile_updates.keys())}", 'debug')
            except Exception as e:
                self.log(f"Failed to update XHamster creator profile: {e}", 'warning')
            # Fetch videos
            # Scheduled syncs only look back 3 days for efficiency.
            from datetime import timedelta
            if scheduled:
                since_date = (datetime.now() - timedelta(days=3)).isoformat()
            else:
                since_date = creator.get('last_post_date')

            def progress_callback(count: int):
                # Mirror fetch progress to both the polled sync state and WebSocket.
                self._update_active_sync(creator_id, {
                    'status': f'Fetched {count} media items...',
                    'posts_fetched': count
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'status': f'Fetched {count} media items...',
                    'phase': 'fetching',
                    'posts_fetched': count
                })
            max_videos = 20 if scheduled else None
            posts = await xhamster.get_posts(
                creator_url,
                since_date=since_date,
                max_videos=max_videos,
                progress_callback=progress_callback
            )
            if not posts:
                # Nothing new: record the check time and report a clean, empty sync.
                self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
                self._unregister_active_sync(creator_id)
                self._emit_event('paid_content_sync_completed', {
                    'creator_id': creator_id, 'username': creator['username'],
                    'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
                })
                return SyncResult(success=True, new_posts=0, new_attachments=0)
            self.log(f"Found {len(posts)} new media posts for {creator['username']}", 'info')
            self._update_active_sync(creator_id, {
                'status': f'Processing {len(posts)} media posts...',
                'phase': 'processing', 'total_posts': len(posts)
            })
            # Get or create "Short" tag for auto-tagging moments
            short_tag = self.db.get_tag_by_slug('short')
            short_tag_id = short_tag['id'] if short_tag else None
            if not short_tag_id:
                short_tag_id = self.db.create_tag('Short', '#8b5cf6', 'Short-form content (moments, clips)')
            new_posts = 0
            new_attachments = 0
            for post in posts:
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
                if post_db_id:
                    if is_new_post:
                        new_posts += 1
                    for idx, attachment in enumerate(post.attachments):
                        att_data = attachment.to_dict()
                        att_data['attachment_index'] = idx
                        if self.db.upsert_attachment(post_db_id, att_data):
                            new_attachments += 1
                    # Auto-tag shorts/moments
                    if short_tag_id:
                        # A post counts as a "short" if any attachment URL points at /moments/.
                        is_short = any(
                            '/moments/' in (att.server_path or '') or '/moments/' in (att.download_url or '')
                            for att in post.attachments
                        )
                        if is_short:
                            self.db.add_tag_to_post(post_db_id, short_tag_id)
                    self._apply_auto_tag_rules(post_db_id, is_new_post)
            latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
            self.db.update_creator(creator_id, {
                'last_checked': datetime.now().isoformat(),
                'last_post_date': latest_post_date or creator.get('last_post_date'),
                'post_count': self.db.get_creator_post_count(creator_id)
            })
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            if download and creator.get('auto_download', True):
                result = await self._download_xhamster_videos(creator_id)
                downloaded = result.get('downloaded', 0)
                failed = result.get('failed', 0)
                downloaded_file_info = result.get('downloaded_file_info', [])
                # Backfilled post dates may yield a newer last_post_date; best-effort update.
                if downloaded > 0 and result.get('latest_post_date'):
                    try:
                        self.db.update_creator(creator_id, {'last_post_date': result['latest_post_date']})
                    except Exception:
                        pass
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': new_posts, 'new_attachments': new_attachments,
                'downloaded': downloaded, 'failed': failed
            })
            self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
            return SyncResult(
                success=True, new_posts=new_posts, new_attachments=new_attachments,
                downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
            )
        except Exception as e:
            # Any unexpected failure: tear down the active-sync entry and surface
            # the error both over WebSocket and in the returned SyncResult.
            self.log(f"Error syncing XHamster creator {creator['username']}: {e}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
            })
            return SyncResult(success=False, error=str(e))
async def _download_xhamster_videos(self, creator_id: int) -> Dict:
"""Download pending XHamster media (videos and images)"""
creator = self.db.get_creator(creator_id)
if not creator:
return {'downloaded': 0, 'failed': 0}
pending = self.db.get_pending_attachments(creator_id=creator_id)
if not pending:
return {'downloaded': 0, 'failed': 0}
xhamster = self._get_xhamster_client()
if not xhamster.is_available():
return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp not available'}
self.log(f"Downloading {len(pending)} XHamster media files for {creator['username']}", 'info')
self._update_active_sync(creator_id, {
'phase': 'downloading', 'status': f'Downloading {len(pending)} media files...',
'total_files': len(pending), 'downloaded': 0
})
base_path = Path(self.config.get('base_download_path', '/paid-content'))
quality = self.config.get('embed_quality', 'best')
downloaded = 0
failed = 0
downloaded_file_info = []
latest_post_date = None
for i, att in enumerate(pending):
try:
# Rate limit: delay between downloads to avoid Cloudflare blocks
if i > 0:
file_type = att.get('file_type', 'video')
await asyncio.sleep(0.5 if file_type == 'image' else 2)
post = self.db.get_post(att['post_id'])
if not post:
failed += 1
continue
published_at = post.get('published_at') or ''
post_date = published_at[:10] if published_at else None
creator_dir = base_path / 'xhamster' / self._sanitize_filename(creator['username'])
output_dir = creator_dir / (post_date or 'unknown-date')
download_url = att.get('download_url')
if not download_url:
self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
failed += 1
continue
file_type = att.get('file_type', 'video')
self._update_active_sync(creator_id, {
'status': f'Downloading {file_type} {i + 1}/{len(pending)}: {att.get("name", "")[:40]}...',
'downloaded': downloaded
})
self.db.update_attachment_status(att['id'], 'downloading')
# --- Image download path ---
if file_type == 'image':
output_dir.mkdir(parents=True, exist_ok=True)
image_filename = att.get('name', f'{att["id"]}.jpg')
image_path = output_dir / image_filename
result = await xhamster.download_image(download_url, image_path)
if result and result.get('success'):
file_path = result['file_path']
file_size = result.get('file_size', 0)
width, height, duration = self._extract_dimensions(Path(file_path), 'image')
thumbnail_data = None
if Path(file_path).exists():
thumbnail_data = self._generate_image_thumbnail(Path(file_path))
self.db.update_attachment_status(att['id'], 'completed',
local_path=file_path,
local_filename=Path(file_path).name,
file_size=file_size,
width=width, height=height, duration=duration,
thumbnail_data=thumbnail_data,
downloaded_at=datetime.now().isoformat()
)
self.db.mark_post_downloaded(att['post_id'])
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
downloaded += 1
downloaded_file_info.append({
'file_path': file_path,
'filename': Path(file_path).name,
'source': creator['username'],
'content_type': 'image'
})
else:
error = result.get('error', 'Unknown error') if result else 'Download returned no result'
self.db.update_attachment_status(att['id'], 'failed',
error_message=error,
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
continue
# --- Video download path ---
result = await xhamster.download_video(download_url, output_dir, quality=quality)
if not result:
self.db.update_attachment_status(att['id'], 'failed',
error_message='Download returned no result',
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
continue
if result.get('success'):
file_path = result.get('file_path')
file_size = result.get('file_size', 0)
# Extract date from download result and move file if needed
timestamp = result.get('timestamp')
upload_date = result.get('upload_date')
formatted_date = None
if timestamp or upload_date:
try:
if timestamp:
from datetime import timezone
formatted_date = datetime.fromtimestamp(int(timestamp), tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')
elif len(upload_date) == 8 and upload_date.isdigit():
formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
else:
formatted_date = upload_date
except Exception:
pass
# Move file to correct date directory if downloaded to unknown-date
if formatted_date and file_path and not post_date:
correct_date_dir = creator_dir / formatted_date[:10]
if str(correct_date_dir) != str(output_dir):
try:
correct_date_dir.mkdir(parents=True, exist_ok=True)
new_path = correct_date_dir / Path(file_path).name
import shutil
shutil.move(str(file_path), str(new_path))
file_path = str(new_path)
self.log(f"Moved to date directory: {formatted_date[:10]}", 'debug')
except Exception as e:
self.log(f"Failed to move file to date directory: {e}", 'debug')
# Update post date
if formatted_date and not post.get('published_at'):
try:
self.db.update_post(att['post_id'], {'published_at': formatted_date})
if not latest_post_date or formatted_date > latest_post_date:
latest_post_date = formatted_date
except Exception:
pass
width, height, duration = None, None, None
if file_path:
width, height, duration = self._extract_dimensions(Path(file_path), 'video')
thumbnail_data = None
if file_path and Path(file_path).exists():
seek_time = '00:00:10'
thumbnail_data = self._generate_video_thumbnail(Path(file_path), seek_time=seek_time)
self.db.update_attachment_status(att['id'], 'completed',
local_path=file_path,
local_filename=Path(file_path).name if file_path else None,
file_size=file_size,
width=width, height=height, duration=duration,
thumbnail_data=thumbnail_data,
downloaded_at=datetime.now().isoformat()
)
self.db.mark_post_downloaded(att['post_id'])
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
downloaded += 1
if file_path:
downloaded_file_info.append({
'file_path': file_path,
'filename': Path(file_path).name,
'source': creator['username'],
'content_type': att.get('file_type', 'video')
})
else:
error = result.get('error', 'Unknown error')
self.db.update_attachment_status(att['id'], 'failed',
error_message=error,
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
except Exception as e:
self.log(f"Error downloading XHamster media: {e}", 'error')
self.db.update_attachment_status(att['id'], 'failed',
error_message=str(e),
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
failed += 1
return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info, 'latest_post_date': latest_post_date}
async def _add_xhamster_creator(self, creator_id_str: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add an XHamster creator"""
xhamster = self._get_xhamster_client()
if not xhamster.is_available():
return {'success': False, 'error': 'yt-dlp not available'}
creator_url = xhamster.normalize_creator_url(creator_id_str)
creator_info = await xhamster.get_creator(creator_url)
if not creator_info:
return {'success': False, 'error': 'XHamster creator not found'}
creator_data = creator_info.to_dict()
creator_data['auto_download'] = 1 if auto_download else 0
creator_data['download_embeds'] = 1 if download_embeds else 0
db_id = self.db.add_creator(creator_data)
return {'success': True, 'creator': {'id': db_id, **creator_data}}
# =========================================================================
# TikTok sync/download/add
# =========================================================================
    async def _sync_tiktok_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
        """Sync a TikTok creator - fetch new videos and optionally download them.

        Workflow: refresh profile metadata (best-effort), fetch videos since
        the last sync, upsert posts/attachments, update creator bookkeeping,
        then optionally drain the download queue.

        Args:
            creator: Creator row from the database; must contain 'id',
                'username' and 'creator_id'.
            download: When True and the creator has auto_download enabled,
                pending attachments are downloaded after the fetch.
            scheduled: True for automatic background runs; narrows the fetch
                window to 3 days and caps the fetch at 20 videos.

        Returns:
            SyncResult summarizing new posts/attachments and download counts,
            or success=False with the error message on failure.
        """
        creator_id = creator['id']
        self.log(f"Syncing TikTok creator: {creator['username']}", 'info')
        # Register the sync so the UI can display live progress.
        sync_data = {
            'username': creator['username'],
            'platform': 'tiktok',
            'service': 'tiktok',
            'status': 'Fetching videos...',
            'phase': 'fetching',
            'failed': 0
        }
        self._register_active_sync(creator_id, sync_data)
        self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
        try:
            tiktok = self._get_tiktok_client()
            if not tiktok.is_available():
                self._unregister_active_sync(creator_id)
                return SyncResult(success=False, error="yt-dlp/gallery-dl not available")
            creator_url = tiktok.normalize_creator_url(creator['creator_id'])
            # Update profile info (avatar, bio, display name)
            try:
                profile_info = await tiktok.get_creator_info(creator_url)
                if profile_info:
                    profile_updates = {}
                    if profile_info.get('creator_name') and profile_info['creator_name'] != creator.get('creator_id'):
                        profile_updates['display_name'] = profile_info['creator_name']
                    if profile_info.get('profile_image_url'):
                        # Cache the avatar locally; fall back to the remote URL.
                        cached = await self._cache_profile_image(profile_info['profile_image_url'], 'tiktok', creator['creator_id'], 'avatar')
                        profile_updates['profile_image_url'] = cached or profile_info['profile_image_url']
                    if profile_info.get('bio'):
                        profile_updates['bio'] = profile_info['bio']
                    if profile_updates:
                        self.db.update_creator(creator_id, profile_updates)
                        self.log(f"Updated TikTok profile for @{creator['creator_id']}: {list(profile_updates.keys())}", 'debug')
            except Exception as e:
                # Profile refresh is best-effort; the sync continues without it.
                self.log(f"Failed to update TikTok profile: {e}", 'debug')
            # Fetch videos
            from datetime import timedelta
            if scheduled:
                # Scheduled runs only look back a few days to keep them fast.
                since_date = (datetime.now() - timedelta(days=3)).isoformat()
            else:
                since_date = creator.get('last_post_date')
            def progress_callback(count: int):
                # Mirror fetch progress to the active-sync state and event bus.
                self._update_active_sync(creator_id, {
                    'status': f'Fetched {count} videos...',
                    'posts_fetched': count
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'status': f'Fetched {count} videos...',
                    'phase': 'fetching',
                    'posts_fetched': count
                })
            max_videos = 20 if scheduled else None
            posts = await tiktok.get_posts(
                creator_url,
                since_date=since_date,
                max_videos=max_videos,
                progress_callback=progress_callback
            )
            if not posts:
                # Nothing new — record the check time and report an empty sync.
                self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
                self._unregister_active_sync(creator_id)
                self._emit_event('paid_content_sync_completed', {
                    'creator_id': creator_id, 'username': creator['username'],
                    'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
                })
                return SyncResult(success=True, new_posts=0, new_attachments=0)
            self.log(f"Found {len(posts)} new videos for @{creator['username']}", 'info')
            self._update_active_sync(creator_id, {
                'status': f'Processing {len(posts)} posts...',
                'phase': 'processing', 'total_posts': len(posts)
            })
            new_posts = 0
            new_attachments = 0
            # Upsert every post and its attachments; count only genuinely new rows.
            for post in posts:
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
                if post_db_id:
                    if is_new_post:
                        new_posts += 1
                    for idx, attachment in enumerate(post.attachments):
                        att_data = attachment.to_dict()
                        att_data['attachment_index'] = idx
                        if self.db.upsert_attachment(post_db_id, att_data):
                            new_attachments += 1
                    self._apply_auto_tag_rules(post_db_id, is_new_post)
            # Update pinned posts from profile page data
            if hasattr(tiktok, '_last_pinned_posts') and tiktok._last_pinned_posts:
                self.db.update_pinned_posts(creator_id, tiktok._last_pinned_posts)
            latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
            self.db.update_creator(creator_id, {
                'last_checked': datetime.now().isoformat(),
                'last_post_date': latest_post_date or creator.get('last_post_date'),
                'post_count': self.db.get_creator_post_count(creator_id)
            })
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            if download and creator.get('auto_download', True):
                result = await self._download_tiktok_posts(creator_id)
                downloaded = result.get('downloaded', 0)
                failed = result.get('failed', 0)
                downloaded_file_info = result.get('downloaded_file_info', [])
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': new_posts, 'new_attachments': new_attachments,
                'downloaded': downloaded, 'failed': failed
            })
            self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
            return SyncResult(
                success=True, new_posts=new_posts, new_attachments=new_attachments,
                downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
            )
        except Exception as e:
            self.log(f"Error syncing TikTok creator @{creator['username']}: {e}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
            })
            return SyncResult(success=False, error=str(e))
    async def _download_tiktok_posts(self, creator_id: int) -> Dict:
        """Download pending TikTok posts using gallery-dl.

        Drains all 'pending' attachments for the creator. A single post may
        yield multiple files (carousel photos): the first file updates the
        existing attachment row; each extra file gets a new attachment row
        with a unique synthetic server_path.

        Args:
            creator_id: Database id of the creator to download for.

        Returns:
            Dict with 'downloaded' / 'failed' counts (per post, not per file)
            and 'downloaded_file_info' metadata for notifications.
        """
        creator = self.db.get_creator(creator_id)
        if not creator:
            return {'downloaded': 0, 'failed': 0}
        pending = self.db.get_pending_attachments(creator_id=creator_id)
        if not pending:
            return {'downloaded': 0, 'failed': 0}
        tiktok = self._get_tiktok_client()
        if not tiktok.is_available():
            return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp/gallery-dl not available'}
        self.log(f"Downloading {len(pending)} TikTok posts for @{creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'phase': 'downloading', 'status': f'Downloading {len(pending)} posts...',
            'total_files': len(pending), 'downloaded': 0
        })
        base_path = Path(self.config.get('base_download_path', '/paid-content'))
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        for i, att in enumerate(pending):
            try:
                post = self.db.get_post(att['post_id'])
                if not post:
                    failed += 1
                    continue
                # Files are organized by the post's publish date (YYYY-MM-DD).
                published_at = post.get('published_at') or ''
                post_date = published_at[:10] if published_at else 'unknown-date'
                output_dir = base_path / 'tiktok' / self._sanitize_filename(creator['username']) / post_date
                video_url = att.get('download_url')
                if not video_url:
                    self.db.update_attachment_status(att['id'], 'failed', error_message='No video URL')
                    failed += 1
                    continue
                self._update_active_sync(creator_id, {
                    'status': f'Downloading post {i + 1}/{len(pending)}...',
                    'downloaded': downloaded
                })
                self.db.update_attachment_status(att['id'], 'downloading')
                result = await tiktok.download_video(video_url, output_dir, username=creator['username'])
                if not result or not result.get('success'):
                    error = result.get('error', 'Download failed') if result else 'No result'
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message=error,
                        download_attempts=att.get('download_attempts', 0) + 1,
                        last_attempt=datetime.now().isoformat()
                    )
                    failed += 1
                    continue
                all_files = result.get('all_files', [])
                is_carousel = result.get('is_carousel', False)
                # Process all downloaded files (carousel photos or single video)
                first_file_size = None
                for file_idx, file_str in enumerate(all_files):
                    fp = Path(file_str)
                    if not fp.exists():
                        continue
                    f_size = fp.stat().st_size
                    ext = fp.suffix.lower()
                    # Classify by extension; anything non-image is treated as video.
                    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
                    c_type = 'image' if ext in image_exts else 'video'
                    # Skip duplicate files: gallery-dl sometimes returns the same
                    # video twice (main + subdirectory copy) for non-carousel posts
                    if file_idx > 0 and not is_carousel and first_file_size and f_size == first_file_size:
                        self.log(f"Skipping duplicate file {fp.name} (same size as primary)", 'debug')
                        try:
                            fp.unlink()
                        except Exception:
                            pass
                        continue
                    w, h, dur = None, None, None
                    if c_type == 'video':
                        w, h, dur = self._extract_dimensions(fp, 'video')
                    else:
                        w, h, _ = self._extract_dimensions(fp, 'image')
                    thumb_data = None
                    if c_type == 'video':
                        thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
                    if file_idx == 0:
                        first_file_size = f_size
                        # Update the existing attachment with the first file
                        self.db.update_attachment_status(att['id'], 'completed',
                            local_path=str(fp),
                            local_filename=fp.name,
                            name=fp.name,
                            extension=ext,
                            file_size=f_size,
                            file_type=c_type,
                            width=w, height=h, duration=dur,
                            thumbnail_data=thumb_data,
                            downloaded_at=datetime.now().isoformat()
                        )
                    else:
                        # Create additional attachments for carousel photos
                        # Use unique server_path per file to avoid upsert collisions
                        self.db.upsert_attachment(att['post_id'], {
                            'name': fp.name,
                            'file_type': c_type,
                            'extension': ext,
                            'server_path': f"{video_url}#slide_{file_idx}",
                            'download_url': video_url,
                            'status': 'completed',
                            'local_path': str(fp),
                            'local_filename': fp.name,
                            'file_size': f_size,
                            'width': w, 'height': h, 'duration': dur,
                            'thumbnail_data': thumb_data,
                            'downloaded_at': datetime.now().isoformat(),
                        })
                    downloaded_file_info.append({
                        'file_path': str(fp),
                        'filename': fp.name,
                        'source': creator['username'],
                        'content_type': c_type
                    })
                # Stats are counted per post: one download, total bytes of all
                # surviving files (deleted duplicates fail the exists() check).
                total_size = sum(Path(f).stat().st_size for f in all_files if Path(f).exists())
                self.db.mark_post_downloaded(att['post_id'])
                self.db.increment_creator_download_stats(creator_id, 1, total_size)
                downloaded += 1
            except Exception as e:
                self.log(f"Error downloading TikTok post: {e}", 'error')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=str(e),
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
        return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
async def _add_tiktok_creator(self, username: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a TikTok creator"""
tiktok = self._get_tiktok_client()
if not tiktok.is_available():
return {'success': False, 'error': 'yt-dlp/gallery-dl not available'}
creator_url = tiktok.normalize_creator_url(username)
# Get full creator info including profile image and bio
creator_info_raw = await tiktok.get_creator_info(creator_url)
if not creator_info_raw:
return {'success': False, 'error': 'TikTok creator not found'}
creator_info = await tiktok.get_creator(creator_url)
if not creator_info:
return {'success': False, 'error': 'TikTok creator not found'}
creator_data = creator_info.to_dict()
creator_data['auto_download'] = 1 if auto_download else 0
creator_data['download_embeds'] = 1 if download_embeds else 0
db_id = self.db.add_creator(creator_data)
# Update bio separately (not in Creator model)
if creator_info_raw.get('bio'):
self.db.update_creator(db_id, {'bio': creator_info_raw['bio']})
return {'success': True, 'creator': {'id': db_id, **creator_data}}
# =========================================================================
# Instagram sync/download/add
# =========================================================================
async def _sync_instagram_creator(self, creator: Dict, download: bool = True, scheduled: bool = False, force_backfill: bool = False) -> SyncResult:
"""Sync an Instagram creator - fetch new posts and optionally download them"""
from modules.instagram_rate_limiter import rate_limiter as ig_rate_limiter
creator_id = creator['id']
self.log(f"Syncing Instagram creator: {creator['username']}", 'info')
sync_data = {
'username': creator['username'],
'platform': 'instagram',
'service': 'instagram',
'status': 'Backfilling full timeline...' if force_backfill else 'Scanning profile...',
'phase': 'backfilling' if force_backfill else 'fetching',
}
self._register_active_sync(creator_id, sync_data)
self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
# Concurrent download loop — downloads attachments while scraping
# continues, so CDN URLs don't expire waiting for all posts to be fetched.
_dl_done = asyncio.Event()
_dl_results = {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}
async def _bg_download_loop():
await asyncio.sleep(15) # Let some posts queue up first
while not _dl_done.is_set():
r = await self._download_instagram_posts(creator_id, quiet=True)
_dl_results['downloaded'] += r.get('downloaded', 0)
_dl_results['failed'] += r.get('failed', 0)
_dl_results['downloaded_file_info'].extend(r.get('downloaded_file_info', []))
if r.get('downloaded', 0) == 0:
try:
await asyncio.wait_for(_dl_done.wait(), timeout=10)
except asyncio.TimeoutError:
pass
_dl_task = None
# Acquire operation lock (in thread to avoid blocking event loop)
await asyncio.to_thread(ig_rate_limiter.operation_lock.acquire)
try:
# Use ImgInn API adapter directly — paid content must NOT use the
# universal Instagram system (instagram_client / real IG API).
imginn_posts_adapter = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
# Fetch and update creator profile (avatar, bio, display name, etc.)
# Profile data comes from ImgInn HTML scraping, not Instagram API.
needs_profile = not creator.get('profile_image_url')
if not needs_profile and creator.get('updated_at'):
try:
last_update = datetime.fromisoformat(creator['updated_at'])
needs_profile = (datetime.now() - last_update).total_seconds() > 86400
except (ValueError, TypeError):
needs_profile = True
if needs_profile or not scheduled:
try:
profile_info = await imginn_posts_adapter.get_profile_info(creator['creator_id'])
if profile_info:
profile_updates = {}
if profile_info.get('avatar_url'):
cached = await self._cache_profile_image(profile_info['avatar_url'], 'instagram', creator['creator_id'], 'avatar')
profile_updates['profile_image_url'] = cached or profile_info['avatar_url']
if profile_info.get('bio'):
profile_updates['bio'] = profile_info['bio']
if profile_info.get('display_name'):
profile_updates['display_name'] = profile_info['display_name']
if profile_info.get('post_count'):
profile_updates['post_count'] = profile_info['post_count']
if profile_info.get('external_links'):
profile_updates['external_links'] = profile_info['external_links']
if profile_updates:
self.db.update_creator(creator_id, profile_updates)
self.log(f"Updated Instagram profile for {creator['username']}: {list(profile_updates.keys())}", 'info')
except Exception as e:
self.log(f"Failed to fetch Instagram profile info: {e}", 'warning')
new_posts = 0
new_attachments = 0
creator_url = InstagramAdapter.normalize_creator_url(creator['creator_id'])
# Start concurrent download task
if download and creator.get('auto_download', True):
_dl_task = asyncio.create_task(_bg_download_loop())
# Fetch posts — only new ones since the last sync (or full timeline if backfilling)
if creator.get('sync_posts', 1):
since_date = None if force_backfill else creator.get('last_post_date')
# Get known post IDs so the scroll can stop when it hits existing posts
# (for backfill, always load known IDs to skip duplicates)
known_post_ids = None
if since_date or force_backfill:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
(creator_id,)
)
known_post_ids = {row[0] for row in cursor.fetchall()}
_use_auth = bool(creator.get('use_authenticated_api'))
_is_first_sync = not creator.get('last_post_date')
_backfill_phase = 'backfilling' if force_backfill else 'fetching'
_profile_post_count = creator.get('post_count') or 0
_is_paginating = force_backfill or (_use_auth and _is_first_sync)
def progress_callback(current, total=None):
if _is_paginating:
# current = total scanned, total = new posts found
new_count = total or 0
if _profile_post_count:
status = f'Scanned {current} of {_profile_post_count} posts ({new_count} new)...'
else:
status = f'Scanned {current} posts ({new_count} new)...'
elif total and total > 1:
status = f'Fetching full-res post {current}/{total}...'
else:
status = f'Fetched {current} posts...'
self._update_active_sync(creator_id, {
'status': status,
'phase': _backfill_phase,
'posts_fetched': current,
'progress': current,
'total_files': _profile_post_count if _is_paginating else total,
})
self._emit_event('paid_content_sync_progress', {
'creator_id': creator_id,
'username': creator['username'],
'status': status,
'phase': _backfill_phase,
'progress': current,
'total_files': _profile_post_count if _is_paginating else total,
'posts_fetched': current,
})
# Use ImgInn API adapter directly for posts — it has proper cursor-based
# pagination that fetches ALL posts. The instagram_client adapter (browser)
# only scrapes the first page of the HTML profile.
# Callback to upsert each post immediately as it's fetched,
# so downloads are queued before CDN URLs expire.
def post_callback(post):
nonlocal new_posts, new_attachments
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
if post_db_id:
if is_new_post:
new_posts += 1
for idx, attachment in enumerate(post.attachments):
att_data = attachment.to_dict()
att_data['attachment_index'] = idx
if self.db.upsert_attachment(post_db_id, att_data):
new_attachments += 1
self._apply_auto_tag_rules(post_db_id, is_new_post)
if post.tagged_users:
self.db.set_post_tagged_users(post_db_id, post.tagged_users)
posts = await imginn_posts_adapter.get_posts(
creator_url,
since_date=since_date,
progress_callback=progress_callback,
known_post_ids=known_post_ids,
post_callback=post_callback,
use_authenticated_api=_use_auth,
paginate_all=force_backfill or (_use_auth and _is_first_sync),
)
if posts:
self.log(f"Processed {len(posts)} new posts for @{creator['username']}", 'info')
# Update post count and notify frontend
current_count = self.db.get_creator_post_count(creator_id)
self.db.update_creator(creator_id, {'post_count': current_count})
self._emit_event('paid_content_creator_updated', {
'creator_id': creator_id,
'post_count': current_count,
})
latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
self.db.update_creator(creator_id, {
'last_checked': datetime.now().isoformat(),
'last_post_date': latest_post_date or creator.get('last_post_date'),
'post_count': self.db.get_creator_post_count(creator_id)
})
else:
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
# Update pinned posts from Instagram API data
if hasattr(imginn_posts_adapter, '_last_pinned_posts') and imginn_posts_adapter._last_pinned_posts:
self.db.update_pinned_posts(creator_id, imginn_posts_adapter._last_pinned_posts)
else:
self.log(f"Skipping posts for @{creator['username']} (disabled in sync settings)", 'info')
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
# --- Fetch stories ---
if creator.get('sync_stories', 1):
try:
self._update_active_sync(creator_id, {
'status': 'Fetching stories...',
'phase': 'fetching_stories',
})
# Get known story post_ids for incremental sync
story_known_ids = None
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT post_id FROM paid_content_posts WHERE creator_id = ? AND post_id LIKE 'story_%'",
(creator_id,)
)
story_known_ids = {row[0] for row in cursor.fetchall()}
# Use FastDL directly for stories — it hits the Instagram API
# and returns stable pk-based IDs. ImgInn was causing duplicate
# stories because its AQ-code IDs differ from FastDL's.
fastdl_file_map = {}
story_posts = []
self._update_active_sync(creator_id, {'status': 'Fetching stories...', 'phase': 'fetching_stories'})
try:
story_posts, fastdl_file_map = await self._fetch_stories_via_fastdl(creator['username'])
if story_known_ids and story_posts:
story_posts = [p for p in story_posts if p.post_id not in story_known_ids]
if story_posts:
self.log(f"FastDL found {len(story_posts)} new stories for @{creator['username']}", 'info')
except Exception as e:
self.log(f"FastDL story fetch failed for @{creator['username']}: {e}", 'warning')
if story_posts:
self.log(f"Found {len(story_posts)} new stories for @{creator['username']}", 'info')
# Get or create the "Story" tag
story_tag = self.db.get_tag_by_slug('story')
if not story_tag:
story_tag_id = self.db.create_tag('Story', color='#8b5cf6')
else:
story_tag_id = story_tag['id']
base_path = Path(self.config.get('base_download_path', '/paid-content'))
for post in story_posts:
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
if post_db_id:
if is_new_post:
new_posts += 1
post_all_completed = True
for idx, attachment in enumerate(post.attachments):
att_data = attachment.to_dict()
att_data['attachment_index'] = idx
# If FastDL pre-downloaded this file, move it directly
temp_file = fastdl_file_map.get(attachment.name)
if temp_file and Path(temp_file).exists():
published_at = post.published_at or ''
post_date = published_at[:10] if published_at else 'unknown-date'
output_dir = base_path / 'instagram' / self._sanitize_filename(creator['username']) / post_date
output_dir.mkdir(parents=True, exist_ok=True)
dest_path = output_dir / attachment.name
shutil.move(str(temp_file), str(dest_path))
file_size = dest_path.stat().st_size if dest_path.exists() else 0
width, height, duration = None, None, None
if attachment.file_type == 'video':
width, height, duration = self._extract_dimensions(dest_path, 'video')
else:
width, height, _ = self._extract_dimensions(dest_path, 'image')
thumbnail_data = None
if dest_path.exists():
thumbnail_data = self._generate_thumbnail(dest_path, attachment.file_type or 'image')
# Compute file_hash for future dedup (stories often lack pk)
try:
story_file_hash = hashlib.md5(dest_path.read_bytes()).hexdigest()
except Exception:
story_file_hash = None
att_data['local_path'] = str(dest_path)
att_data['local_filename'] = attachment.name
att_data['file_size'] = file_size
att_data['file_hash'] = story_file_hash
att_data['width'] = width
att_data['height'] = height
att_data['duration'] = duration
att_data['status'] = 'completed'
att_data['downloaded_at'] = datetime.now().isoformat()
att_data['thumbnail_data'] = thumbnail_data
else:
post_all_completed = False
if self.db.upsert_attachment(post_db_id, att_data):
new_attachments += 1
# Mark post as downloaded if all attachments were pre-downloaded by FastDL
if post_all_completed and fastdl_file_map:
self.db.mark_post_downloaded(post_db_id)
self._apply_auto_tag_rules(post_db_id, is_new_post)
if story_tag_id:
self.db.add_tag_to_post(post_db_id, story_tag_id)
# Update post count after stories
current_count = self.db.get_creator_post_count(creator_id)
self.db.update_creator(creator_id, {'post_count': current_count})
# Clean up FastDL temp directory
fastdl_temp = Path('/opt/media-downloader/temp/paid_content_fastdl')
if fastdl_temp.exists():
shutil.rmtree(fastdl_temp, ignore_errors=True)
except Exception as e:
self.log(f"Error fetching stories for @{creator['username']}: {e}", 'warning')
# Non-fatal — posts were already synced successfully
else:
self.log(f"Skipping stories for @{creator['username']} (disabled in sync settings)", 'info')
# --- Fetch highlights ---
if creator.get('sync_highlights', 1):
try:
self._update_active_sync(creator_id, {
'status': 'Fetching highlights...',
'phase': 'fetching_highlights',
})
# Get known highlight post_ids for incremental sync
highlight_known_ids = None
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT post_id FROM paid_content_posts WHERE creator_id = ? AND post_id LIKE 'highlight_%'",
(creator_id,)
)
highlight_known_ids = {row[0] for row in cursor.fetchall()}
def highlight_progress(current, total=None):
self._update_active_sync(creator_id, {
'status': f'Fetching highlight {current}/{total}...' if total else 'Fetching highlights...',
'phase': 'fetching_highlights',
})
highlight_posts = await imginn_posts_adapter.get_highlights(
creator_url,
known_post_ids=highlight_known_ids,
progress_callback=highlight_progress,
)
if highlight_posts:
self.log(f"Found {len(highlight_posts)} new highlights for @{creator['username']}", 'info')
# Get or create the "Highlight" tag
highlight_tag = self.db.get_tag_by_slug('highlight')
if not highlight_tag:
highlight_tag_id = self.db.create_tag('Highlight', color='#f59e0b')
else:
highlight_tag_id = highlight_tag['id']
for post in highlight_posts:
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
if post_db_id:
if is_new_post:
new_posts += 1
for idx, attachment in enumerate(post.attachments):
att_data = attachment.to_dict()
att_data['attachment_index'] = idx
if self.db.upsert_attachment(post_db_id, att_data):
new_attachments += 1
self._apply_auto_tag_rules(post_db_id, is_new_post)
if highlight_tag_id:
self.db.add_tag_to_post(post_db_id, highlight_tag_id)
# Update post count after highlights
current_count = self.db.get_creator_post_count(creator_id)
self.db.update_creator(creator_id, {'post_count': current_count})
except Exception as e:
self.log(f"Error fetching highlights for @{creator['username']}: {e}", 'warning')
# Non-fatal — posts were already synced successfully
# Refresh failed highlight URLs — re-fetch API for fresh CDN links
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT a.id, a.name, a.download_url
FROM paid_content_attachments a
JOIN paid_content_posts p ON a.post_id = p.id
WHERE p.creator_id = ? AND p.post_id LIKE 'highlight_%'
AND a.status = 'failed'
""", (creator_id,))
failed_highlights = [dict(row) for row in cursor.fetchall()]
if failed_highlights:
self.log(f"Refreshing URLs for {len(failed_highlights)} failed highlight attachments", 'info')
updated_urls = await imginn_posts_adapter.refresh_failed_highlight_urls(failed_highlights)
for att_id, new_url in updated_urls.items():
self.db.update_attachment(att_id, {
'download_url': new_url,
'status': 'pending',
'error_message': None,
})
if updated_urls:
self.log(f"Reset {len(updated_urls)} highlight attachments with fresh URLs", 'info')
else:
self.log(f"No fresh URLs available yet for failed highlights (CDN URLs still expired at source)", 'info')
except Exception as e:
self.log(f"Error refreshing failed highlight URLs: {e}", 'warning')
else:
self.log(f"Skipping highlights for @{creator['username']} (disabled in sync settings)", 'info')
# Stop concurrent download loop and collect its results
_dl_done.set()
if _dl_task:
await _dl_task
downloaded = _dl_results['downloaded']
failed = _dl_results['failed']
downloaded_file_info = list(_dl_results['downloaded_file_info'])
# Final download sweep for remaining attachments
# (stories, highlights, or posts queued near the end)
if download and creator.get('auto_download', True):
result = await self._download_instagram_posts(creator_id)
downloaded += result.get('downloaded', 0)
failed += result.get('failed', 0)
downloaded_file_info.extend(result.get('downloaded_file_info', []))
self._unregister_active_sync(creator_id)
self._emit_event('paid_content_sync_completed', {
'creator_id': creator_id, 'username': creator['username'],
'new_posts': new_posts, 'new_attachments': new_attachments,
'downloaded': downloaded, 'failed': failed
})
self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
return SyncResult(
success=True, new_posts=new_posts, new_attachments=new_attachments,
downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
)
except Exception as e:
_dl_done.set()
if _dl_task:
try:
await _dl_task
except Exception:
pass
self.log(f"Error syncing Instagram creator @{creator['username']}: {e}", 'error')
self._unregister_active_sync(creator_id)
self._emit_event('paid_content_sync_error', {
'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
})
return SyncResult(success=False, error=str(e))
finally:
ig_rate_limiter.operation_lock.release()
async def _fetch_stories_via_fastdl(self, username: str,
                                    creator_id: Optional[int] = None) -> Tuple[List, Dict]:
    """Fetch Instagram stories via FastDL subprocess as a fallback when ImgInn returns nothing.

    Args:
        username: Instagram username whose stories should be fetched.
        creator_id: Optional DB id of the creator row. When provided, it enables
            the path-based duplicate check against attachments already stored
            for that creator. (Bug fix: this check previously referenced an
            undefined ``creator_id`` name, raising a NameError that the
            surrounding ``except Exception: pass`` silently swallowed, so the
            check never ran.)

    Returns:
        Tuple of (list of Post objects, dict mapping attachment_name -> temp_file_path)

    Raises:
        RuntimeError: if the FastDL subprocess exits non-zero.
        asyncio.TimeoutError: if the subprocess exceeds the 5-minute budget.
    """
    temp_dir = Path('/opt/media-downloader/temp/paid_content_fastdl')
    temp_dir.mkdir(parents=True, exist_ok=True)
    # Configuration is handed to the wrapper over stdin as a single JSON blob.
    config = {
        'username': username,
        'content_type': 'stories',
        'temp_dir': str(temp_dir),
        'days_back': 3,
        'max_downloads': 50,
        'headless': True,
        'db_path': '/opt/media-downloader/database/media_downloader.db',
        'defer_database': True,  # wrapper must not write the DB; we upsert here
    }
    wrapper_path = '/opt/media-downloader/wrappers/fastdl_subprocess_wrapper.py'
    self.log(f"Running FastDL subprocess for @{username} stories...", 'info')
    process = await asyncio.create_subprocess_exec(
        sys.executable, wrapper_path,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await asyncio.wait_for(
        process.communicate(input=json.dumps(config).encode()),
        timeout=300,
    )
    # Surface wrapper diagnostics at debug level, one log line per stderr line.
    if stderr:
        for line in stderr.decode(errors='replace').strip().splitlines():
            self.log(f"[FastDL] {line}", 'debug')
    if process.returncode != 0:
        error_msg = 'FastDL subprocess failed'
        if stdout:
            # On failure the wrapper reports a JSON object with a 'message' field.
            try:
                err_result = json.loads(stdout.decode().strip())
                error_msg = err_result.get('message', error_msg)
            except (json.JSONDecodeError, UnicodeDecodeError):
                pass
        raise RuntimeError(error_msg)
    result = json.loads(stdout.decode().strip())
    pending_downloads = result.get('pending_downloads', [])
    if not pending_downloads:
        return [], {}
    posts = []
    file_map = {}
    for item in pending_downloads:
        media_id = item.get('media_id', '')
        url = item.get('url', '')
        post_date = item.get('post_date')
        filename = item.get('filename', '')
        file_path = item.get('file_path', '')
        # Determine extension and file type from CDN URL or FastDL filename
        cdn_path = url.split('?')[0] if url else ''
        is_video = '.mp4' in cdn_path or '.mp4' in filename
        ext = '.mp4' if is_video else '.jpg'
        file_type = 'video' if is_video else 'image'
        # Use Instagram pk (stable numeric primary key) as story_id.
        # Falls back to media_id if pk not available.
        metadata = item.get('metadata') or {}
        pk = metadata.get('pk', '')
        story_id = pk if pk else media_id
        # Computed once: used both by the dedup query and the Attachment below.
        att_name = f"story_{story_id}{ext}"
        # When pk is missing (browser fallback), check file hash and local_path
        # against existing attachments to prevent duplicates with different ID formats.
        if not pk and file_path and Path(file_path).exists():
            file_hash = hashlib.md5(Path(file_path).read_bytes()).hexdigest()
            existing = self.db.check_duplicate_hash(file_hash)
            if existing:
                existing_name = existing.get('name', '')
                self.log(f"Story {media_id} is duplicate of existing {existing_name} (same file hash), skipping", 'info')
                try:
                    Path(file_path).unlink()
                except Exception:
                    pass
                continue
            # Also check by computed destination path — catches duplicates where
            # neither story had file_hash populated (e.g. pk-based vs CDN-based IDs).
            # Requires creator_id; skipped when the caller did not supply one.
            if creator_id is not None:
                try:
                    with self.unified_db.get_connection() as conn:
                        cursor = conn.cursor()
                        cursor.execute("""
                            SELECT a.id, a.name FROM paid_content_attachments a
                            JOIN paid_content_posts p ON a.post_id = p.id
                            WHERE p.creator_id = ? AND p.service_id = 'instagram'
                            AND a.name LIKE 'story_%' AND a.local_path IS NOT NULL
                            AND a.local_path = (
                                SELECT a2.local_path FROM paid_content_attachments a2
                                JOIN paid_content_posts p2 ON a2.post_id = p2.id
                                WHERE p2.creator_id = ? AND a2.name = ? AND a2.status = 'completed'
                                LIMIT 1
                            )
                            AND a.name != ?
                            LIMIT 1
                        """, (creator_id, creator_id, att_name, att_name))
                        dup_row = cursor.fetchone()
                        if dup_row:
                            self.log(f"Story {media_id} maps to same path as existing {dup_row['name']}, skipping", 'info')
                            try:
                                Path(file_path).unlink()
                            except Exception:
                                pass
                            continue
                except Exception:
                    pass
        attachment = Attachment(
            name=att_name,
            file_type=file_type,
            extension=ext,
            server_path=f"https://www.instagram.com/stories/{username}/",
            download_url=url,
        )
        post = Post(
            post_id=f'story_{story_id}',
            service_id='instagram',
            platform='instagram',
            creator_id=username,
            title=f'Story by @{username}',
            content='',
            published_at=post_date,
            attachments=[attachment],
        )
        posts.append(post)
        # Track pre-downloaded file for direct move
        if file_path and Path(file_path).exists():
            file_map[att_name] = file_path
    self.log(f"FastDL returned {len(posts)} stories ({len(file_map)} pre-downloaded)", 'info')
    return posts, file_map
async def _download_instagram_posts(self, creator_id: int, quiet: bool = False) -> Dict:
    """Download pending Instagram media from CDN URLs.

    Iterates every attachment in 'pending' status for the creator, downloads
    it via InstagramAdapter.download_media, extracts dimensions/duration and a
    thumbnail, then records the per-attachment outcome in the DB.

    Args:
        creator_id: DB id of the creator whose pending attachments to download.
        quiet: When True, skip active-sync status updates and progress events
            (both the initial banner and per-file updates).

    Returns:
        Dict with 'downloaded' and 'failed' counts; on the normal path also
        'downloaded_file_info' (per-file metadata dicts). The two early-return
        paths omit 'downloaded_file_info' — callers use .get() for it.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}
    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}
    instagram = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
    self.log(f"Downloading {len(pending)} Instagram media for @{creator['username']}", 'info')
    if not quiet:
        self._update_active_sync(creator_id, {
            'phase': 'downloading', 'status': f'Downloading {len(pending)} media files...',
            'total_files': len(pending), 'downloaded': 0
        })
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id, 'username': creator['username'],
            'status': f'Downloading {len(pending)} media files...',
            'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
        })
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    downloaded = 0
    failed = 0
    downloaded_file_info = []
    for i, att in enumerate(pending):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                # Orphaned attachment — its parent post row is gone.
                failed += 1
                continue
            # Layout: <base>/instagram/<username>/<YYYY-MM-DD>/
            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'instagram' / self._sanitize_filename(creator['username']) / post_date
            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                failed += 1
                continue
            if not quiet:
                # Per-file progress for the UI.
                self._update_active_sync(creator_id, {
                    'status': f'Downloading media {i + 1}/{len(pending)}...',
                    'phase': 'downloading',
                    'downloaded': downloaded,
                    'progress': i + 1,
                    'total_files': len(pending),
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id, 'username': creator['username'],
                    'status': f'Downloading media {i + 1}/{len(pending)}...',
                    'phase': 'downloading', 'downloaded': downloaded,
                    'progress': i + 1, 'total_files': len(pending),
                })
            # Determine output filename from attachment name or generate one
            filename = att.get('name') or att.get('local_filename') or f"media_{i}{att.get('extension', '.jpg')}"
            output_path = Path(output_dir) / filename
            self.db.update_attachment_status(att['id'], 'downloading')
            result = await instagram.download_media(download_url, output_path)
            if not result or not result.get('success'):
                error = result.get('error', 'Download failed') if result else 'No result'
                # Failed download: bump attempt counter so retries can be capped.
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=error,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue
            file_path = result.get('file_path')
            file_size = result.get('file_size', 0)
            # Classify by the *downloaded* file's extension (the adapter may
            # have picked a different extension than the attachment record).
            content_type = 'image'
            if file_path:
                ext = Path(file_path).suffix.lower()
                if ext in ('.mp4', '.mov', '.webm', '.avi'):
                    content_type = 'video'
            width, height, duration = None, None, None
            if file_path and content_type == 'video':
                width, height, duration = self._extract_dimensions(Path(file_path), 'video')
            elif file_path and content_type == 'image':
                width, height, _ = self._extract_dimensions(Path(file_path), 'image')
            thumbnail_data = None
            if file_path and Path(file_path).exists():
                thumbnail_data = self._generate_thumbnail(Path(file_path), content_type)
            self.db.update_attachment_status(att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name if file_path else None,
                file_size=file_size,
                width=width, height=height, duration=duration,
                thumbnail_data=thumbnail_data,
                downloaded_at=datetime.now().isoformat()
            )
            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
            downloaded += 1
            if file_path:
                # Collected for the post-sync notification summary.
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['username'],
                    'content_type': content_type
                })
        except Exception as e:
            # Per-item failure: record it and keep processing the queue.
            self.log(f"Error downloading Instagram post: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1
    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
async def _add_instagram_creator(self, username: str,
                                 auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register an Instagram creator, enriching the record with profile data.

    Returns a dict with 'success' and either 'creator' (the stored record
    plus its new DB id) or 'error'.
    """
    adapter = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
    profile_url = InstagramAdapter.normalize_creator_url(username)
    # Pull profile metadata first for richer data (avatar, bio, display name),
    # then resolve the creator record itself.
    profile = await adapter.get_profile_info(username)
    info = await adapter.get_creator(profile_url)
    if not info:
        return {'success': False, 'error': 'Instagram creator not found'}
    record = info.to_dict()
    record['auto_download'] = 1 if auto_download else 0
    record['download_embeds'] = 1 if download_embeds else 0
    if profile:
        # Cache the avatar locally so CDN URL expiry doesn't break it.
        avatar_url = profile.get('avatar_url')
        if avatar_url:
            try:
                local_url = await self._cache_profile_image(
                    avatar_url, 'instagram', username, 'avatar'
                )
                if local_url:
                    record['profile_image_url'] = local_url
            except Exception as e:
                self.log(f"Failed to cache avatar for @{username}: {e}", 'warning')
        # Copy over any non-empty profile fields we track.
        for field in ('bio', 'display_name', 'post_count', 'external_links'):
            value = profile.get(field)
            if value:
                record[field] = value
    new_id = self.db.add_creator(record)
    return {'success': True, 'creator': {'id': new_id, **record}}
# =========================================================================
# Soundgasm + Liltsome sync/download/add
# =========================================================================
async def _sync_soundgasm_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Soundgasm creator - fetch audio from Soundgasm and Liltsome archive.

    Args:
        creator: Creator row dict (needs 'id', 'creator_id', 'username',
            and optionally 'auto_download').
        download: When True and the creator has auto_download enabled, also
            download the pending audio files after the metadata sync.
        scheduled: Forwarded to the notification sender so it can distinguish
            scheduled from manual syncs.

    Returns:
        SyncResult with new post/attachment counts and download tallies;
        SyncResult(success=False, error=...) on any unhandled exception.
    """
    creator_id = creator['id']
    self.log(f"Syncing Soundgasm creator: {creator['username']}", 'info')
    sync_data = {
        'username': creator['username'],
        'platform': 'soundgasm',
        'service': 'soundgasm',
        'status': 'Fetching audio posts...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
    try:
        client = self._get_soundgasm_client()
        # Get known post IDs for incremental sync (no dates on Soundgasm)
        known_post_ids = set()
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,)
            )
            known_post_ids = {row[0] for row in cursor.fetchall()}
        def progress_callback(count):
            # Relays fetch progress from the client to the UI.
            self._update_active_sync(creator_id, {
                'status': f'Fetched {count} audio posts...',
                'posts_fetched': count,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {count} audio posts...',
                'phase': 'fetching',
                'posts_fetched': count,
            })
        posts = await client.get_posts(
            creator['creator_id'],
            known_post_ids=known_post_ids,
            progress_callback=progress_callback,
        )
        if not posts:
            # Nothing new: stamp last_checked and finish without a download pass.
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)
        self.log(f"Found {len(posts)} new audio posts for {creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing', 'total_posts': len(posts)
        })
        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                # Apply bracket auto-tags (tag names the client extracted from
                # [bracketed] fragments of the audio title).
                if is_new_post and hasattr(post, 'auto_tags') and post.auto_tags:
                    for tag_name in post.auto_tags:
                        try:
                            slug = tag_name.lower().replace(' ', '-').replace('/', '-')
                            tag = self.db.get_tag_by_slug(slug)
                            if not tag:
                                display = format_tag_display(tag_name)
                                tag_id = self.db.create_tag(
                                    display, color='#8b5cf6',
                                    description='Auto-extracted from audio title'
                                )
                            else:
                                tag_id = tag['id']
                            if tag_id:
                                self.db.add_tag_to_post(post_db_id, tag_id)
                        except Exception:
                            pass  # Don't let tagging errors break sync
                self._apply_auto_tag_rules(post_db_id, is_new_post)
        # Update post count and notify frontend
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
        self._emit_event('paid_content_creator_updated', {
            'creator_id': creator_id,
            'post_count': current_count,
        })
        # NOTE(review): post_count was already written just above; this second
        # update also sets last_checked — consolidating both into a single
        # update_creator call would save a redundant write. Confirm intent.
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'post_count': self.db.get_creator_post_count(creator_id)
        })
        # Download audio files
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_soundgasm_audio(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )
    except Exception as e:
        # Any failure unwinds here: clear the active-sync entry and report.
        self.log(f"Error syncing Soundgasm creator {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
async def _download_soundgasm_audio(self, creator_id: int) -> Dict:
    """Download pending Soundgasm/Liltsome audio files via direct HTTP.

    Args:
        creator_id: DB id of the creator whose pending attachments to download.

    Returns:
        Dict with 'downloaded' and 'failed' counts; on the normal path also
        'downloaded_file_info' (per-file metadata dicts). The two early-return
        paths omit 'downloaded_file_info' — callers use .get() for it.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}
    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}
    client = self._get_soundgasm_client()
    self.log(f"Downloading {len(pending)} audio files for {creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {len(pending)} audio files...',
        'total_files': len(pending), 'downloaded': 0
    })
    self._emit_event('paid_content_sync_progress', {
        'creator_id': creator_id, 'username': creator['username'],
        'status': f'Downloading {len(pending)} audio files...',
        'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
    })
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    downloaded = 0
    failed = 0
    downloaded_file_info = []
    for i, att in enumerate(pending):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                # Orphaned attachment — its parent post row is gone.
                failed += 1
                continue
            # Layout: <base>/soundgasm/<username>/ (no per-date subfolders —
            # Soundgasm posts carry no publish dates).
            output_dir = base_path / 'soundgasm' / self._sanitize_filename(creator['username'])
            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                failed += 1
                continue
            # Per-file progress for the UI.
            self._update_active_sync(creator_id, {
                'status': f'Downloading audio {i + 1}/{len(pending)}...',
                'phase': 'downloading',
                'downloaded': downloaded,
                'progress': i + 1,
                'total_files': len(pending),
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id, 'username': creator['username'],
                'status': f'Downloading audio {i + 1}/{len(pending)}...',
                'phase': 'downloading', 'downloaded': downloaded,
                'progress': i + 1, 'total_files': len(pending),
            })
            filename = att.get('name') or att.get('local_filename') or f"audio_{i}.m4a"
            output_path = Path(output_dir) / filename
            self.db.update_attachment_status(att['id'], 'downloading')
            result = await client.download_audio(download_url, output_path)
            if not result or not result.get('success'):
                error = result.get('error', 'Download failed') if result else 'No result'
                # Failed download: bump attempt counter so retries can be capped.
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=error,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue
            file_path = result.get('file_path')
            file_size = result.get('file_size', 0)
            # Extract duration via ffprobe
            width, height, duration = None, None, None
            if file_path:
                _, _, duration = self._extract_dimensions(Path(file_path), 'audio')
            self.db.update_attachment_status(att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name if file_path else None,
                file_size=file_size,
                width=width, height=height, duration=duration,
                downloaded_at=datetime.now().isoformat()
            )
            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
            downloaded += 1
            if file_path:
                # Collected for the post-sync notification summary.
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['username'],
                    'content_type': 'audio'
                })
        except Exception as e:
            # Per-item failure: record it and keep processing the queue.
            self.log(f"Error downloading Soundgasm audio: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1
    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
async def _add_soundgasm_creator(self, username: str,
                                 auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register a Soundgasm creator (checks both Soundgasm and Liltsome).

    Returns a dict with 'success' and either 'creator' (the stored record
    plus its new DB id) or 'error'.
    """
    client = self._get_soundgasm_client()
    profile = await client.get_profile_info(username)
    if not profile:
        return {'success': False, 'error': 'Creator not found on Soundgasm or Liltsome archive'}
    # Soundgasm exposes no display name separate from the handle, so the
    # username fills creator_id and display_name as well.
    record = {
        'service_id': 'soundgasm',
        'platform': 'soundgasm',
        'creator_id': username,
        'username': username,
        'display_name': username,
        'post_count': profile.get('post_count', 0),
        'auto_download': 1 if auto_download else 0,
        'download_embeds': 1 if download_embeds else 0,
    }
    new_id = self.db.add_creator(record)
    return {'success': True, 'creator': {'id': new_id, **record}}
# ------------------------------------------------------------------
# Bellazon (forum thread scraping)
# ------------------------------------------------------------------
async def _add_bellazon_creator(self, topic_id: str,
                                auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register a Bellazon forum thread as a trackable creator.

    Returns a dict with 'success' and either 'creator' (the stored record
    plus its new DB id) or 'error'.
    """
    client = self._get_bellazon_client()
    profile = await client.get_profile_info(topic_id)
    if not profile:
        return {'success': False, 'error': 'Thread not found on Bellazon'}
    # The topic URL is stashed in 'bio' so sync can rebuild the thread
    # address later without re-resolving it.
    record = {
        'service_id': 'bellazon',
        'platform': 'bellazon',
        'creator_id': topic_id,
        'username': profile['username'],
        'display_name': profile['display_name'],
        'post_count': profile.get('post_count', 0),
        'bio': profile.get('topic_url', ''),
        'auto_download': 1 if auto_download else 0,
        'download_embeds': 1 if download_embeds else 0,
    }
    new_id = self.db.add_creator(record)
    return {'success': True, 'creator': {'id': new_id, **record}}
async def _sync_bellazon_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Bellazon forum thread — scrape pages for image/video posts.

    Unlike the Soundgasm sync there is no early return when no new posts are
    found: the download pass and failed-attachment cleanup still run, so
    attachments left pending/failed by earlier syncs get retried and pruned.

    Args:
        creator: Creator row dict (needs 'id', 'creator_id', 'username';
            'bio' may hold the thread URL captured at add time).
        download: When True and the creator has auto_download enabled, also
            download pending media after the metadata sync.
        scheduled: Forwarded to the notification sender.

    Returns:
        SyncResult with new post/attachment counts and download tallies;
        SyncResult(success=False, error=...) on any unhandled exception.
    """
    creator_id = creator['id']
    self.log(f"Syncing Bellazon thread: {creator['display_name'] or creator['username']}", 'info')
    sync_data = {
        'username': creator['username'],
        'platform': 'bellazon',
        'service': 'bellazon',
        'status': 'Fetching forum posts...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
    try:
        client = self._get_bellazon_client()
        # Get known post IDs for incremental sync
        known_post_ids = set()
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,)
            )
            known_post_ids = {row[0] for row in cursor.fetchall()}
        # Reconstruct topic URL from bio (stored at add time) or from creator_id + username
        topic_url = creator.get('bio', '').strip()
        if not topic_url or 'bellazon.com' not in topic_url:
            topic_url = f"{client.BASE_URL}/topic/{creator['creator_id']}-{creator['username']}"
        def progress_callback(count):
            # Relays scrape progress from the client to the UI.
            self._update_active_sync(creator_id, {
                'status': f'Fetched {count} media posts...',
                'posts_fetched': count,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {count} media posts...',
                'phase': 'fetching',
                'posts_fetched': count,
            })
        posts = await client.get_posts(
            creator['creator_id'],
            topic_url,
            known_post_ids=known_post_ids,
            progress_callback=progress_callback,
        )
        if posts:
            self.log(f"Found {len(posts)} new media posts for {creator['username']}", 'info')
            self._update_active_sync(creator_id, {
                'status': f'Processing {len(posts)} posts...',
                'phase': 'processing', 'total_posts': len(posts)
            })
        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                self._apply_auto_tag_rules(post_db_id, is_new_post)
        # Refresh post count and notify the frontend.
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
        self._emit_event('paid_content_creator_updated', {
            'creator_id': creator_id,
            'post_count': current_count,
        })
        # NOTE(review): post_count was already written just above; this second
        # update also sets last_checked — could be merged into one call.
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'post_count': self.db.get_creator_post_count(creator_id)
        })
        # Download media files
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_bellazon_media(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])
        # Clean up permanently failed attachments and empty posts
        self._cleanup_bellazon_failed(creator_id)
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )
    except Exception as e:
        # Any failure unwinds here: clear the active-sync entry and report.
        self.log(f"Error syncing Bellazon thread {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
def _cleanup_bellazon_failed(self, creator_id: int):
    """Remove permanently failed attachments and delete posts left with zero attachments.

    Also refreshes attachment_count on the surviving posts and the creator's
    post_count. All errors are downgraded to warnings — cleanup must never
    abort a sync.

    Args:
        creator_id: DB id of the creator whose records to prune.
    """
    try:
        # Bug fix: both counters must be pre-initialized. removed_posts was
        # previously assigned only inside the `removed_atts > 0` branch, so
        # the summary check below raised a NameError (silently caught by the
        # outer except) whenever no failed attachments existed — which also
        # skipped the creator post-count refresh.
        removed_atts = 0
        removed_posts = 0
        with self.unified_db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            # Delete failed attachments for this creator
            cursor.execute("""
                DELETE FROM paid_content_attachments
                WHERE status = 'failed'
                AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
            """, (creator_id,))
            removed_atts = cursor.rowcount
            if removed_atts > 0:
                # Delete posts that now have zero attachments
                cursor.execute("""
                    DELETE FROM paid_content_posts
                    WHERE creator_id = ?
                    AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
                """, (creator_id,))
                removed_posts = cursor.rowcount
                # Update attachment_count on remaining posts
                cursor.execute("""
                    UPDATE paid_content_posts
                    SET attachment_count = (
                        SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
                    )
                    WHERE creator_id = ?
                """, (creator_id,))
            conn.commit()
        if removed_atts or removed_posts:
            self.log(f"Cleanup: removed {removed_atts} failed attachments, {removed_posts} empty posts", 'info')
        # Update creator post count
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
    except Exception as e:
        self.log(f"Error during bellazon cleanup: {e}", 'warning')
async def _download_bellazon_media(self, creator_id: int) -> Dict:
    """Download pending Bellazon images/videos via direct HTTP.

    Streams each pending attachment with a shared aiohttp session (120 s total
    timeout per request) using the Bellazon client's standard request headers,
    then records per-attachment outcomes in the DB.

    Args:
        creator_id: DB id of the creator whose pending attachments to download.

    Returns:
        Dict with 'downloaded' and 'failed' counts; on the normal path also
        'downloaded_file_info' (per-file metadata dicts). The two early-return
        paths omit 'downloaded_file_info' — callers use .get() for it.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}
    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}
    self.log(f"Downloading {len(pending)} media files for {creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
        'total_files': len(pending), 'downloaded': 0
    })
    self._emit_event('paid_content_sync_progress', {
        'creator_id': creator_id, 'username': creator['username'],
        'status': f'Downloading {len(pending)} files...',
        'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
    })
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    headers = BellazonClient.HEADERS
    downloaded = 0
    failed = 0
    downloaded_file_info = []
    timeout = aiohttp.ClientTimeout(total=120)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for i, att in enumerate(pending):
            try:
                post = self.db.get_post(att['post_id'])
                if not post:
                    # Orphaned attachment — its parent post row is gone.
                    failed += 1
                    continue
                # Organize by date: paid_content/bellazon/{slug}/{YYYY-MM-DD}/
                date_str = (post.get('published_at') or '')[:10] or 'unknown-date'
                output_dir = base_path / 'bellazon' / self._sanitize_filename(creator['username']) / date_str
                download_url = att.get('download_url')
                if not download_url:
                    self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                    failed += 1
                    continue
                # Per-file progress for the UI.
                self._update_active_sync(creator_id, {
                    'status': f'Downloading {i + 1}/{len(pending)}...',
                    'phase': 'downloading',
                    'downloaded': downloaded,
                    'progress': i + 1,
                    'total_files': len(pending),
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id, 'username': creator['username'],
                    'status': f'Downloading {i + 1}/{len(pending)}...',
                    'phase': 'downloading', 'downloaded': downloaded,
                    'progress': i + 1, 'total_files': len(pending),
                })
                filename = att.get('name') or att.get('local_filename') or f"media_{i}.jpg"
                output_dir.mkdir(parents=True, exist_ok=True)
                output_path = output_dir / filename
                self.db.update_attachment_status(att['id'], 'downloading')
                # Direct HTTP download
                try:
                    async with session.get(download_url, headers=headers) as resp:
                        if resp.status != 200:
                            self.db.update_attachment_status(att['id'], 'failed',
                                error_message=f'HTTP {resp.status}',
                                download_attempts=att.get('download_attempts', 0) + 1,
                                last_attempt=datetime.now().isoformat()
                            )
                            failed += 1
                            continue
                        # Stream to disk in 64 KiB chunks, counting bytes as we go.
                        total = 0
                        async with aiofiles.open(str(output_path), 'wb') as f:
                            async for chunk in resp.content.iter_chunked(65536):
                                await f.write(chunk)
                                total += len(chunk)
                except Exception as dl_err:
                    # Network/IO error on this file: record and move on.
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message=str(dl_err),
                        download_attempts=att.get('download_attempts', 0) + 1,
                        last_attempt=datetime.now().isoformat()
                    )
                    failed += 1
                    continue
                file_path = str(output_path)
                file_size = total
                # Extract dimensions for images/videos
                width, height, duration = None, None, None
                file_type = att.get('file_type', 'image')
                if file_path:
                    width, height, duration = self._extract_dimensions(Path(file_path), file_type)
                self.db.update_attachment_status(att['id'], 'completed',
                    local_path=file_path,
                    local_filename=Path(file_path).name,
                    file_size=file_size,
                    width=width, height=height, duration=duration,
                    downloaded_at=datetime.now().isoformat()
                )
                self.db.mark_post_downloaded(att['post_id'])
                self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                downloaded += 1
                if file_path:
                    # Collected for the post-sync notification summary.
                    downloaded_file_info.append({
                        'file_path': file_path,
                        'filename': Path(file_path).name,
                        'source': creator['username'],
                        'content_type': file_type
                    })
            except Exception as e:
                # Per-item failure: record it and keep processing the queue.
                self.log(f"Error downloading Bellazon media: {e}", 'error')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=str(e),
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
# -------------------------------------------------------------------------
# BestEyeCandy
# -------------------------------------------------------------------------
async def _add_besteyecandy_creator(self, creator_id_str: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a BestEyeCandy celeb as a creator.
creator_id_str is in 'cid/slug' format (e.g. '800/Myleene-Klass')
or just 'cid' if slug wasn't available.
"""
client = self._get_besteyecandy_client()
# Parse cid and slug from creator_id_str
if '/' in creator_id_str:
cid, celeb_slug = creator_id_str.split('/', 1)
else:
cid = creator_id_str
celeb_slug = cid # fallback
profile_info = await client.get_profile_info(cid, celeb_slug)
if not profile_info:
return {'success': False, 'error': 'Celeb not found on BestEyeCandy'}
# Ensure scraper row exists for cookie storage
try:
with self.unified_db.get_connection(for_write=True) as conn:
cursor = conn.cursor()
cursor.execute("SELECT id FROM scrapers WHERE id = ?", ('besteyecandy',))
if not cursor.fetchone():
cursor.execute(
"INSERT INTO scrapers (id, name, enabled) VALUES (?, ?, ?)",
('besteyecandy', 'BestEyeCandy', 1)
)
conn.commit()
self.log("Created besteyecandy scraper row", 'info')
except Exception as e:
self.log(f"Error ensuring scraper row: {e}", 'warning')
creator_data = {
'service_id': 'besteyecandy',
'platform': 'besteyecandy',
'creator_id': cid,
'username': profile_info['username'],
'display_name': profile_info['display_name'],
'post_count': profile_info.get('post_count', 0),
'bio': profile_info.get('celeb_url', ''),
'auto_download': 1 if auto_download else 0,
'download_embeds': 1 if download_embeds else 0,
}
db_id = self.db.add_creator(creator_data)
return {'success': True, 'creator': {'id': db_id, **creator_data}}
    async def _sync_besteyecandy_creator(self, creator: Dict, download: bool = True,
                                         scheduled: bool = False) -> SyncResult:
        """Sync a BestEyeCandy celeb -- scrape listing pages for photo posts.

        Flow: register an active-sync entry for UI progress, fetch only photos
        whose post IDs are not already stored (incremental), upsert posts and
        attachments, optionally download pending media, then clean up
        permanently-failed rows and emit a completion event.

        Args:
            creator: creator row dict (reads 'id', 'username', 'display_name',
                'creator_id', 'auto_download').
            download: if False, the media-download phase is skipped entirely.
            scheduled: forwarded to the notification helper so it can tell
                scheduled runs from manual ones.

        Returns:
            SyncResult with new-post/attachment counts and download stats;
            success=False with the error string on any exception.
        """
        creator_id = creator['id']
        self.log(f"Syncing BestEyeCandy celeb: {creator['display_name'] or creator['username']}",
                 'info')
        sync_data = {
            'username': creator['username'],
            'platform': 'besteyecandy',
            'service': 'besteyecandy',
            'status': 'Fetching photo listings...',
            'phase': 'fetching',
        }
        self._register_active_sync(creator_id, sync_data)
        self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
        try:
            client = self._get_besteyecandy_client()
            # Get known post IDs for incremental sync -- the client uses them
            # to skip photos that were already stored on a previous run.
            known_post_ids = set()
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                    (creator_id,)
                )
                known_post_ids = {row[0] for row in cursor.fetchall()}
            cid = creator['creator_id']
            celeb_slug = creator['username']

            def progress_callback(status_msg):
                # Relay the client's page-by-page status to the UI.
                self._update_active_sync(creator_id, {
                    'status': status_msg,
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'status': status_msg,
                    'phase': 'fetching',
                })

            posts = await client.get_posts(
                cid, celeb_slug,
                known_post_ids=known_post_ids,
                progress_callback=progress_callback,
            )
            if posts:
                self.log(f"Found {len(posts)} new photos for {creator['username']}", 'info')
                self._update_active_sync(creator_id, {
                    'status': f'Processing {len(posts)} photos...',
                    'phase': 'processing', 'total_posts': len(posts)
                })
            new_posts = 0
            new_attachments = 0
            for post in posts:
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
                if post_db_id:
                    if is_new_post:
                        new_posts += 1
                    for idx, attachment in enumerate(post.attachments):
                        att_data = attachment.to_dict()
                        att_data['attachment_index'] = idx
                        if self.db.upsert_attachment(post_db_id, att_data):
                            new_attachments += 1
                    self._apply_auto_tag_rules(post_db_id, is_new_post)
                    # Refresh the visible post count and notify listeners per
                    # post so the UI updates while processing is in flight.
                    current_count = self.db.get_creator_post_count(creator_id)
                    self.db.update_creator(creator_id, {'post_count': current_count})
                    self._emit_event('paid_content_creator_updated', {
                        'creator_id': creator_id,
                        'post_count': current_count,
                    })
            self.db.update_creator(creator_id, {
                'last_checked': datetime.now().isoformat(),
                'post_count': self.db.get_creator_post_count(creator_id)
            })
            # Download media files (only when both the call and the creator's
            # own auto_download flag opt in).
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            if download and creator.get('auto_download', True):
                result = await self._download_besteyecandy_media(creator_id)
                downloaded = result.get('downloaded', 0)
                failed = result.get('failed', 0)
                downloaded_file_info = result.get('downloaded_file_info', [])
            # Clean up permanently failed attachments and empty posts
            self._cleanup_besteyecandy_failed(creator_id)
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': new_posts, 'new_attachments': new_attachments,
                'downloaded': downloaded, 'failed': failed
            })
            self._send_creator_notification(creator, new_posts, downloaded,
                                            downloaded_file_info, scheduled=scheduled)
            return SyncResult(
                success=True, new_posts=new_posts, new_attachments=new_attachments,
                downloaded_files=downloaded, failed_files=failed,
                downloaded_file_info=downloaded_file_info
            )
        except Exception as e:
            self.log(f"Error syncing BestEyeCandy celeb {creator['username']}: {e}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
            })
            return SyncResult(success=False, error=str(e))
def _cleanup_besteyecandy_failed(self, creator_id: int):
"""Remove permanently failed attachments and delete posts left with zero attachments."""
try:
with self.unified_db.get_connection(for_write=True) as conn:
cursor = conn.cursor()
cursor.execute("""
DELETE FROM paid_content_attachments
WHERE status = 'failed'
AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
""", (creator_id,))
removed_atts = cursor.rowcount
if removed_atts > 0:
cursor.execute("""
DELETE FROM paid_content_posts
WHERE creator_id = ?
AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
""", (creator_id,))
removed_posts = cursor.rowcount
cursor.execute("""
UPDATE paid_content_posts
SET attachment_count = (
SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
)
WHERE creator_id = ?
""", (creator_id,))
conn.commit()
if removed_atts or removed_posts:
self.log(f"Cleanup: removed {removed_atts} failed attachments, "
f"{removed_posts} empty posts", 'info')
current_count = self.db.get_creator_post_count(creator_id)
self.db.update_creator(creator_id, {'post_count': current_count})
except Exception as e:
self.log(f"Error during besteyecandy cleanup: {e}", 'warning')
    async def _download_besteyecandy_media(self, creator_id: int) -> Dict:
        """Download pending BestEyeCandy images via direct HTTP with cookies.

        Fetches every attachment still pending for this creator, one at a time
        with a 2s pause between files, streaming each to disk and updating the
        attachment row (status / path / size / dimensions) as it goes. Per-file
        failures are recorded and counted but never abort the whole run.

        Args:
            creator_id: DB id of the creator whose pending attachments to fetch.

        Returns:
            Dict with 'downloaded' / 'failed' counts, plus
            'downloaded_file_info' entries for each saved file.
        """
        creator = self.db.get_creator(creator_id)
        if not creator:
            return {'downloaded': 0, 'failed': 0}
        pending = self.db.get_pending_attachments(creator_id=creator_id)
        if not pending:
            return {'downloaded': 0, 'failed': 0}
        self.log(f"Downloading {len(pending)} images for {creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
            'total_files': len(pending), 'downloaded': 0
        })
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id, 'username': creator['username'],
            'status': f'Downloading {len(pending)} files...',
            'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
        })
        base_path = Path(self.config.get('base_download_path', '/paid-content'))
        client = self._get_besteyecandy_client()
        headers = BestEyeCandyClient.HEADERS
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        timeout = aiohttp.ClientTimeout(total=120)
        # The session comes from the client so its stored cookies ride along.
        async with client._create_session(timeout=timeout) as session:
            for i, att in enumerate(pending):
                try:
                    post = self.db.get_post(att['post_id'])
                    if not post:
                        failed += 1
                        continue
                    # Organize: paid_content/besteyecandy/{slug}/ -- one flat
                    # directory per celeb (no per-post subfolders).
                    output_dir = base_path / 'besteyecandy' / self._sanitize_filename(
                        creator['username'])
                    download_url = att.get('download_url')
                    if not download_url:
                        self.db.update_attachment_status(att['id'], 'failed',
                            error_message='No download URL')
                        failed += 1
                        continue
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {i + 1}/{len(pending)}...',
                        'phase': 'downloading',
                        'downloaded': downloaded,
                        'progress': i + 1,
                        'total_files': len(pending),
                    })
                    self._emit_event('paid_content_sync_progress', {
                        'creator_id': creator_id, 'username': creator['username'],
                        'status': f'Downloading {i + 1}/{len(pending)}...',
                        'phase': 'downloading', 'downloaded': downloaded,
                        'progress': i + 1, 'total_files': len(pending),
                    })
                    filename = att.get('name') or att.get('local_filename') or f"photo_{i}.jpg"
                    output_dir.mkdir(parents=True, exist_ok=True)
                    output_path = output_dir / filename
                    self.db.update_attachment_status(att['id'], 'downloading')
                    # Direct HTTP download with cookies, streamed to disk in
                    # 64 KiB chunks so files never sit fully in memory.
                    try:
                        async with session.get(download_url, headers=headers) as resp:
                            if resp.status != 200:
                                self.db.update_attachment_status(att['id'], 'failed',
                                    error_message=f'HTTP {resp.status}',
                                    download_attempts=att.get('download_attempts', 0) + 1,
                                    last_attempt=datetime.now().isoformat()
                                )
                                failed += 1
                                continue
                            total = 0
                            async with aiofiles.open(str(output_path), 'wb') as f:
                                async for chunk in resp.content.iter_chunked(65536):
                                    await f.write(chunk)
                                    total += len(chunk)
                    except Exception as dl_err:
                        self.db.update_attachment_status(att['id'], 'failed',
                            error_message=str(dl_err),
                            download_attempts=att.get('download_attempts', 0) + 1,
                            last_attempt=datetime.now().isoformat()
                        )
                        failed += 1
                        continue
                    file_path = str(output_path)
                    file_size = total
                    # Extract dimensions for images
                    width, height, duration = None, None, None
                    file_type = att.get('file_type', 'image')
                    if file_path:
                        width, height, duration = self._extract_dimensions(
                            Path(file_path), file_type)
                    self.db.update_attachment_status(att['id'], 'completed',
                        local_path=file_path,
                        local_filename=Path(file_path).name,
                        file_size=file_size,
                        width=width, height=height, duration=duration,
                        downloaded_at=datetime.now().isoformat()
                    )
                    self.db.mark_post_downloaded(att['post_id'])
                    self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                    downloaded += 1
                    if file_path:
                        downloaded_file_info.append({
                            'file_path': file_path,
                            'filename': Path(file_path).name,
                            'source': creator['username'],
                            'content_type': file_type
                        })
                    # Rate limit: 2s between downloads
                    await asyncio.sleep(2)
                except Exception as e:
                    self.log(f"Error downloading BestEyeCandy media: {e}", 'error')
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message=str(e),
                        download_attempts=att.get('download_attempts', 0) + 1,
                        last_attempt=datetime.now().isoformat()
                    )
                    failed += 1
        return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
# -------------------------------------------------------------------------
# Coppermine Gallery
# -------------------------------------------------------------------------
async def _add_coppermine_creator(self, creator_id_str: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a Coppermine gallery as a creator.
creator_id_str is domain/path (e.g. 'kylie-jenner.org/gallery').
"""
client = self._get_coppermine_client()
# Reconstruct gallery URL from creator_id
gallery_url = f"https://{creator_id_str}"
profile_info = await client.get_profile_info(gallery_url)
if not profile_info:
return {'success': False, 'error': 'Gallery not found or not a Coppermine gallery'}
creator_data = {
'service_id': 'coppermine',
'platform': 'coppermine',
'creator_id': creator_id_str,
'username': profile_info['username'],
'display_name': profile_info['display_name'],
'post_count': profile_info.get('post_count', 0),
'bio': gallery_url,
'auto_download': 1 if auto_download else 0,
'download_embeds': 1 if download_embeds else 0,
}
db_id = self.db.add_creator(creator_data)
return {'success': True, 'creator': {'id': db_id, **creator_data}}
    async def _sync_coppermine_creator(self, creator: Dict, download: bool = True,
                                       scheduled: bool = False) -> SyncResult:
        """Sync a Coppermine gallery — crawl categories/albums, upsert + download per album.

        Unlike the simpler platform handlers, this one streams: every album the
        crawler finds is upserted immediately via `on_album`, and its pending
        attachments are pushed onto a queue served by a pool of five download
        workers that run concurrently with the crawl.

        Args:
            creator: creator row dict ('id', 'username', 'creator_id',
                'bio' holds the gallery URL, 'auto_download').
            download: if False, no download session or workers are created.
            scheduled: forwarded to the notification helper.

        Returns:
            SyncResult with new-post/attachment and download counts;
            success=False with the error string on any exception.
        """
        creator_id = creator['id']
        do_download = download and creator.get('auto_download', True)
        self.log(f"Syncing Coppermine gallery: {creator['display_name'] or creator['username']}",
                 'info')
        sync_data = {
            'username': creator['username'],
            'platform': 'coppermine',
            'service': 'coppermine',
            'status': 'Crawling gallery categories...',
            'phase': 'fetching',
        }
        self._register_active_sync(creator_id, sync_data)
        self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
        try:
            client = self._get_coppermine_client()
            # Get known post IDs for incremental sync
            known_post_ids = set()
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                    (creator_id,)
                )
                known_post_ids = {row[0] for row in cursor.fetchall()}
            # Reconstruct gallery URL from bio field; fall back to the
            # creator_id (which stores domain/path) if bio isn't a URL.
            gallery_url = (creator.get('bio') or '').strip()
            if not gallery_url or not gallery_url.startswith('http'):
                gallery_url = f"https://{creator['creator_id']}"

            def progress_callback(status_msg):
                # Relay crawler status messages to the UI.
                self._update_active_sync(creator_id, {
                    'status': status_msg,
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id,
                    'username': creator['username'],
                    'status': status_msg,
                    'phase': 'fetching',
                })

            # Streaming state — upsert each album immediately, download via worker pool
            new_posts = 0
            new_attachments = 0
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            base_path = Path(self.config.get('base_download_path', '/paid-content'))
            headers = CoppermineClient.HEADERS
            # No overall timeout: large files are bounded per connect/read.
            dl_timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=120)
            dl_session = aiohttp.ClientSession(timeout=dl_timeout) if do_download else None
            dl_queue: asyncio.Queue = asyncio.Queue()
            dl_done = asyncio.Event()  # signals workers to stop

            async def _dl_worker():
                """Worker that pulls (attachment, output_dir) items from the download queue."""
                nonlocal downloaded, failed
                while True:
                    try:
                        # Poll with a short timeout so the worker can notice
                        # the shutdown event once the queue has run dry.
                        att, output_dir = await asyncio.wait_for(dl_queue.get(), timeout=2.0)
                    except asyncio.TimeoutError:
                        if dl_done.is_set() and dl_queue.empty():
                            return
                        continue
                    try:
                        download_url = att.get('download_url')
                        if not download_url:
                            self.db.update_attachment_status(
                                att['id'], 'failed', error_message='No download URL')
                            failed += 1
                            continue
                        filename = att.get('name') or att.get('local_filename') or 'photo.jpg'
                        output_dir.mkdir(parents=True, exist_ok=True)
                        output_path = output_dir / filename
                        self.db.update_attachment_status(att['id'], 'downloading')
                        try:
                            async with dl_session.get(download_url, headers=headers) as resp:
                                if resp.status != 200:
                                    self.db.update_attachment_status(att['id'], 'failed',
                                        error_message=f'HTTP {resp.status}',
                                        download_attempts=att.get('download_attempts', 0) + 1,
                                        last_attempt=datetime.now().isoformat())
                                    failed += 1
                                    continue
                                total = 0
                                # Stream to disk in 64 KiB chunks.
                                async with aiofiles.open(str(output_path), 'wb') as f:
                                    async for chunk in resp.content.iter_chunked(65536):
                                        await f.write(chunk)
                                        total += len(chunk)
                        except Exception as dl_err:
                            self.db.update_attachment_status(att['id'], 'failed',
                                error_message=str(dl_err),
                                download_attempts=att.get('download_attempts', 0) + 1,
                                last_attempt=datetime.now().isoformat())
                            failed += 1
                            continue
                        file_path = str(output_path)
                        file_type = att.get('file_type', 'image')
                        width, height, duration = self._extract_dimensions(
                            Path(file_path), file_type)
                        self.db.update_attachment_status(att['id'], 'completed',
                            local_path=file_path,
                            local_filename=Path(file_path).name,
                            file_size=total,
                            width=width, height=height, duration=duration,
                            downloaded_at=datetime.now().isoformat())
                        self.db.mark_post_downloaded(att['post_id'])
                        self.db.increment_creator_download_stats(creator_id, 1, total or 0)
                        downloaded += 1
                        downloaded_file_info.append({
                            'file_path': file_path,
                            'filename': Path(file_path).name,
                            'source': creator['username'],
                            'content_type': file_type,
                        })
                    except Exception as e:
                        self.log(f"Download worker error: {e}", 'error')
                    finally:
                        # Always acknowledge the item so dl_queue.join() can
                        # complete even on failure paths.
                        dl_queue.task_done()

            async def on_album(post):
                # Crawler callback: persist one album and queue its media.
                nonlocal new_posts, new_attachments
                # Upsert post
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
                if not post_db_id:
                    return
                if is_new_post:
                    new_posts += 1
                # Upsert attachments
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                self._apply_auto_tag_rules(post_db_id, is_new_post)
                current_count = self.db.get_creator_post_count(creator_id)
                self.db.update_creator(creator_id, {'post_count': current_count})
                self._emit_event('paid_content_creator_updated', {
                    'creator_id': creator_id,
                    'post_count': current_count,
                })
                # Queue downloads (don't block crawling)
                if do_download and dl_session:
                    pending = self.db.get_pending_attachments_for_post(post_db_id)
                    if pending:
                        album_title = self._sanitize_filename(
                            post.content or post.title or f"album_{post.post_id}")
                        domain = self._sanitize_filename(creator['username'])
                        output_dir = base_path / 'coppermine' / domain / album_title
                        for att in pending:
                            await dl_queue.put((att, output_dir))

            # Start download workers
            workers = []
            if do_download and dl_session:
                workers = [asyncio.create_task(_dl_worker()) for _ in range(5)]
            try:
                await client.get_posts(
                    gallery_url,
                    known_post_ids=known_post_ids,
                    progress_callback=progress_callback,
                    post_callback=on_album,
                )
                # Signal workers that no more items will be added, then wait for queue drain
                if workers:
                    self._update_active_sync(creator_id, {
                        'status': f'Finishing downloads ({dl_queue.qsize()} remaining)...',
                        'phase': 'downloading',
                    })
                    await dl_queue.join()
                    dl_done.set()
                    await asyncio.gather(*workers, return_exceptions=True)
            finally:
                # On error paths this releases any still-polling workers and
                # closes the shared download session.
                dl_done.set()
                if dl_session:
                    await dl_session.close()
            self.db.update_creator(creator_id, {
                'last_checked': datetime.now().isoformat(),
                'post_count': self.db.get_creator_post_count(creator_id)
            })
            # Clean up permanently failed attachments and empty posts
            self._cleanup_coppermine_failed(creator_id)
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': new_posts, 'new_attachments': new_attachments,
                'downloaded': downloaded, 'failed': failed
            })
            self._send_creator_notification(creator, new_posts, downloaded,
                                            downloaded_file_info, scheduled=scheduled)
            return SyncResult(
                success=True, new_posts=new_posts, new_attachments=new_attachments,
                downloaded_files=downloaded, failed_files=failed,
                downloaded_file_info=downloaded_file_info
            )
        except Exception as e:
            self.log(f"Error syncing Coppermine gallery {creator['username']}: {e}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
            })
            return SyncResult(success=False, error=str(e))
def _cleanup_coppermine_failed(self, creator_id: int):
"""Remove permanently failed attachments and delete posts left with zero attachments."""
try:
with self.unified_db.get_connection(for_write=True) as conn:
cursor = conn.cursor()
cursor.execute("""
DELETE FROM paid_content_attachments
WHERE status = 'failed'
AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
""", (creator_id,))
removed_atts = cursor.rowcount
if removed_atts > 0:
cursor.execute("""
DELETE FROM paid_content_posts
WHERE creator_id = ?
AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
""", (creator_id,))
removed_posts = cursor.rowcount
cursor.execute("""
UPDATE paid_content_posts
SET attachment_count = (
SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
)
WHERE creator_id = ?
""", (creator_id,))
conn.commit()
if removed_atts or removed_posts:
self.log(f"Cleanup: removed {removed_atts} failed attachments, {removed_posts} empty posts", 'info')
current_count = self.db.get_creator_post_count(creator_id)
self.db.update_creator(creator_id, {'post_count': current_count})
except Exception as e:
self.log(f"Error during coppermine cleanup: {e}", 'warning')
    async def _download_coppermine_media(self, creator_id: int) -> Dict:
        """Download pending Coppermine images via direct HTTP (no auth needed).

        Sequential fallback downloader: fetches every attachment still pending
        for this creator with a 0.5s pause between files, streaming each to
        disk and updating the attachment row as it goes.

        NOTE(review): the streaming sync above downloads inline via its worker
        pool; this method looks like a retry/manual path — confirm callers.

        Args:
            creator_id: DB id of the creator whose pending attachments to fetch.

        Returns:
            Dict with 'downloaded' / 'failed' counts, plus
            'downloaded_file_info' entries for each saved file.
        """
        creator = self.db.get_creator(creator_id)
        if not creator:
            return {'downloaded': 0, 'failed': 0}
        pending = self.db.get_pending_attachments(creator_id=creator_id)
        if not pending:
            return {'downloaded': 0, 'failed': 0}
        self.log(f"Downloading {len(pending)} images for {creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
            'total_files': len(pending), 'downloaded': 0
        })
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id, 'username': creator['username'],
            'status': f'Downloading {len(pending)} files...',
            'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
        })
        base_path = Path(self.config.get('base_download_path', '/paid-content'))
        headers = CoppermineClient.HEADERS
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        timeout = aiohttp.ClientTimeout(total=120)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for i, att in enumerate(pending):
                try:
                    post = self.db.get_post(att['post_id'])
                    if not post:
                        failed += 1
                        continue
                    # Organize: /paid-content/coppermine/{domain}/{album-title}/
                    album_title = self._sanitize_filename(post.get('content') or post.get('title') or f"album_{post.get('post_id', 'unknown')}")
                    domain = self._sanitize_filename(creator['username'])
                    output_dir = base_path / 'coppermine' / domain / album_title
                    download_url = att.get('download_url')
                    if not download_url:
                        self.db.update_attachment_status(att['id'], 'failed',
                            error_message='No download URL')
                        failed += 1
                        continue
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {i + 1}/{len(pending)}...',
                        'phase': 'downloading',
                        'downloaded': downloaded,
                        'progress': i + 1,
                        'total_files': len(pending),
                    })
                    self._emit_event('paid_content_sync_progress', {
                        'creator_id': creator_id, 'username': creator['username'],
                        'status': f'Downloading {i + 1}/{len(pending)}...',
                        'phase': 'downloading', 'downloaded': downloaded,
                        'progress': i + 1, 'total_files': len(pending),
                    })
                    filename = att.get('name') or att.get('local_filename') or f"photo_{i}.jpg"
                    output_dir.mkdir(parents=True, exist_ok=True)
                    output_path = output_dir / filename
                    self.db.update_attachment_status(att['id'], 'downloading')
                    # Direct HTTP download (no auth needed), streamed to disk
                    # in 64 KiB chunks so files never sit fully in memory.
                    try:
                        async with session.get(download_url, headers=headers) as resp:
                            if resp.status != 200:
                                self.db.update_attachment_status(att['id'], 'failed',
                                    error_message=f'HTTP {resp.status}',
                                    download_attempts=att.get('download_attempts', 0) + 1,
                                    last_attempt=datetime.now().isoformat()
                                )
                                failed += 1
                                continue
                            total = 0
                            async with aiofiles.open(str(output_path), 'wb') as f:
                                async for chunk in resp.content.iter_chunked(65536):
                                    await f.write(chunk)
                                    total += len(chunk)
                    except Exception as dl_err:
                        self.db.update_attachment_status(att['id'], 'failed',
                            error_message=str(dl_err),
                            download_attempts=att.get('download_attempts', 0) + 1,
                            last_attempt=datetime.now().isoformat()
                        )
                        failed += 1
                        continue
                    file_path = str(output_path)
                    file_size = total
                    # Extract dimensions for images
                    width, height, duration = None, None, None
                    file_type = att.get('file_type', 'image')
                    if file_path:
                        width, height, duration = self._extract_dimensions(
                            Path(file_path), file_type)
                    self.db.update_attachment_status(att['id'], 'completed',
                        local_path=file_path,
                        local_filename=Path(file_path).name,
                        file_size=file_size,
                        width=width, height=height, duration=duration,
                        downloaded_at=datetime.now().isoformat()
                    )
                    self.db.mark_post_downloaded(att['post_id'])
                    self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                    downloaded += 1
                    if file_path:
                        downloaded_file_info.append({
                            'file_path': file_path,
                            'filename': Path(file_path).name,
                            'source': creator['username'],
                            'content_type': file_type
                        })
                    # Rate limit: 0.5s between downloads (no auth = lighter)
                    await asyncio.sleep(0.5)
                except Exception as e:
                    self.log(f"Error downloading Coppermine media: {e}", 'error')
                    self.db.update_attachment_status(att['id'], 'failed',
                        error_message=str(e),
                        download_attempts=att.get('download_attempts', 0) + 1,
                        last_attempt=datetime.now().isoformat()
                    )
                    failed += 1
        return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
# -------------------------------------------------------------------------
# XenForo Forums (HQCelebCorner, PicturePub, etc.)
# -------------------------------------------------------------------------
async def _add_xenforo_creator(self, service_id: str, celebrity_name: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a XenForo forum celebrity as a creator (creator_id = celebrity name)."""
client = self._get_xenforo_client(service_id)
# Search for threads to validate the celebrity name returns results
threads = await client.search_threads(celebrity_name)
if not threads:
return {'success': False, 'error': f'No threads found for "{celebrity_name}" on {service_id}'}
# Sanitize name for username
username = re.sub(r'[^a-zA-Z0-9_-]', '-', celebrity_name.lower()).strip('-')
username = re.sub(r'-+', '-', username)
# Build search URL for re-use during sync
from urllib.parse import quote_plus
search_url = f"{client.BASE_URL}/index.php?search/&q={quote_plus(celebrity_name)}&c[title_only]=1&o=date"
creator_data = {
'service_id': service_id,
'platform': service_id,
'creator_id': celebrity_name,
'username': username,
'display_name': celebrity_name,
'post_count': len(threads),
'bio': search_url,
'auto_download': 1 if auto_download else 0,
'download_embeds': 1 if download_embeds else 0,
}
db_id = self.db.add_creator(creator_data)
self.log(f"Added {service_id} creator: {celebrity_name} ({len(threads)} threads found)", 'info')
return {'success': True, 'creator': {'id': db_id, **creator_data}}
async def _sync_xenforo_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
"""Sync a XenForo forum celebrity — search threads, scrape images."""
creator_id = creator['id']
service_id = creator['service_id']
celebrity_name = creator['creator_id'] # The search query
self.log(f"Syncing {service_id}: {celebrity_name}", 'info')
sync_data = {
'username': creator['username'],
'platform': service_id,
'service': service_id,
'status': 'Searching threads...',
'phase': 'fetching',
}
self._register_active_sync(creator_id, sync_data)
self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
_dl_done = asyncio.Event()
_dl_task = None
try:
client = self._get_xenforo_client(service_id)
# Get known post IDs and their stored metadata (reply counts)
known_post_ids = set()
known_metadata = {}
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT post_id, metadata FROM paid_content_posts WHERE creator_id = ?",
(creator_id,)
)
for row in cursor.fetchall():
known_post_ids.add(row[0])
if row[1]:
try:
known_metadata[row[0]] = json.loads(row[1])
except (json.JSONDecodeError, TypeError):
pass
# Search for threads
threads = await client.search_threads(celebrity_name)
if not threads:
self.log(f"No threads found for {celebrity_name}", 'info')
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
self._unregister_active_sync(creator_id)
return SyncResult(success=True, new_posts=0, new_attachments=0)
self._update_active_sync(creator_id, {
'status': f'Found {len(threads)} threads, checking for updates...',
'posts_fetched': len(threads),
})
new_posts = 0
new_attachments = 0
threads_to_scrape = []
for thread in threads:
post_id = f"thread_{thread['thread_id']}"
if post_id in known_post_ids:
# Known thread — skip for now (update detection happens on
# re-scrape when we have a way to check reply counts efficiently)
continue
else:
# New thread — queue for scraping (page_count=None lets
# get_thread_images auto-detect it from page 1)
threads_to_scrape.append({
**thread,
'post_id': post_id,
'is_update': False,
'start_page': 1,
'page_count': None,
'reply_count': 0,
})
self.log(f"{len(threads_to_scrape)} threads to scrape ({len(threads)} total found)", 'info')
self._update_active_sync(creator_id, {
'status': f'Scraping {len(threads_to_scrape)} threads...',
'phase': 'processing',
})
# Concurrent download loop — downloads while scraping continues
_dl_results = {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}
async def _bg_download_loop():
await asyncio.sleep(15) # Let some posts queue up first
while not _dl_done.is_set():
r = await self._download_xenforo_media(creator_id, quiet=True)
_dl_results['downloaded'] += r.get('downloaded', 0)
_dl_results['failed'] += r.get('failed', 0)
_dl_results['downloaded_file_info'].extend(r.get('downloaded_file_info', []))
if r.get('downloaded', 0) == 0:
try:
await asyncio.wait_for(_dl_done.wait(), timeout=10)
except asyncio.TimeoutError:
pass
if download and creator.get('auto_download', True):
_dl_task = asyncio.create_task(_bg_download_loop())
# Track direct image URLs across threads to detect signatures
from collections import Counter
_cross_thread_urls = Counter()
# Shared per-host semaphores for URL resolution across all threads
_resolve_sem: Dict[str, asyncio.Semaphore] = {}
# Semaphore to limit concurrent thread scraping (avoid hammering the forum)
_scrape_sem = asyncio.Semaphore(3)
_threads_done = 0
# Lock for DB writes (not thread-safe from concurrent coroutines)
_db_lock = asyncio.Lock()
async def _scrape_resolve_and_store(thread: Dict) -> None:
"""Scrape a thread, resolve image URLs, and write to DB immediately."""
nonlocal _threads_done, new_posts, new_attachments
async with _scrape_sem:
image_links = await client.get_thread_images(
thread['url'],
page_count=thread['page_count'],
start_page=thread['start_page'],
)
if not image_links:
_threads_done += 1
self._update_active_sync(creator_id, {
'status': f'Processed {_threads_done}/{len(threads_to_scrape)} threads...',
'phase': 'processing',
})
return
# Resolve image host URLs to direct URLs concurrently
async def _resolve_one(img_info: Dict, session: aiohttp.ClientSession) -> Optional[Attachment]:
img_url = img_info['url']
# Skip dead hosts
if any(dh in img_url.lower() for dh in XenForoForumClient.DEAD_HOSTS):
return None
if img_info.get('host') == 'direct':
direct_url = img_url
else:
try:
host = urlparse(img_url).netloc.lower()
except Exception:
host = '_default'
if host not in _resolve_sem:
_resolve_sem[host] = asyncio.Semaphore(3)
async with _resolve_sem[host]:
direct_url = await client.resolve_image_url(img_url, session=session)
await asyncio.sleep(0.3)
if direct_url:
filename = client._filename_from_url(direct_url)
ext = client._get_extension(filename)
_cross_thread_urls[direct_url] += 1
return Attachment(
name=filename,
file_type='image' if ext in client.IMAGE_EXTS else 'unknown',
extension=ext or None,
server_path=img_url,
download_url=direct_url,
)
return None
resolve_timeout = aiohttp.ClientTimeout(total=30)
async with aiohttp.ClientSession(timeout=resolve_timeout) as session:
# Batch image resolution to avoid spawning hundreds of coroutines
_IMG_BATCH = 50
attachments = []
for ri in range(0, len(image_links), _IMG_BATCH):
img_batch = image_links[ri:ri + _IMG_BATCH]
results = await asyncio.gather(*[_resolve_one(img, session) for img in img_batch])
attachments.extend(a for a in results if a is not None)
_threads_done += 1
self._update_active_sync(creator_id, {
'status': f'Processed {_threads_done}/{len(threads_to_scrape)} threads...',
'phase': 'processing',
})
if not attachments:
return
# Filter out signature images (same URL in 3+ threads)
attachments = [a for a in attachments if _cross_thread_urls.get(a.download_url, 0) < 3]
if not attachments:
return
post = Post(
post_id=thread['post_id'],
service_id=service_id,
platform=service_id,
creator_id=celebrity_name,
title=thread['title'],
content='',
published_at=thread.get('published_at'),
)
post.attachments = attachments
post_data = post.to_dict()
post_data['metadata'] = json.dumps({
'reply_count': thread.get('reply_count', 0),
'thread_id': thread['thread_id'],
'thread_url': thread['url'],
})
# DB writes under lock so background downloader sees them immediately
async with _db_lock:
post_db_id, is_new_post = self.db.upsert_post(creator_id, post_data)
if post_db_id:
if is_new_post:
new_posts += 1
for att_idx, attachment in enumerate(attachments):
att_data = attachment.to_dict()
att_data['attachment_index'] = att_idx
if self.db.upsert_attachment(post_db_id, att_data):
new_attachments += 1
self._apply_auto_tag_rules(post_db_id, is_new_post)
# Scrape + resolve + store all threads concurrently
self._update_active_sync(creator_id, {
'status': f'Scraping {len(threads_to_scrape)} threads (3 concurrent)...',
'phase': 'processing',
})
# Process threads in batches to avoid spawning thousands of coroutines
_THREAD_BATCH = 50
for i in range(0, len(threads_to_scrape), _THREAD_BATCH):
batch = threads_to_scrape[i:i + _THREAD_BATCH]
await asyncio.gather(
*[_scrape_resolve_and_store(t) for t in batch]
)
# Stop concurrent download loop and collect its results
_dl_done.set()
if _dl_task:
await _dl_task
downloaded = _dl_results['downloaded']
failed = _dl_results['failed']
downloaded_file_info = list(_dl_results['downloaded_file_info'])
self.db.update_creator(creator_id, {
'last_checked': datetime.now().isoformat(),
'post_count': self.db.get_creator_post_count(creator_id),
})
# Final download sweep for remaining attachments
if download and creator.get('auto_download', True):
result = await self._download_xenforo_media(creator_id)
downloaded += result.get('downloaded', 0)
failed += result.get('failed', 0)
downloaded_file_info.extend(result.get('downloaded_file_info', []))
# Clean up permanently failed attachments and empty posts
self._cleanup_xenforo_failed(creator_id)
self._unregister_active_sync(creator_id)
self._emit_event('paid_content_sync_completed', {
'creator_id': creator_id, 'username': creator['username'],
'new_posts': new_posts, 'new_attachments': new_attachments,
'downloaded': downloaded, 'failed': failed,
})
self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
return SyncResult(
success=True, new_posts=new_posts, new_attachments=new_attachments,
downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info,
)
except Exception as e:
_dl_done.set()
if _dl_task:
try:
await _dl_task
except Exception:
pass
self.log(f"Error syncing {service_id} {celebrity_name}: {e}", 'error')
import traceback
self.log(traceback.format_exc(), 'debug')
self._unregister_active_sync(creator_id)
self._emit_event('paid_content_sync_error', {
'creator_id': creator_id, 'username': creator['username'], 'error': str(e),
})
return SyncResult(success=False, error=str(e))
def _cleanup_xenforo_failed(self, creator_id: int):
"""Remove permanently failed attachments and delete posts left with zero attachments."""
try:
with self.unified_db.get_connection(for_write=True) as conn:
cursor = conn.cursor()
cursor.execute("""
DELETE FROM paid_content_attachments
WHERE status = 'failed'
AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
""", (creator_id,))
removed_atts = cursor.rowcount
if removed_atts > 0:
cursor.execute("""
DELETE FROM paid_content_posts
WHERE creator_id = ?
AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
""", (creator_id,))
removed_posts = cursor.rowcount
cursor.execute("""
UPDATE paid_content_posts
SET attachment_count = (
SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
)
WHERE creator_id = ?
""", (creator_id,))
conn.commit()
if removed_atts or removed_posts:
self.log(f"XenForo cleanup: removed {removed_atts} failed attachments, {removed_posts} empty posts", 'info')
current_count = self.db.get_creator_post_count(creator_id)
self.db.update_creator(creator_id, {'post_count': current_count})
except Exception as e:
self.log(f"Error during XenForo cleanup: {e}", 'warning')
async def _download_xenforo_media(self, creator_id: int, quiet: bool = False) -> Dict:
    """Download pending XenForo forum images via concurrent HTTP.

    Args:
        creator_id: DB id of the creator whose pending attachments are fetched.
        quiet: When True (background-loop mode) progress updates and events
            are suppressed.

    Returns:
        Dict with 'downloaded' and 'failed' counts; on the main path also
        'downloaded_file_info' (one entry per saved file).
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}
    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}
    service_id = creator['service_id']
    self.log(f"Downloading {len(pending)} media files for {creator['display_name']}", 'info')
    if not quiet:
        self._update_active_sync(creator_id, {
            'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
            'total_files': len(pending), 'downloaded': 0,
        })
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id, 'username': creator['username'],
            'status': f'Downloading {len(pending)} files...',
            'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
        })
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    celebrity_dir = self._sanitize_filename(creator['display_name'] or creator['username'])
    downloaded = 0
    failed = 0
    completed_count = 0  # counts every finished attempt, success or failure
    downloaded_file_info = []
    # Per-host semaphores to avoid overwhelming individual image hosts
    _host_semaphores: Dict[str, asyncio.Semaphore] = {}
    _MAX_PER_HOST = 4
    # Hosts that need lower concurrency (known to reject/throttle)
    _SENSITIVE_HOSTS = {'pixhost.to': 2}

    def _get_host_sem(url: str) -> asyncio.Semaphore:
        # Lazily create one semaphore per base domain.
        try:
            host = urlparse(url).netloc.lower()
        except Exception:
            host = '_default'
        # Group subdomains under base domain (e.g. img1.pixhost.to → pixhost.to)
        parts = host.rsplit('.', 2)
        base_domain = '.'.join(parts[-2:]) if len(parts) >= 2 else host
        # Use base domain as key so all pixhost subdomains share one semaphore
        sem_key = base_domain
        if sem_key not in _host_semaphores:
            limit = _SENSITIVE_HOSTS.get(base_domain, _MAX_PER_HOST)
            _host_semaphores[sem_key] = asyncio.Semaphore(limit)
        return _host_semaphores[sem_key]

    async def _download_one(att: Dict, session: aiohttp.ClientSession) -> None:
        # Download a single attachment; mutates the shared counters and
        # updates the attachment's DB status on every exit path.
        nonlocal downloaded, failed, completed_count
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                failed += 1
                completed_count += 1
                return
            thread_title = self._sanitize_filename(post.get('title') or 'unknown-thread')
            output_dir = base_path / service_id / celebrity_dir / thread_title
            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                failed += 1
                completed_count += 1
                return
            # Skip dead hosts entirely
            dl_lower = download_url.lower()
            if any(dh in dl_lower for dh in XenForoForumClient.DEAD_HOSTS):
                self.db.update_attachment_status(att['id'], 'failed', error_message='Dead host')
                failed += 1
                completed_count += 1
                return
            filename = att.get('name') or att.get('local_filename') or f"media_{att['id']}.jpg"
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = output_dir / filename
            # Skip if already exists
            if output_path.exists() and output_path.stat().st_size > 0:
                self.db.update_attachment_status(att['id'], 'completed',
                    local_path=str(output_path),
                    local_filename=output_path.name,
                    file_size=output_path.stat().st_size,
                    downloaded_at=datetime.now().isoformat()
                )
                self.db.mark_post_downloaded(att['post_id'])
                downloaded += 1
                completed_count += 1
                return
            # Image hosts typically require a same-site Referer to serve full images.
            headers = dict(XenForoForumClient.HEADERS)
            try:
                host_domain = urlparse(att.get('server_path', download_url)).netloc
                headers['Referer'] = f'https://{host_domain}/'
            except Exception:
                pass
            host_sem = _get_host_sem(download_url)
            # Retry up to 3 times for transient errors (connection resets, incomplete responses)
            last_err = None
            for attempt in range(3):
                async with host_sem:
                    # Mark as downloading only when we actually hold the semaphore
                    if attempt == 0:
                        self.db.update_attachment_status(att['id'], 'downloading')
                    try:
                        async with session.get(download_url, headers=headers) as resp:
                            if resp.status != 200:
                                last_err = f'HTTP {resp.status}'
                                break  # Non-transient, don't retry
                            total = 0
                            async with aiofiles.open(str(output_path), 'wb') as f:
                                async for chunk in resp.content.iter_chunked(65536):
                                    await f.write(chunk)
                                    total += len(chunk)
                            last_err = None
                            break  # Success
                    except Exception as dl_err:
                        last_err = str(dl_err)
                        # Clean up partial file
                        try:
                            if output_path.exists():
                                output_path.unlink()
                        except Exception:
                            pass
                        # Don't retry DNS failures or permanent connection errors
                        err_lower = last_err.lower()
                        if 'name or service not known' in err_lower or 'no address associated' in err_lower:
                            break
                # Back off before retrying (2s, then 4s); only reached when the
                # attempt ended in a retryable exception (success/HTTP-error break out).
                if attempt < 2:
                    await asyncio.sleep(2 * (attempt + 1))
            if last_err:
                # 404/410 = permanently gone, don't show in failed queue
                if last_err in ('HTTP 404', 'HTTP 410'):
                    status = 'gone'
                else:
                    status = 'failed'
                self.db.update_attachment_status(att['id'], status,
                    error_message=last_err,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                completed_count += 1
                return
            # Success path: `total` was set by the streaming loop above.
            file_path = str(output_path)
            file_size = total
            width, height, duration = None, None, None
            file_type = att.get('file_type', 'image')
            if file_path:
                width, height, duration = self._extract_dimensions(Path(file_path), file_type)
            # Detect image-host placeholder/thumbnail images and discard them.
            # Real forum photos are 800px+ on the long side; anything under
            # 500px is a thumbnail, placeholder, or "image removed" stub.
            if file_type == 'image' and width and height:
                max_dim = max(width, height)
                if max_dim < 500:
                    try:
                        output_path.unlink(missing_ok=True)
                    except Exception:
                        pass
                    self.db.update_attachment_status(att['id'], 'gone',
                        error_message=f'Thumbnail/placeholder ({width}x{height})')
                    failed += 1
                    completed_count += 1
                    return
            self.db.update_attachment_status(att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name,
                file_size=file_size,
                width=width, height=height, duration=duration,
                downloaded_at=datetime.now().isoformat()
            )
            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
            downloaded += 1
            completed_count += 1
            if file_path:
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['display_name'] or creator['username'],
                    'content_type': file_type,
                })
            if not quiet:
                self._update_active_sync(creator_id, {
                    'status': f'Downloaded {downloaded}/{len(pending)}...',
                    'phase': 'downloading',
                    'downloaded': downloaded,
                    'progress': completed_count,
                    'total_files': len(pending),
                })
        except Exception as e:
            # Catch-all so one bad attachment never aborts the whole batch.
            self.log(f"Error downloading XenForo media: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1
            completed_count += 1

    # Process in batches to avoid spawning tens of thousands of coroutines
    _BATCH_SIZE = 50
    timeout = aiohttp.ClientTimeout(total=120)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for i in range(0, len(pending), _BATCH_SIZE):
            batch = pending[i:i + _BATCH_SIZE]
            tasks = [_download_one(att, session) for att in batch]
            await asyncio.gather(*tasks)
    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
# -------------------------------------------------------------------------
# Snapchat
# -------------------------------------------------------------------------
async def _add_snapchat_creator(self, username: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a Snapchat creator"""
client = self._get_snapchat_client()
# Single fetch — get_creator_info fetches the profile page once
raw_info = await asyncio.to_thread(client.get_creator_info, username)
if not raw_info:
return {'success': False, 'error': 'Snapchat creator not found'}
creator_data = {
'service_id': 'snapchat',
'platform': 'snapchat',
'creator_id': username,
'username': raw_info.get('creator_name', username),
'display_name': raw_info.get('creator_name'),
'profile_image_url': raw_info.get('profile_image_url'),
'auto_download': 1 if auto_download else 0,
'download_embeds': 1 if download_embeds else 0,
}
db_id = self.db.add_creator(creator_data)
# Cache profile image
if raw_info.get('profile_image_url'):
cached = await self._cache_profile_image(raw_info['profile_image_url'], 'snapchat', username, 'avatar')
if cached:
self.db.update_creator(db_id, {'profile_image_url': cached})
creator_data['profile_image_url'] = cached
return {'success': True, 'creator': {'id': db_id, **creator_data}}
async def _sync_snapchat_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Snapchat creator - fetch spotlights/highlights and optionally download them

    Args:
        creator: Creator row from the DB (uses 'id', 'username', 'creator_id',
            'last_post_date', 'auto_download').
        download: When True, download pending attachments after upserting posts.
        scheduled: True for automated runs — narrows the fetch window to the
            last 3 days instead of everything since last_post_date.

    Returns:
        SyncResult summarizing new posts/attachments and download counts;
        success=False with the error message on failure.
    """
    creator_id = creator['id']
    self.log(f"Syncing Snapchat creator: {creator['username']}", 'info')
    sync_data = {
        'username': creator['username'],
        'platform': 'snapchat',
        'service': 'snapchat',
        'status': 'Fetching snaps...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
    try:
        client = self._get_snapchat_client()
        username = creator['creator_id']
        # Update profile info (avatar, display name) — best-effort, never fatal.
        try:
            profile_info = await asyncio.to_thread(client.get_creator_info, username)
            if profile_info:
                profile_updates = {}
                if profile_info.get('creator_name') and profile_info['creator_name'] != username:
                    profile_updates['display_name'] = profile_info['creator_name']
                if profile_info.get('profile_image_url'):
                    cached = await self._cache_profile_image(profile_info['profile_image_url'], 'snapchat', username, 'avatar')
                    # Fall back to the remote URL when local caching fails.
                    profile_updates['profile_image_url'] = cached or profile_info['profile_image_url']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated Snapchat profile for @{username}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update Snapchat profile: {e}", 'debug')
        # Determine date cutoff
        from datetime import timedelta
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')
        self._update_active_sync(creator_id, {
            'status': 'Fetching spotlights and highlights...',
            'phase': 'fetching'
        })
        posts = await asyncio.to_thread(client.get_posts, username, since_date)
        if not posts:
            # Nothing new: just bump last_checked and report an empty sync.
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)
        self.log(f"Found {len(posts)} collections for @{username}", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing', 'total_posts': len(posts)
        })
        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                # Apply auto tags (e.g. "Spotlight", "Highlight")
                if is_new_post and post.auto_tags:
                    for tag_name in post.auto_tags:
                        tag = self.db.get_tag_by_slug(tag_name.lower().replace(' ', '-'))
                        if not tag:
                            # Create the tag on first use with a category color.
                            tag_id = self.db.create_tag(tag_name,
                                color='#eab308' if tag_name == 'Spotlight' else '#8b5cf6',
                                description=f'Snapchat {tag_name}')
                        else:
                            tag_id = tag['id']
                        if tag_id:
                            self.db.add_tag_to_post(post_db_id, tag_id)
                self._apply_auto_tag_rules(post_db_id, is_new_post)
        # Advance the sync cursor to the newest post we saw.
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_snapchat_posts(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )
    except Exception as e:
        self.log(f"Error syncing Snapchat creator @{creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
async def _download_snapchat_posts(self, creator_id: int) -> Dict:
    """Download pending Snapchat attachments via curl_cffi

    Files are fetched sequentially (one at a time) through the Snapchat
    client's download_snap, saved under
    <base>/snapchat/<username>/<YYYY-MM-DD>/.

    Args:
        creator_id: DB id of the creator whose pending attachments to fetch.

    Returns:
        Dict with 'downloaded' and 'failed' counts; on the main path also
        'downloaded_file_info' entries for notification purposes.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}
    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}
    client = self._get_snapchat_client()
    self.log(f"Downloading {len(pending)} Snapchat files for @{creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
        'total_files': len(pending), 'downloaded': 0
    })
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    downloaded = 0
    failed = 0
    downloaded_file_info = []
    for i, att in enumerate(pending):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                failed += 1
                continue
            # Organize output by the post's publish date (YYYY-MM-DD).
            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'snapchat' / self._sanitize_filename(creator['username']) / post_date
            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No media URL')
                failed += 1
                continue
            self._update_active_sync(creator_id, {
                'status': f'Downloading file {i + 1}/{len(pending)}...',
                'downloaded': downloaded
            })
            filename = att.get('name') or f"snap_{i}.mp4"
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = str(output_dir / filename)
            self.db.update_attachment_status(att['id'], 'downloading')
            # Blocking curl_cffi download runs off the event loop.
            success = await asyncio.to_thread(client.download_snap, download_url, output_path)
            if not success:
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='Download failed',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue
            fp = Path(output_path)
            # Treat zero-byte results as failures too.
            if not fp.exists() or fp.stat().st_size == 0:
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='Empty file',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue
            f_size = fp.stat().st_size
            ext = fp.suffix.lower()
            file_type = att.get('file_type', 'video')
            # Probe media dimensions (and duration for videos).
            w, h, dur = None, None, None
            if file_type == 'video':
                w, h, dur = self._extract_dimensions(fp, 'video')
            else:
                w, h, _ = self._extract_dimensions(fp, 'image')
            thumb_data = None
            if file_type == 'video':
                thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
            self.db.update_attachment_status(att['id'], 'completed',
                local_path=str(fp),
                local_filename=fp.name,
                name=fp.name,
                extension=ext,
                file_size=f_size,
                file_type=file_type,
                width=w, height=h, duration=dur,
                thumbnail_data=thumb_data,
                downloaded_at=datetime.now().isoformat()
            )
            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, f_size)
            downloaded += 1
            downloaded_file_info.append({
                'file_path': str(fp),
                'filename': fp.name,
                'source': creator['username'],
                'content_type': file_type
            })
        except Exception as e:
            # One bad file must not abort the rest of the queue.
            self.log(f"Error downloading Snapchat file: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1
    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
# =========================================================================
# REDDIT METHODS
# =========================================================================
async def _add_reddit_creator(self, subreddit: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a Reddit subreddit as a creator"""
client = self._get_reddit_client()
info = await asyncio.to_thread(client.get_subreddit_info, subreddit)
if not info:
return {'success': False, 'error': 'Could not access subreddit'}
creator_data = {
'service_id': 'reddit',
'platform': 'reddit',
'creator_id': info.get('creator_id', subreddit.lower()),
'username': info.get('creator_name', f'r/{subreddit}'),
'display_name': info.get('display_name') or info.get('creator_name', f'r/{subreddit}'),
'profile_image_url': info.get('profile_image_url'),
'banner_image_url': info.get('banner_image_url'),
'bio': info.get('bio'),
'joined_date': info.get('joined_date'),
'auto_download': 1 if auto_download else 0,
'download_embeds': 1 if download_embeds else 0,
}
db_id = self.db.add_creator(creator_data)
# Cache profile and banner images locally
if info.get('profile_image_url'):
cached = await self._cache_profile_image(info['profile_image_url'], 'reddit', creator_data['creator_id'], 'avatar')
if cached:
self.db.update_creator(db_id, {'profile_image_url': cached})
creator_data['profile_image_url'] = cached
if info.get('banner_image_url'):
cached = await self._cache_profile_image(info['banner_image_url'], 'reddit', creator_data['creator_id'], 'banner')
if cached:
self.db.update_creator(db_id, {'banner_image_url': cached})
creator_data['banner_image_url'] = cached
return {'success': True, 'creator': {'id': db_id, **creator_data}}
async def _sync_reddit_creator(self, creator: Dict, download: bool = True, scheduled: bool = False,
                               force_backfill: bool = False) -> SyncResult:
    """Sync a Reddit subreddit - fetch posts and download media via gallery-dl.
    Processes files incrementally as gallery-dl downloads them — posts and
    attachments appear in the UI progressively instead of waiting for the
    entire download to finish.

    Args:
        creator: Creator row from the DB (uses 'id', 'creator_id', 'username',
            'last_post_date', 'auto_download').
        download: Accepted for interface parity with other sync methods;
            gallery-dl always downloads as it lists (see _on_batch).
        scheduled: True for automated runs — limits the window to the last
            3 days and caps the fetch at 500 recent posts.
        force_backfill: Run the Pullpush historical backfill even when this
            is not the first sync (ignored on scheduled runs).

    Returns:
        SyncResult summarizing new posts/attachments and download counts;
        success=False with the error message on failure.
    """
    creator_id = creator['id']
    subreddit = creator['creator_id']
    self.log(f"Syncing Reddit subreddit: r/{subreddit}", 'info')
    sync_data = {
        'username': creator['username'],
        'platform': 'reddit',
        'service': 'reddit',
        'status': 'Starting...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
    # gallery-dl stages files here; cleaned up in the finally block below.
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
    try:
        client = self._get_reddit_client()
        # Determine date cutoff
        from datetime import timedelta
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')
        # Update profile info (icon, banner, bio, joined_date, display_name)
        try:
            profile_info = await asyncio.to_thread(client.get_subreddit_info, subreddit)
            if profile_info:
                profile_updates = {}
                if profile_info.get('profile_image_url'):
                    cached = await self._cache_profile_image(profile_info['profile_image_url'], 'reddit', subreddit, 'avatar')
                    profile_updates['profile_image_url'] = cached or profile_info['profile_image_url']
                if profile_info.get('banner_image_url'):
                    cached = await self._cache_profile_image(profile_info['banner_image_url'], 'reddit', subreddit, 'banner')
                    profile_updates['banner_image_url'] = cached or profile_info['banner_image_url']
                if profile_info.get('display_name'):
                    profile_updates['display_name'] = profile_info['display_name']
                if profile_info.get('bio'):
                    profile_updates['bio'] = profile_info['bio']
                if profile_info.get('joined_date'):
                    profile_updates['joined_date'] = profile_info['joined_date']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
        except Exception as e:
            self.log(f"Failed to update Reddit profile: {e}", 'debug')
        self._update_active_sync(creator_id, {
            'status': 'Connecting to Reddit...',
            'phase': 'fetching'
        })
        # Shared counters (updated from gallery-dl thread)
        new_posts = 0
        new_attachments = 0
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        base_path = Path(self.config.get('base_download_path', '/paid-content'))
        latest_post_date = None

        def _on_progress(dl_count, skip_count, total_seen):
            # Called from the gallery-dl thread; only touches the status string.
            status = f'Fetching — {dl_count} downloaded'
            if skip_count:
                status += f', {skip_count} skipped'
            status += f' ({total_seen} total)'
            if downloaded > 0:
                status += f' | {downloaded} processed'
            self._update_active_sync(creator_id, {
                'status': status,
                'phase': 'fetching',
                'total_files': total_seen,
                'downloaded': downloaded,
            })

        def _on_batch(files):
            """Process a batch of downloaded files — runs in gallery-dl thread."""
            nonlocal new_posts, new_attachments, downloaded, failed, latest_post_date
            self.log(f"Processing batch of {len(files)} files from r/{subreddit}", 'debug')
            # Group files by post using JSON sidecars
            grouped = client._group_files_by_post(files, temp_dir, subreddit)
            self.log(f"Grouped into {len(grouped)} posts", 'debug')
            for post_id, post_data in grouped.items():
                post_dict = {
                    'post_id': post_id,
                    'title': post_data.get('title'),
                    'content': post_data.get('title'),
                    'published_at': post_data.get('date'),
                }
                post_db_id, is_new_post = self.db.upsert_post(creator_id, post_dict)
                if not post_db_id:
                    continue
                if is_new_post:
                    new_posts += 1
                self._apply_auto_tag_rules(post_db_id, is_new_post)
                # Track latest date
                if post_data.get('date'):
                    if not latest_post_date or post_data['date'] > latest_post_date:
                        latest_post_date = post_data['date']
                post_date = (post_data.get('date') or '')[:10] or 'unknown-date'
                output_dir = base_path / 'reddit' / self._sanitize_filename(subreddit) / post_date
                for idx, file_path in enumerate(post_data['files']):
                    try:
                        output_dir.mkdir(parents=True, exist_ok=True)
                        final_path = output_dir / file_path.name
                        already_existed = False
                        if final_path.exists() and final_path.stat().st_size > 0:
                            # File already at final path — skip the move but still create attachment record
                            file_path.unlink(missing_ok=True)
                            already_existed = True
                        elif final_path.exists():
                            # Zero-byte file at final path — give unique name
                            stem = final_path.stem
                            suffix = final_path.suffix
                            counter = 1
                            while final_path.exists():
                                final_path = output_dir / f"{stem}_{counter}{suffix}"
                                counter += 1
                        if not already_existed:
                            shutil.move(str(file_path), str(final_path))
                        fp = final_path
                        f_size = fp.stat().st_size if fp.exists() else 0
                        ext = fp.suffix.lower()
                        file_type = RedditClient._detect_file_type(ext)
                        # Probe dimensions (and duration for videos).
                        w, h, dur = None, None, None
                        if file_type == 'video':
                            w, h, dur = self._extract_dimensions(fp, 'video')
                        elif file_type == 'image':
                            w, h, _ = self._extract_dimensions(fp, 'image')
                        thumb_data = None
                        if file_type == 'video':
                            thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
                        att_data = {
                            'name': fp.name,
                            'server_path': str(fp),
                            'file_type': file_type,
                            'extension': ext,
                            'file_size': f_size,
                            'attachment_index': idx,
                        }
                        att_id = self.db.upsert_attachment(post_db_id, att_data)
                        if att_id:
                            self.db.update_attachment_status(att_id, 'completed',
                                local_path=str(fp),
                                local_filename=fp.name,
                                name=fp.name,
                                extension=ext,
                                file_size=f_size,
                                file_type=file_type,
                                width=w, height=h, duration=dur,
                                thumbnail_data=thumb_data,
                                downloaded_at=datetime.now().isoformat()
                            )
                            # Only count re-downloads once in the creator stats.
                            if not already_existed:
                                self.db.increment_creator_download_stats(creator_id, 1, f_size)
                            downloaded_file_info.append({
                                'file_path': str(fp),
                                'filename': fp.name,
                                'source': creator['username'],
                                'content_type': file_type
                            })
                            self.db.mark_post_downloaded(post_db_id)
                            downloaded += 1
                            new_attachments += 1
                    except Exception as e:
                        self.log(f"Error processing Reddit file {file_path.name}: {e}", 'error')
                        failed += 1

        # First sync (no last_post_date) = unlimited; scheduled = 500 recent
        is_first_sync = not creator.get('last_post_date')
        max_posts = 500 if scheduled else 0
        # Run gallery-dl with incremental batch processing
        result = await asyncio.to_thread(
            client.run_gallery_dl, subreddit, temp_dir, since_date, max_posts,
            _on_progress, _on_batch, 50)
        dl_total = result.get('dl_count', 0)
        skip_total = result.get('skip_count', 0)
        # Pullpush historical backfill on first sync or when forced
        if (is_first_sync or force_backfill) and not scheduled:
            try:
                # Mutable dict updated in place by the backfill helper.
                backfill_counters = {
                    'new_posts': 0, 'new_attachments': 0,
                    'downloaded': 0, 'failed': 0, 'latest_post_date': None
                }
                await self._backfill_reddit_pullpush(
                    creator_id, subreddit, base_path, creator,
                    temp_dir, client, downloaded_file_info, backfill_counters)
                new_posts += backfill_counters['new_posts']
                new_attachments += backfill_counters['new_attachments']
                downloaded += backfill_counters['downloaded']
                failed += backfill_counters['failed']
                if backfill_counters['latest_post_date']:
                    if not latest_post_date or backfill_counters['latest_post_date'] > latest_post_date:
                        latest_post_date = backfill_counters['latest_post_date']
            except Exception as e:
                self.log(f"Pullpush backfill failed for r/{subreddit}: {e}", 'error')
                import traceback
                self.log(traceback.format_exc(), 'debug')
        # Post-sync sweep: find files on disk without attachment records
        sweep_count = self._sweep_reddit_missing_attachments(
            creator_id, subreddit, base_path, downloaded_file_info, creator)
        if sweep_count > 0:
            self.log(f"Sweep found {sweep_count} orphaned files for r/{subreddit}", 'info')
            new_attachments += sweep_count
            downloaded += sweep_count
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed,
            'skipped': skip_total
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )
    except Exception as e:
        self.log(f"Error syncing Reddit r/{subreddit}: {e}", 'error')
        import traceback
        self.log(traceback.format_exc(), 'debug')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
    finally:
        # Always remove the gallery-dl staging directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
async def _backfill_reddit_pullpush(self, creator_id: int, subreddit: str,
                                    base_path: Path, creator: Dict,
                                    temp_dir: str, client,
                                    downloaded_file_info: list,
                                    counters: dict):
    """Backfill historical posts from Pullpush (Pushshift) archive.

    Called after the initial gallery-dl sync on first-time syncs to fetch
    posts older than what Reddit's listing API returns (~1000 posts).

    Phases:
      1. Fetch every archived post ID for the subreddit from Pullpush.
      2. Drop post IDs already present in paid_content_posts for this creator.
      3. Write the remaining post URLs to a temp file.
      4. Run gallery-dl over that URL list, processing downloaded files in
         batches via ``_on_backfill_batch``.

    Args:
        creator_id: DB row id of the creator being synced.
        subreddit: Subreddit name (without the "r/" prefix).
        base_path: Root download directory; files land under
            base_path/reddit/<subreddit>/<YYYY-MM-DD>/.
        creator: Creator row dict (used for notification metadata).
        temp_dir: Scratch directory owned by the caller (caller cleans it up).
        client: Reddit client exposing get_pullpush_post_ids(),
            _group_files_by_post() and run_gallery_dl_urls().
        downloaded_file_info: Mutable list; one dict per newly downloaded
            file is appended for push notifications.
        counters: Mutable dict with keys new_posts, new_attachments, downloaded,
            failed, latest_post_date — updated in place.
    """
    self.log(f"Starting Pullpush backfill for r/{subreddit}", 'info')
    # Phase 1: Fetch all post IDs from Pullpush
    self._update_active_sync(creator_id, {
        'status': 'Backfill — fetching post list from archive...',
        'phase': 'backfill'
    })
    def _pp_progress(count, msg):
        # Relay Pullpush pagination progress into the polled sync status.
        self._update_active_sync(creator_id, {
            'status': f'Backfill — {msg}',
            'phase': 'backfill'
        })
    # Pullpush fetching is blocking HTTP work; keep it off the event loop.
    pp_posts = await asyncio.to_thread(
        client.get_pullpush_post_ids, subreddit,
        progress_callback=_pp_progress)
    if not pp_posts:
        self.log(f"Pullpush returned no posts for r/{subreddit}", 'info')
        return
    self.log(f"Pullpush returned {len(pp_posts)} post IDs for r/{subreddit}", 'info')
    # Phase 2: Filter out post IDs already in our DB
    self._update_active_sync(creator_id, {
        'status': f'Backfill — filtering {len(pp_posts)} posts against DB...',
        'phase': 'backfill'
    })
    existing_post_ids = set()
    try:
        with self.db.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,))
            for row in cursor.fetchall():
                existing_post_ids.add(row[0])
    except Exception as e:
        # Without the existing-ID set we would re-download everything; abort.
        self.log(f"Error fetching existing post IDs for backfill: {e}", 'error')
        return
    # Build metadata lookup and filter new posts
    pp_metadata = {}
    new_pp_posts = []
    for pp in pp_posts:
        pid = pp['id']
        pp_metadata[pid] = pp
        if pid not in existing_post_ids:
            new_pp_posts.append(pp)
    self.log(f"Backfill: {len(new_pp_posts)} new posts out of {len(pp_posts)} total "
             f"({len(existing_post_ids)} already in DB)", 'info')
    if not new_pp_posts:
        self._update_active_sync(creator_id, {
            'status': 'Backfill — no new historical posts to fetch',
            'phase': 'backfill'
        })
        return
    # Phase 3: Write URLs to temp file
    urls_file = os.path.join(temp_dir, 'backfill_urls.txt')
    with open(urls_file, 'w') as f:
        for pp in new_pp_posts:
            f.write(f"https://www.reddit.com/r/{subreddit}/comments/{pp['id']}/\n")
    self._update_active_sync(creator_id, {
        'status': f'Backfill — downloading {len(new_pp_posts)} historical posts...',
        'phase': 'backfill'
    })
    # Phase 4: Run gallery-dl with --input-file using same batch processing
    def _on_backfill_progress(dl_count, skip_count, total_seen):
        # Compose a single-line status from gallery-dl's running counts.
        status = f'Backfill — {dl_count} downloaded'
        if skip_count:
            status += f', {skip_count} skipped'
        status += f' ({total_seen}/{len(new_pp_posts)} posts)'
        if counters['downloaded'] > 0:
            status += f' | {counters["downloaded"]} processed'
        self._update_active_sync(creator_id, {
            'status': status,
            'phase': 'backfill',
        })
    def _on_backfill_batch(files):
        """Process a batch of backfilled files — same logic as normal sync."""
        self.log(f"Backfill batch: {len(files)} files from r/{subreddit}", 'debug')
        grouped = client._group_files_by_post(files, temp_dir, subreddit)
        self.log(f"Backfill grouped into {len(grouped)} posts", 'debug')
        for post_id, post_data in grouped.items():
            # Use Pullpush metadata as fallback if gallery-dl sidecar is missing
            pp_meta = pp_metadata.get(post_id, {})
            title = post_data.get('title') or pp_meta.get('title', '')
            pub_date = post_data.get('date')
            if not pub_date and pp_meta.get('created_utc'):
                try:
                    pub_date = datetime.fromtimestamp(pp_meta['created_utc']).isoformat()
                except (ValueError, OSError):
                    # created_utc out of range / unrepresentable — leave date unset.
                    pass
            post_dict = {
                'post_id': post_id,
                'title': title,
                'content': title,
                'published_at': pub_date,
            }
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post_dict)
            if not post_db_id:
                continue
            if is_new_post:
                counters['new_posts'] += 1
            self._apply_auto_tag_rules(post_db_id, is_new_post)
            if pub_date:
                if not counters['latest_post_date'] or pub_date > counters['latest_post_date']:
                    counters['latest_post_date'] = pub_date
            # Files are filed per-post by date: reddit/<subreddit>/<YYYY-MM-DD>/
            post_date = (pub_date or '')[:10] or 'unknown-date'
            output_dir = base_path / 'reddit' / self._sanitize_filename(subreddit) / post_date
            for idx, file_path in enumerate(post_data['files']):
                try:
                    output_dir.mkdir(parents=True, exist_ok=True)
                    final_path = output_dir / file_path.name
                    already_existed = False
                    if final_path.exists() and final_path.stat().st_size > 0:
                        # Non-empty file already in place: discard the temp copy.
                        file_path.unlink(missing_ok=True)
                        already_existed = True
                    elif final_path.exists():
                        # Zero-byte collision: pick a unique "_N" suffixed name.
                        stem = final_path.stem
                        suffix = final_path.suffix
                        counter = 1
                        while final_path.exists():
                            final_path = output_dir / f"{stem}_{counter}{suffix}"
                            counter += 1
                    if not already_existed:
                        shutil.move(str(file_path), str(final_path))
                    fp = final_path
                    f_size = fp.stat().st_size if fp.exists() else 0
                    ext = fp.suffix.lower()
                    file_type = RedditClient._detect_file_type(ext)
                    w, h, dur = None, None, None
                    if file_type == 'video':
                        w, h, dur = self._extract_dimensions(fp, 'video')
                    elif file_type == 'image':
                        w, h, _ = self._extract_dimensions(fp, 'image')
                    thumb_data = None
                    if file_type == 'video':
                        thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
                    att_data = {
                        'name': fp.name,
                        'server_path': str(fp),
                        'file_type': file_type,
                        'extension': ext,
                        'file_size': f_size,
                        'attachment_index': idx,
                    }
                    att_id = self.db.upsert_attachment(post_db_id, att_data)
                    if att_id:
                        self.db.update_attachment_status(att_id, 'completed',
                            local_path=str(fp),
                            local_filename=fp.name,
                            name=fp.name,
                            extension=ext,
                            file_size=f_size,
                            file_type=file_type,
                            width=w, height=h, duration=dur,
                            thumbnail_data=thumb_data,
                            downloaded_at=datetime.now().isoformat()
                        )
                    if not already_existed:
                        # Only count bytes/notify for files we actually fetched now.
                        self.db.increment_creator_download_stats(creator_id, 1, f_size)
                        downloaded_file_info.append({
                            'file_path': str(fp),
                            'filename': fp.name,
                            'source': creator['username'],
                            'content_type': file_type
                        })
                    self.db.mark_post_downloaded(post_db_id)
                    counters['downloaded'] += 1
                    counters['new_attachments'] += 1
                except Exception as e:
                    self.log(f"Error processing backfill file {file_path.name}: {e}", 'error')
                    counters['failed'] += 1
    # run_gallery_dl_urls blocks on the subprocess; batch/progress callbacks
    # above are invoked from the worker thread.
    result = await asyncio.to_thread(
        client.run_gallery_dl_urls, urls_file, temp_dir,
        _on_backfill_progress, _on_backfill_batch, 50)
    backfill_dl = result.get('dl_count', 0)
    backfill_skip = result.get('skip_count', 0)
    self.log(f"Backfill complete for r/{subreddit}: {backfill_dl} downloaded, "
             f"{backfill_skip} skipped", 'info')
async def _download_reddit_posts(self, creator_id: int) -> Dict:
    """Report (but do not fetch) pending Reddit attachments.

    Reddit media is normally downloaded during sync via gallery-dl, so this
    retry hook only logs how many attachments are still pending — they will
    be picked up by the next sync. Always returns zero counts.
    """
    nothing_done = {'downloaded': 0, 'failed': 0}
    creator = self.db.get_creator(creator_id)
    if creator:
        pending = self.db.get_pending_attachments(creator_id=creator_id)
        if pending:
            # Pending Reddit attachments without local files require a re-sync.
            self.log(f"Reddit has {len(pending)} pending attachments for r/{creator['creator_id']} — will be fetched on next sync", 'info')
    return nothing_done
def _sweep_reddit_missing_attachments(self, creator_id: int, subreddit: str,
                                      base_path: Path, downloaded_file_info: list,
                                      creator: dict) -> int:
    """Scan this Reddit creator's download tree and register media files that
    have no attachment record in the database.

    Catches files that were moved into place but whose DB rows were never
    written (e.g. batch-boundary splits or the already-exists skip path).

    Returns:
        Number of attachment records created.
    """
    known_suffixes = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.mp4', '.webm', '.mov', '.avi', '.mkv'}
    subreddit_dir = base_path / 'reddit' / self._sanitize_filename(subreddit)
    if not subreddit_dir.exists():
        return 0
    # Collect every server_path already recorded for this creator's posts.
    tracked = set()
    try:
        with self.db.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT a.server_path FROM paid_content_attachments a
                JOIN paid_content_posts p ON a.post_id = p.id
                WHERE p.creator_id = ?
            """, (creator_id,))
            tracked.update(row[0] for row in cursor.fetchall() if row[0])
    except Exception as e:
        self.log(f"Error fetching existing paths for sweep: {e}", 'debug')
        return 0
    # Walk <subreddit>/<date>/ directories looking for untracked media files.
    untracked = [
        entry
        for date_dir in subreddit_dir.iterdir() if date_dir.is_dir()
        for entry in date_dir.iterdir()
        if entry.suffix.lower() in known_suffixes and str(entry) not in tracked
    ]
    if not untracked:
        return 0
    self.log(f"Sweep: found {len(untracked)} files on disk without attachment records for r/{subreddit}", 'info')
    # Filenames follow "{post_id} {num} {title}.{ext}": the token before the
    # first space (or the whole stem when there is none) is the post id.
    grouped = {}
    for entry in untracked:
        grouped.setdefault(entry.stem.split(' ', 1)[0], []).append(entry)
    created = 0
    for post_id, files in grouped.items():
        try:
            # Resolve the DB post row for this post_id; skip files whose
            # post was never recorded.
            with self.db.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id FROM paid_content_posts
                    WHERE creator_id = ? AND post_id = ?
                """, (creator_id, post_id))
                row = cursor.fetchone()
                if not row:
                    continue
                post_db_id = row[0]
                for idx, entry in enumerate(files):
                    try:
                        size = entry.stat().st_size if entry.exists() else 0
                        ext = entry.suffix.lower()
                        media_kind = RedditClient._detect_file_type(ext)
                        record = {
                            'name': entry.name,
                            'server_path': str(entry),
                            'file_type': media_kind,
                            'extension': ext,
                            'file_size': size,
                            'attachment_index': idx,
                        }
                        att_id = self.db.upsert_attachment(post_db_id, record)
                        if att_id:
                            self.db.update_attachment_status(att_id, 'completed',
                                local_path=str(entry),
                                local_filename=entry.name,
                                name=entry.name,
                                extension=ext,
                                file_size=size,
                                file_type=media_kind,
                                downloaded_at=datetime.now().isoformat()
                            )
                        self.db.mark_post_downloaded(post_db_id)
                        created += 1
                    except Exception as e:
                        self.log(f"Sweep error for {entry.name}: {e}", 'debug')
        except Exception as e:
            self.log(f"Sweep error for post {post_id}: {e}", 'debug')
    return created
async def _sync_twitch_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Twitch channel - fetch new clips and optionally download them.

    Flow: register the sync for polling/WebSocket progress, refresh the
    creator profile (display name, avatar, banner, bio, links), fetch clips
    via yt-dlp, upsert posts/attachments, then optionally download pending
    clips. Scheduled syncs narrow the window to the last 3 days and cap
    fetching at 50 clips.

    Args:
        creator: Creator row dict; 'creator_id' holds the Twitch channel name.
        download: When True and the creator has auto_download enabled,
            download pending clips after the metadata sync.
        scheduled: True for scheduler-triggered syncs (narrower date window).

    Returns:
        SyncResult with new post/attachment counts and download tallies, or
        success=False with an error message on failure.
    """
    creator_id = creator['id']
    self.log(f"Syncing Twitch channel: {creator['username']}", 'info')
    # Register active sync for polling-based updates
    sync_data = {
        'username': creator['username'],
        'platform': 'twitch',
        'service': 'twitch',
        'status': 'Fetching clips...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    # Emit WebSocket event
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data
    })
    try:
        twitch = self._get_twitch_client()
        if not twitch.is_available():
            # Twitch clip handling is yt-dlp based; bail out early without it.
            error = "yt-dlp not available"
            self._unregister_active_sync(creator_id)
            return SyncResult(success=False, error=error)
        # Build channel URL from creator_id (which stores the channel name)
        channel_url = f"https://www.twitch.tv/{creator['creator_id']}/clips"
        # Fetch and update creator profile (display name, avatar, banner, bio, etc.)
        # Profile refresh is best-effort: failures are logged, never fatal.
        try:
            profile_info = await twitch.get_channel_profile(creator['creator_id'])
            if profile_info:
                profile_updates = {}
                if profile_info.get('display_name'):
                    profile_updates['display_name'] = profile_info['display_name']
                if profile_info.get('avatar'):
                    # Cache images locally; fall back to the remote URL on failure.
                    cached = await self._cache_profile_image(profile_info['avatar'], 'twitch', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or profile_info['avatar']
                if profile_info.get('banner'):
                    cached = await self._cache_profile_image(profile_info['banner'], 'twitch', creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or profile_info['banner']
                if profile_info.get('bio'):
                    profile_updates['bio'] = profile_info['bio']
                if profile_info.get('joined_date'):
                    profile_updates['joined_date'] = profile_info['joined_date']
                if profile_info.get('external_links'):
                    profile_updates['external_links'] = profile_info['external_links']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated Twitch creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update Twitch creator profile: {e}", 'warning')
        # Fetch clips since last check with progress callback
        # Scheduled syncs only check last 3 days for efficiency
        from datetime import timedelta
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')
        def progress_callback(count: int):
            # Mirror fetch progress into both the polled status and WebSocket.
            self._update_active_sync(creator_id, {
                'status': f'Fetched {count} clips...',
                'posts_fetched': count
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {count} clips...',
                'phase': 'fetching',
                'posts_fetched': count
            })
        # Get clips as Post objects (with thumbnails cached)
        # For scheduled syncs, limit to 50 clips max (recent content only)
        max_clips = 50 if scheduled else None
        posts = await twitch.get_posts(
            channel_url,
            since_date=since_date,
            max_clips=max_clips,
            progress_callback=progress_callback
        )
        if not posts:
            self.log(f"No new clips for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            # Still download any pending clips even if no new posts
            downloaded = 0
            failed = 0
            downloaded_file_info = []
            if download and creator.get('auto_download', True):
                pending_count = self.db.get_pending_attachment_count(creator_id)
                if pending_count > 0:
                    self.log(f"Downloading {pending_count} pending clips for {creator['username']}", 'info')
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {pending_count} pending clips...',
                        'phase': 'downloading',
                        'total_files': pending_count
                    })
                    result = await self._download_twitch_clips(creator_id)
                    downloaded = result.get('downloaded', 0)
                    failed = result.get('failed', 0)
                    downloaded_file_info = result.get('downloaded_file_info', [])
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': downloaded,
                'failed': failed
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0,
                              downloaded_files=downloaded, failed_files=failed,
                              downloaded_file_info=downloaded_file_info)
        self.log(f"Found {len(posts)} new clips for {creator['username']}", 'info')
        # Update status
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} clips...',
            'phase': 'processing',
            'total_posts': len(posts)
        })
        new_posts = 0
        new_attachments = 0
        for post in posts:
            # Insert/update post in database
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                # Insert clip attachment
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                self._apply_auto_tag_rules(post_db_id, is_new_post)
        # Update creator stats - find the actual newest post date (posts may not be sorted by date)
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })
        # Download clips if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_twitch_clips(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])
        # Unregister from active syncs
        self._unregister_active_sync(creator_id)
        # Emit completed event
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed
        })
        # Send push notification for new downloads
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )
    except Exception as e:
        self.log(f"Error syncing Twitch channel {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
async def _download_twitch_clips(self, creator_id: int) -> Dict:
    """Download pending Twitch clips using yt-dlp.

    Iterates every attachment in 'pending' state for the creator, downloads
    the clip URL (stored in download_url) via yt-dlp and records size,
    dimensions and duration on success. Each attachment transitions
    pending -> downloading -> completed/failed in the DB.

    Args:
        creator_id: DB row id of the creator.

    Returns:
        Dict with 'downloaded' and 'failed' counts plus
        'downloaded_file_info' (list of dicts for push notifications);
        may include 'error' when yt-dlp is unavailable.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}
    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}
    twitch = self._get_twitch_client()
    if not twitch.is_available():
        return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp not available'}
    self.log(f"Downloading {len(pending)} Twitch clips for {creator['username']}", 'info')
    # Update status
    self._update_active_sync(creator_id, {
        'phase': 'downloading',
        'status': f'Downloading {len(pending)} clips...',
        'total_files': len(pending),
        'downloaded': 0
    })
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    quality = self.config.get('embed_quality', 'best')
    downloaded = 0
    failed = 0
    downloaded_file_info = []
    for i, att in enumerate(pending):
        # One clip failing must not abort the rest of the batch.
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                self.log(f"Post not found for attachment {att.get('id')}", 'warning')
                failed += 1
                continue
            # Build output directory: twitch/<username>/<YYYY-MM-DD>/
            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'twitch' / self._sanitize_filename(creator['username']) / post_date
            # Clip URL is stored in download_url
            clip_url = att.get('download_url')
            if not clip_url:
                self.log(f"No download URL for attachment {att.get('id')}, {att.get('name')}", 'warning')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='No clip URL'
                )
                failed += 1
                continue
            # Update status
            self._update_active_sync(creator_id, {
                'status': f'Downloading clip {i + 1}/{len(pending)}: {att.get("name", "")[:40]}...',
                'downloaded': downloaded
            })
            self.db.update_attachment_status(att['id'], 'downloading')
            # Download using yt-dlp
            result = await twitch.download_clip(clip_url, output_dir, quality=quality)
            if not result:
                self.log(f"No result from yt-dlp for {att.get('name')}", 'warning')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='yt-dlp returned no result',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue
            if result.get('success'):
                file_path = result.get('file_path')
                file_size = result.get('file_size', 0)
                # Get cached thumbnail from Twitch (already cached during clip fetch)
                thumbnail_data = None
                # The thumbnail URL should be in post metadata or we can use the original thumbnail
                # For now, we'll skip thumbnail generation for clips
                # Extract video dimensions
                width, height, duration = None, None, None
                if file_path:
                    width, height, duration = self._extract_dimensions(Path(file_path), 'video')
                    if width and height:
                        self.log(f"Extracted dimensions for {att.get('name', 'clip')}: {width}x{height}, {duration}s", 'debug')
                self.db.update_attachment_status(att['id'], 'completed',
                    local_path=file_path,
                    local_filename=Path(file_path).name if file_path else None,
                    file_size=file_size,
                    width=width,
                    height=height,
                    duration=duration,
                    downloaded_at=datetime.now().isoformat()
                )
                # Update post as downloaded if all attachments are done
                self.db.mark_post_downloaded(att['post_id'])
                # Update creator stats
                self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                downloaded += 1
                self.log(f"Downloaded: {att.get('name', 'clip')}", 'debug')
                # Collect file info for notifications
                if file_path:
                    downloaded_file_info.append({
                        'file_path': file_path,
                        'filename': Path(file_path).name if file_path else None,
                        'source': creator['username'],
                        'content_type': att.get('file_type', 'video')
                    })
            else:
                error = result.get('error', 'Unknown error')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=error,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                self.log(f"Failed to download {att.get('name', 'clip')}: {error}", 'warning')
        except Exception as e:
            self.log(f"Error downloading Twitch clip: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1
    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
async def download_all_pending(self) -> Dict:
    """Download all pending attachments across all enabled creators.

    Used by the scheduler to flush the download queue after sync completes.
    Creators with auto_download disabled or nothing pending are skipped.

    Returns:
        Dict with cumulative 'downloaded' and 'failed' counts.
    """
    grand_totals = {'downloaded': 0, 'failed': 0}
    for entry in self.db.get_creators(enabled_only=True):
        if not entry.get('auto_download', True):
            continue
        queued = self.db.get_pending_attachment_count(entry['id'])
        if queued == 0:
            continue
        self.log(f"Processing {queued} pending downloads for {entry['username']}", 'info')
        # Register a task so download_pending_for_creator doesn't think it's cancelled
        task_id = f"paid_content_sync_{entry['id']}"
        self.activity_manager.start_background_task(
            task_id, 'paid_content_sync',
            display_name=f"Downloading {entry['username']}"
        )
        try:
            outcome = await self.download_pending_for_creator(entry['id'])
            grand_totals['downloaded'] += outcome.get('downloaded', 0)
            grand_totals['failed'] += outcome.get('failed', 0)
        finally:
            self.activity_manager.stop_background_task(task_id)
    return grand_totals
async def download_pending_for_creator(self, creator_id: int) -> Dict:
    """Download all pending attachments for a creator.

    If auto_retry_failed is enabled, this will keep looping and retrying
    failed downloads (that were re-queued as pending) until all files are
    downloaded or the sync is cancelled. After the retry loop it also
    processes pending embeds (when enabled) and message attachments.

    Args:
        creator_id: DB row id of the creator.

    Returns:
        Dict with 'downloaded', 'failed', 'skipped' totals and
        'downloaded_file_info' (per-file metadata for notifications), or
        an 'error' key when the creator does not exist.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'skipped': 0, 'error': 'Creator not found'}
    total_downloaded = 0
    total_failed = 0
    total_skipped = 0
    downloaded_file_info = []  # Collect file info for notifications
    round_num = 0
    max_rounds = 100  # Safety limit to prevent infinite loops
    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    auto_retry = self.config.get('auto_retry_failed', 1)
    while round_num < max_rounds:
        round_num += 1
        # Check if sync was cancelled
        # activity_manager: only cancel if task is currently active and then gets stopped
        # (stale inactive entries from previous syncs should NOT cancel retries)
        task_id = f"paid_content_sync_{creator_id}"
        task_status = self.activity_manager.get_background_task(task_id)
        if task_status and task_status.get('active'):
            # Task is registered and active - this is a real sync, check won't
            # trip here. But if on next iteration it becomes inactive, that means cancelled.
            pass
        elif task_status and not task_status.get('active') and round_num > 1:
            # Was active in a previous round but now inactive = cancelled mid-download
            self.log(f"Sync cancelled for {creator['username']} (activity_manager), stopping downloads", 'info')
            break
        # app_state: only check if creator is registered in active syncs
        if self.app_state and hasattr(self.app_state, 'active_paid_content_syncs'):
            sync_entry = self.app_state.active_paid_content_syncs.get(creator_id)
            if sync_entry is not None and not sync_entry.get('active', True):
                self.log(f"Sync cancelled for {creator['username']} (app_state), stopping downloads", 'info')
                break
        pending = self.db.get_pending_attachments(creator_id=creator_id)
        if not pending:
            # Nothing left to do; on retry rounds report how many passes it took.
            if round_num > 1:
                self.log(f"All downloads complete for {creator['username']} after {round_num - 1} rounds", 'info')
            break
        total_files = len(pending)
        if round_num == 1:
            self.log(f"Downloading {total_files} files for {creator['username']}", 'info')
        else:
            self.log(f"Round {round_num}: Retrying {total_files} re-queued files for {creator['username']}", 'info')
        # Emit download started event
        self._emit_event('paid_content_download_started', {
            'creator_id': creator_id,
            'username': creator['username'],
            'total_files': total_files,
            'status': f'Downloading {total_files} files...' if round_num == 1 else f'Retrying {total_files} files (round {round_num})...',
            'phase': 'downloading'
        })
        # Update polling-based active sync status
        self._update_active_sync(creator_id, {
            'phase': 'downloading',
            'status': f'Downloading {total_files} files...' if round_num == 1 else f'Retrying {total_files} files (round {round_num})...',
            'total_files': total_files,
            'downloaded': total_downloaded
        })
        # Download this batch
        results = await self._download_attachments(pending, base_path, creator)
        downloaded = sum(1 for r in results if r.success)
        failed = sum(1 for r in results if not r.success and not r.is_duplicate)
        skipped = sum(1 for r in results if r.is_duplicate)
        # Collect file info for notifications (include metadata for lightbox)
        for r in results:
            if r.success and r.file_path:
                file_info = {
                    'file_path': r.file_path,
                    'filename': Path(r.file_path).name if r.file_path else None,
                    'source': creator['username'],
                    'content_type': 'post',
                    'file_size': r.file_size,
                    'platform': creator.get('platform')
                }
                # Try to get additional metadata from the attachment record
                if r.file_path:
                    att = self.db.get_attachment_by_path(r.file_path)
                    if att:
                        file_info['attachment_id'] = att.get('id')
                        file_info['width'] = att.get('width')
                        file_info['height'] = att.get('height')
                        file_info['file_type'] = att.get('file_type')
                        file_info['duration'] = att.get('duration')
                        file_info['post_content'] = att.get('post_content')
                        file_info['post_date'] = att.get('post_date')
                downloaded_file_info.append(file_info)
        total_downloaded += downloaded
        total_failed += failed
        total_skipped += skipped
        # Update creator stats after each round
        stats = self.db.get_creator_stats(creator_id)
        self.db.update_creator(creator_id, {
            'downloaded_count': stats['downloaded_attachments'],
            'total_size_bytes': stats['total_size_bytes']
        })
        # If nothing was downloaded and nothing was re-queued, stop
        # (all failures were permanent errors)
        if downloaded == 0 and failed > 0:
            # Check if any items were re-queued (status = pending)
            new_pending = self.db.get_pending_attachments(creator_id=creator_id)
            if len(new_pending) == 0:
                self.log(f"All remaining failures are permanent, stopping", 'info')
                break
        # If auto-retry is disabled, stop after first round
        if not auto_retry:
            break
        # Small delay before next round to avoid hammering the server
        if len(pending) > 0:
            await asyncio.sleep(5)
    # Download embeds if enabled
    if self.config.get('download_embeds', True) and creator.get('download_embeds', True):
        embed_results = await self._download_pending_embeds(creator_id, base_path, creator)
        total_downloaded += embed_results.get('downloaded', 0)
        total_failed += embed_results.get('failed', 0)
        # Add embed file info if available
        if embed_results.get('downloaded_file_info'):
            downloaded_file_info.extend(embed_results['downloaded_file_info'])
    # Download pending message attachments
    msg_attachments = self.db.get_pending_message_attachments(creator_id)
    if msg_attachments:
        self.log(f"Downloading {len(msg_attachments)} message attachments for {creator['username']}", 'info')
        msg_results = await self._download_message_attachments(msg_attachments, base_path, creator)
        msg_downloaded = sum(1 for r in msg_results if r.success)
        msg_failed = sum(1 for r in msg_results if not r.success and not r.is_duplicate)
        total_downloaded += msg_downloaded
        total_failed += msg_failed
    return {
        'downloaded': total_downloaded,
        'failed': total_failed,
        'skipped': total_skipped,
        'downloaded_file_info': downloaded_file_info
    }
async def _download_message_attachments(self, attachments: List[Dict], base_path: Path,
                                        creator: Dict) -> List[DownloadResult]:
    """Download DM/message attachments using the same queue + worker pattern
    as post attachments.

    Attachments failing validation (missing message id, unknown message,
    missing URL) get an immediate failure result; the rest are enqueued and
    processed concurrently by _download_worker tasks.
    """
    outcomes = []
    job_queue = asyncio.Queue()
    for att in attachments:
        msg_id = att.get('message_id')
        if not msg_id:
            outcomes.append(DownloadResult(success=False, error="No message_id"))
            continue
        # Look up the owning message row (needed to build the target path).
        with self.unified_db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute("SELECT * FROM paid_content_messages WHERE id = ?", (msg_id,))
            msg_row = cur.fetchone()
        if not msg_row:
            outcomes.append(DownloadResult(success=False, error="Message not found"))
            continue
        dest_path = self._build_message_file_path(base_path, creator, dict(msg_row), att)
        url = att.get('download_url')
        if not url:
            outcomes.append(DownloadResult(success=False, error="No download URL"))
            continue
        await job_queue.put((att, url, dest_path))
    if job_queue.empty():
        return outcomes
    total = job_queue.qsize()
    # Reset the shared per-batch state consumed by the workers.
    self._download_progress = {'completed': 0, 'success': 0, 'failed': 0}
    self._active_downloads = {}
    self._download_results = []
    self._results_lock = asyncio.Lock()
    self._active_workers = 0
    worker_count = min(self.max_concurrent_downloads, total)
    workers = [
        asyncio.create_task(
            self._download_worker(job_queue, creator['id'], total, worker_id=n)
        )
        for n in range(worker_count)
    ]
    await job_queue.join()
    for task in workers:
        task.cancel()
    await asyncio.gather(*workers, return_exceptions=True)
    outcomes.extend(self._download_results)
    return outcomes
async def _download_attachments(self, attachments: List[Dict], base_path: Path,
                                creator: Dict) -> List[DownloadResult]:
    """Download attachments using a queue with worker pattern for true concurrent downloads.

    Shared instance state (_download_progress, _active_downloads,
    _download_results, _results_lock, _active_workers) is reset here and
    mutated by the workers; results are returned once the queue drains.
    """
    total = len(attachments)
    # Fresh shared state for this batch.
    self._download_progress = {'completed': 0, 'success': 0, 'failed': 0}
    self._active_downloads = {}
    self._download_results = []
    self._results_lock = asyncio.Lock()
    self._active_workers = 0
    # Never spawn more workers than there are files.
    worker_count = min(self.max_concurrent_downloads, total)
    self.log(f"Starting download of {total} files with {worker_count} concurrent workers", 'info')
    job_queue = asyncio.Queue()
    enqueued = 0
    for att in attachments:
        post = self.db.get_post(att['post_id'])
        if not post:
            # Record the failure immediately; nothing to enqueue for this one.
            async with self._results_lock:
                self._download_results.append(DownloadResult(success=False, error="Post not found"))
                self._download_progress['completed'] += 1
                self._download_progress['failed'] += 1
            continue
        dest_path = self._build_file_path(base_path, creator, post, att)
        # Prefer a direct URL (Fansly Direct); otherwise derive one from server_path.
        url = att.get('download_url')
        if not url:
            url = self._get_client(creator['service_id']).get_attachment_url(att['server_path'])
        await job_queue.put((att, url, dest_path))
        enqueued += 1
    self.log(f"Queued {enqueued} downloads, starting {worker_count} workers", 'info')
    workers = [
        asyncio.create_task(
            self._download_worker(job_queue, creator['id'], total, worker_id=n)
        )
        for n in range(worker_count)
    ]
    # Block until every enqueued job has been task_done()'d by a worker.
    await job_queue.join()
    self.log(f"All downloads complete, cancelling workers", 'debug')
    for task in workers:
        task.cancel()
    await asyncio.gather(*workers, return_exceptions=True)
    return self._download_results
async def _download_worker(self, queue: asyncio.Queue, creator_id: int, total_files: int, worker_id: int = 0):
"""Worker that pulls from queue and downloads files"""
self._active_workers += 1
self.log(f"Worker {worker_id} started (active workers: {self._active_workers})", 'debug')
while True:
try:
# Get next item from queue - non-blocking check first to log state
try:
att, url, dest_path = queue.get_nowait()
except asyncio.QueueEmpty:
# Queue empty, wait for more items or cancellation
att, url, dest_path = await queue.get()
att_name = att.get('name', 'unknown')
self.log(f"Worker {worker_id} picked up: {att_name} (queue size: {queue.qsize()}, active: {len(self._active_downloads)})", 'info')
try:
result = await self._download_single_attachment_no_semaphore(
att, url, dest_path, creator_id, total_files
)
async with self._results_lock:
self._download_results.append(result)
except Exception as e:
self.log(f"Worker {worker_id} exception: {e}", 'error')
async with self._results_lock:
self._download_results.append(DownloadResult(success=False, error=str(e)))
finally:
# Mark task as done so queue.join() knows
queue.task_done()
self.log(f"Worker {worker_id} finished task, marking done", 'debug')
except asyncio.CancelledError:
# Worker was cancelled, exit gracefully
self._active_workers -= 1
self.log(f"Worker {worker_id} cancelled (remaining: {self._active_workers})", 'debug')
break
async def _download_via_ytdlp(self, att: Dict, url: str, dest_path: Path,
creator_id: int = None, total_files: int = None) -> DownloadResult:
"""Download a YouTube video using yt-dlp instead of direct HTTP download.
Downloads to a local temp dir first to avoid mergerfs rename issues with .part files,
then moves the final merged file to the target location."""
att_id = att['id']
att_name = att.get('name', 'unknown')
youtube = self._get_youtube_client()
if not youtube.is_available():
self.log(f"yt-dlp not available for {att_name}", 'error')
return DownloadResult(success=False, error='yt-dlp not available')
quality = self.config.get('embed_quality', 'best')
final_dir = dest_path.parent
# Download to local temp dir to avoid mergerfs .part rename issues
import tempfile
with tempfile.TemporaryDirectory(prefix='ytdlp_') as tmp_dir:
tmp_path = Path(tmp_dir)
self.log(f"Downloading via yt-dlp: {att_name}", 'info')
result = await youtube.download_video(url, tmp_path, quality=quality)
if not result or not result.get('success'):
error = (result or {}).get('error', 'yt-dlp returned no result')
self.log(f"yt-dlp failed for {att_name}: {error}", 'warning')
self.db.update_attachment_status(att_id, 'pending',
error_message=error,
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
return DownloadResult(success=False, error=error)
tmp_file = Path(result.get('file_path'))
if not tmp_file.exists():
error = f"yt-dlp reported success but file not found: {tmp_file}"
self.log(error, 'error')
return DownloadResult(success=False, error=error)
# Move to final destination
final_dir.mkdir(parents=True, exist_ok=True)
final_path = final_dir / tmp_file.name
import shutil
shutil.move(str(tmp_file), str(final_path))
file_path = str(final_path)
file_size = final_path.stat().st_size
# Download YouTube thumbnail
thumbnail_data = None
post = self.db.get_post(att['post_id'])
video_id = post.get('post_id') if post else None
if video_id:
thumbnail_data = await self._download_youtube_thumbnail(video_id)
# Extract video dimensions
width, height, duration = None, None, None
if file_path:
width, height, duration = self._extract_dimensions(Path(file_path), 'video')
self.db.update_attachment_status(att_id, 'completed',
local_path=file_path,
local_filename=Path(file_path).name if file_path else None,
file_size=file_size,
width=width,
height=height,
duration=duration,
thumbnail_data=thumbnail_data,
downloaded_at=datetime.now().isoformat()
)
self.db.mark_post_downloaded(att['post_id'])
if creator_id:
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
if creator_id and total_files:
async with self._results_lock:
self._download_progress['completed'] += 1
self._download_progress['success'] += 1
self._update_download_status(creator_id, total_files)
self.log(f"Downloaded via yt-dlp: {att_name} ({file_size} bytes)", 'info')
return DownloadResult(success=True, file_path=file_path, file_size=file_size)
async def _download_via_tiktok(self, att: Dict, url: str, dest_path: Path,
creator_id: int = None, total_files: int = None) -> DownloadResult:
"""Download a TikTok video using the TikTok client (gallery-dl with cookies)."""
att_id = att['id']
att_name = att.get('name', 'unknown')
tiktok = self._get_tiktok_client()
if not tiktok.is_available():
self.log(f"TikTok client not available for {att_name}", 'error')
return DownloadResult(success=False, error='yt-dlp/gallery-dl not available')
# Get creator username for the download
post = self.db.get_post(att['post_id'])
creator = self.db.get_creator(creator_id) if creator_id else None
username = creator['username'] if creator else ''
output_dir = dest_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
self.log(f"Downloading via TikTok client: {att_name}", 'info')
result = await tiktok.download_video(url, output_dir, username=username)
if not result or not result.get('success'):
error = (result or {}).get('error', 'TikTok download failed')
self.log(f"TikTok download failed for {att_name}: {error}", 'warning')
self.db.update_attachment_status(att_id, 'failed',
error_message=error,
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
async with self._results_lock:
self._download_progress['completed'] += 1
self._download_progress['failed'] += 1
if creator_id and total_files:
self._update_download_status(creator_id, total_files)
return DownloadResult(success=False, error=error)
all_files = result.get('all_files', [])
file_path = result.get('file_path')
total_size = 0
image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
# Process all downloaded files (carousel photos or single video)
for file_idx, file_str in enumerate(all_files):
fp = Path(file_str)
if not fp.exists():
continue
f_size = fp.stat().st_size
total_size += f_size
ext = fp.suffix.lower()
c_type = 'image' if ext in image_exts else 'video'
w, h, dur = None, None, None
if c_type == 'video':
w, h, dur = self._extract_dimensions(fp, 'video')
else:
w, h, _ = self._extract_dimensions(fp, 'image')
thumb_data = None
if c_type == 'video':
thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
if file_idx == 0:
# Update the existing attachment with the first file
self.db.update_attachment_status(att_id, 'completed',
local_path=str(fp),
local_filename=fp.name,
name=fp.name,
extension=ext,
file_size=f_size,
file_type=c_type,
width=w, height=h, duration=dur,
thumbnail_data=thumb_data,
downloaded_at=datetime.now().isoformat()
)
else:
# Create additional attachments for carousel photos
# Use unique server_path per file to avoid upsert collisions
dl_url = att.get('download_url', '')
self.db.upsert_attachment(att['post_id'], {
'name': fp.name,
'file_type': c_type,
'extension': ext,
'server_path': f"{dl_url}#slide_{file_idx}",
'download_url': dl_url,
'status': 'completed',
'local_path': str(fp),
'local_filename': fp.name,
'file_size': f_size,
'width': w, 'height': h, 'duration': dur,
'thumbnail_data': thumb_data,
'downloaded_at': datetime.now().isoformat(),
})
self.db.mark_post_downloaded(att['post_id'])
if creator_id:
self.db.increment_creator_download_stats(creator_id, 1, total_size)
if creator_id and total_files:
async with self._results_lock:
self._download_progress['completed'] += 1
self._download_progress['success'] += 1
self._update_download_status(creator_id, total_files)
file_count = len([f for f in all_files if Path(f).exists()])
self.log(f"Downloaded via TikTok client: {att_name} ({file_count} file{'s' if file_count > 1 else ''}, {total_size} bytes)", 'info')
return DownloadResult(success=True, file_path=file_path, file_size=total_size)
    async def _download_single_attachment_no_semaphore(self, att: Dict, url: str, dest_path: Path,
                                                       creator_id: int = None, total_files: int = None) -> DownloadResult:
        """Download a single attachment with automatic retry/resume on stall or failure.

        Called from the queue workers — concurrency is bounded by the worker
        count, hence "no semaphore".  YouTube/TikTok/streaming URLs are routed
        to their specialised downloaders; everything else is fetched over HTTP
        with Range-based resume, per-chunk stall detection (sock_read timeout),
        size and SHA256 verification, thumbnail/dimension extraction, and DB
        bookkeeping.

        Args:
            att: Attachment row dict ('id', 'post_id', 'server_path', ...).
            dest_path: Final on-disk destination for the file.
            creator_id: Creator whose stats/progress are updated (optional).
            total_files: Batch size, used only for the progress display.

        Returns:
            DownloadResult describing success or failure of this attachment.
        """
        att_id = att['id']
        att_name = att.get('name', 'unknown')
        max_retries = 5  # Max retry attempts for stalls/failures
        chunk_timeout = 60  # Timeout for receiving each chunk (detect stalls)
        # Helper to clean up active download tracking
        def cleanup_active_download():
            if att_id in self._active_downloads:
                del self._active_downloads[att_id]
        # Update status to downloading
        self.db.update_attachment_status(att_id, 'downloading')
        # Create directory
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        # Route YouTube URLs through yt-dlp (direct aiohttp download gets HTML, not video)
        if 'youtube.com/watch' in url or 'youtu.be/' in url:
            return await self._download_via_ytdlp(att, url, dest_path, creator_id, total_files)
        # Route TikTok URLs through TikTok client (gallery-dl with cookies)
        if 'tiktok.com/' in url:
            return await self._download_via_tiktok(att, url, dest_path, creator_id, total_files)
        # Handle streaming formats (m3u8/HLS and mpd/DASH) with ffmpeg
        if '.m3u8' in url or '.mpd' in url:
            return await self._download_stream_with_ffmpeg(att, url, dest_path, creator_id, total_files)
        file_size = 0
        expected_size = None
        last_error = None
        # Retry loop: `break` means the transfer finished (verify below);
        # `continue` means retry; the for/else fires only when all attempts
        # were exhausted without a break.
        for attempt in range(max_retries):
            try:
                # Check for existing partial file to enable resume
                existing_size = 0
                if dest_path.exists():
                    existing_size = dest_path.stat().st_size
                    if existing_size > 0:
                        if attempt == 0:
                            self.log(f"Resuming download: {att_name} from {self._format_bytes(existing_size)}", 'info')
                        else:
                            self.log(f"Retry {attempt + 1}/{max_retries}: Resuming {att_name} from {self._format_bytes(existing_size)}", 'info')
                    else:
                        self.log(f"Downloading: {att_name}", 'debug')
                else:
                    if attempt > 0:
                        self.log(f"Retry {attempt + 1}/{max_retries}: Starting fresh download of {att_name}", 'info')
                    else:
                        self.log(f"Downloading: {att_name}", 'debug')
                # Download file with proper headers
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Referer': url.split('/data')[0] if '/data' in url else url
                }
                # Add Range header for resume
                if existing_size > 0:
                    headers['Range'] = f'bytes={existing_size}-'
                # Use shorter timeouts to detect stalls faster
                # (sock_read raises TimeoutError if no bytes arrive for chunk_timeout seconds)
                timeout = aiohttp.ClientTimeout(total=None, connect=30, sock_read=chunk_timeout)
                async with aiohttp.ClientSession(headers=headers) as session:
                    async with session.get(url, timeout=timeout) as resp:
                        # Handle response codes
                        if resp.status == 416:
                            # Range not satisfiable - file is likely complete
                            self.log(f"Resume not needed for {att_name} - file may be complete", 'info')
                            file_size = existing_size
                            break  # Exit retry loop, proceed to verification
                        elif resp.status == 200 and existing_size > 0:
                            # Server doesn't support resume, need to start over
                            self.log(f"Server doesn't support resume for {att_name}, restarting download", 'warning')
                            existing_size = 0
                            # Delete partial file and start fresh
                            dest_path.unlink(missing_ok=True)
                        elif resp.status not in (200, 206):
                            last_error = f"HTTP {resp.status}"
                            self.log(f"HTTP error {resp.status} for {att_name}, will retry", 'warning')
                            await asyncio.sleep(2 ** attempt)  # Exponential backoff
                            continue  # Retry
                        # Calculate expected size
                        # (206 partial content: Content-Length covers only the remainder)
                        content_length = resp.headers.get('Content-Length')
                        if resp.status == 206:
                            expected_size = existing_size + (int(content_length) if content_length else 0)
                        else:
                            expected_size = int(content_length) if content_length else None
                        file_size = existing_size
                        last_progress_update = existing_size
                        # Register this download as active
                        if creator_id and total_files:
                            self._active_downloads[att_id] = {
                                'name': att_name,
                                'size': expected_size,
                                'progress': existing_size
                            }
                            self._update_download_status(creator_id, total_files)
                        # Open in append mode for resume, write mode for fresh start
                        file_mode = 'ab' if existing_size > 0 else 'wb'
                        async with aiofiles.open(dest_path, file_mode) as f:
                            async for chunk in resp.content.iter_chunked(8192):
                                await f.write(chunk)
                                file_size += len(chunk)
                                # Throttle progress updates to every 512 KiB
                                if creator_id and total_files and (file_size - last_progress_update) >= 512 * 1024:
                                    last_progress_update = file_size
                                    if att_id in self._active_downloads:
                                        self._active_downloads[att_id]['progress'] = file_size
                                    self._update_download_status(creator_id, total_files)
                # Download completed successfully
                cleanup_active_download()
                # Verify file size if we know the expected size
                if expected_size and file_size < expected_size:
                    self.log(f"Incomplete download for {att_name}: got {file_size}, expected {expected_size}", 'warning')
                    last_error = f"Incomplete: {file_size}/{expected_size} bytes"
                    continue  # Retry
                break  # Success - exit retry loop
            except asyncio.TimeoutError:
                cleanup_active_download()
                # Check how much we got
                current_size = dest_path.stat().st_size if dest_path.exists() else 0
                last_error = f"Stalled at {self._format_bytes(current_size)}"
                self.log(f"Download stalled for {att_name} at {self._format_bytes(current_size)}, will retry", 'warning')
                await asyncio.sleep(2)  # Brief pause before retry
                continue
            except (aiohttp.ClientError, ConnectionError, OSError) as e:
                cleanup_active_download()
                current_size = dest_path.stat().st_size if dest_path.exists() else 0
                last_error = str(e)
                self.log(f"Connection error for {att_name} at {self._format_bytes(current_size)}: {e}, will retry", 'warning')
                await asyncio.sleep(2 ** attempt)  # Exponential backoff
                continue
        else:
            # for/else: no `break` happened, i.e. every attempt failed.
            # Exhausted all retries for this sync
            cleanup_active_download()
            error = f"Failed after {max_retries} attempts: {last_error}"
            self.log(f"Download failed for {att_name}: {error}", 'error')
            # Check if this is a permanent error or a retriable one
            if self._is_permanent_error(last_error):
                # Permanent error (500, 404, etc.) - mark as failed
                self.db.update_attachment_status(att_id, 'failed',
                    error_message=error,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                self.log(f"Permanent error for {att_name}, marked as failed", 'warning')
            else:
                # Retriable error (timeout, partial, connection) - put back in queue
                # Don't increment download_attempts so it keeps trying
                self.db.update_attachment_status(att_id, 'pending',
                    error_message=f"Will retry: {error}",
                    last_attempt=datetime.now().isoformat()
                )
                self.log(f"Retriable error for {att_name}, re-queued for next sync", 'info')
            self.db.record_download_attempt(
                attachment_id=att_id, url=url, status='failed',
                error_message=error
            )
            if creator_id and total_files:
                async with self._results_lock:
                    self._download_progress['completed'] += 1
                    self._download_progress['failed'] += 1
                    self._update_download_status(creator_id, total_files)
            return DownloadResult(success=False, error=error)
        # Download succeeded - proceed with verification and processing
        try:
            # Calculate SHA256 hash
            file_hash = await self._compute_file_hash_async(dest_path)
            if not file_hash:
                error = f"Failed to compute hash - file may be missing: {dest_path.name}"
                self.log(f"Hash computation failed for {att_name}: {error}", 'error')
                self.db.update_attachment_status(att_id, 'pending',
                    error_message=f"Will retry: {error}",
                    last_attempt=datetime.now().isoformat()
                )
                if creator_id and total_files:
                    async with self._results_lock:
                        self._download_progress['completed'] += 1
                        self._download_progress['failed'] += 1
                        self._update_download_status(creator_id, total_files)
                return DownloadResult(success=False, error=error)
            # Verify hash matches expected hash from Coomer/Kemono server path
            # Server path format: /xx/yy/HASH.ext (e.g., /90/b0/90b023c9...714.mp4)
            server_path = att.get('server_path', '')
            if server_path and '/' in server_path:
                expected_hash = Path(server_path).stem  # Get filename without extension
                # Only 64-char stems are SHA256 names; other paths can't be verified this way.
                if len(expected_hash) == 64 and expected_hash != file_hash:
                    # Hash mismatch - file is corrupt, delete and re-queue
                    dest_path.unlink(missing_ok=True)
                    error = f"Hash mismatch: expected {expected_hash[:16]}..., got {file_hash[:16]}... - file corrupt"
                    self.log(f"Hash verification failed for {att_name}: {error}", 'warning')
                    self.db.update_attachment_status(att_id, 'pending',
                        error_message=f"Will retry: {error}",
                        last_attempt=datetime.now().isoformat()
                    )
                    if creator_id and total_files:
                        async with self._results_lock:
                            self._download_progress['completed'] += 1
                            self._download_progress['failed'] += 1
                            self._update_download_status(creator_id, total_files)
                    return DownloadResult(success=False, error=error)
            # Compute perceptual hash for images
            perceptual_hash = self._compute_perceptual_hash(dest_path)
            # Generate thumbnail and extract dimensions
            file_type = att.get('file_type', '')
            thumbnail_data = None
            width, height, duration = None, None, None
            if file_type in ('image', 'video'):
                thumbnail_data = self._generate_thumbnail(dest_path, file_type)
                if thumbnail_data:
                    self.log(f"Generated thumbnail for {att_name} ({len(thumbnail_data)} bytes)", 'debug')
                # Also generate large thumbnail for feed view (file cached)
                large_thumb_data = self._generate_thumbnail(dest_path, file_type, max_size=(800, 800))
                if large_thumb_data:
                    large_cache_dir = Path('/opt/media-downloader/cache/thumbnails/large')
                    large_cache_dir.mkdir(parents=True, exist_ok=True)
                    large_cache_file = large_cache_dir / f"{att_id}.jpg"
                    large_cache_file.write_bytes(large_thumb_data)
                # Extract dimensions
                width, height, duration = self._extract_dimensions(dest_path, file_type)
                if width and height:
                    self.log(f"Extracted dimensions for {att_name}: {width}x{height}" + (f", {duration}s" if duration else ""), 'debug')
            # Skip placeholder/missing-photo images (tiny thumbnails under 15KB and 200px)
            if file_type == 'image' and file_size and file_size < 15000 and width and height and max(width, height) <= 200:
                self.log(f"Skipping placeholder image {att_name} ({width}x{height}, {file_size} bytes)", 'info')
                dest_path.unlink(missing_ok=True)
                self.db.update_attachment_status(att_id, 'skipped',
                    error_message=f'Placeholder image ({width}x{height}, {file_size} bytes)')
                # success=True: the placeholder was handled deliberately, not an error.
                return DownloadResult(success=True, file_path=None, file_hash=None, file_size=0)
            # Update database
            self.db.update_attachment_status(att_id, 'completed',
                local_path=str(dest_path),
                local_filename=dest_path.name,
                file_hash=file_hash,
                perceptual_hash=perceptual_hash,
                file_size=file_size,
                width=width,
                height=height,
                duration=duration,
                thumbnail_data=thumbnail_data,
                downloaded_at=datetime.now().isoformat()
            )
            self.db.record_download_attempt(
                attachment_id=att_id, url=url, status='success'
            )
            # Check if all attachments for post are complete
            post = self.db.get_post(att['post_id'])
            if post:
                all_complete = all(a['status'] == 'completed' for a in post['attachments'])
                if all_complete:
                    self.db.mark_post_downloaded(att['post_id'])
            self.log(f"Downloaded: {att_name} ({file_size} bytes)", 'debug')
            # Update creator stats
            if creator_id:
                self.db.increment_creator_download_stats(creator_id, 1, file_size)
            # Update progress
            if creator_id and total_files:
                async with self._results_lock:
                    self._download_progress['completed'] += 1
                    self._download_progress['success'] += 1
                    self._update_download_status(creator_id, total_files)
            return DownloadResult(
                success=True,
                file_path=str(dest_path),
                file_hash=file_hash,
                file_size=file_size
            )
        except Exception as e:
            # Error during post-download processing (hashing, duplicate check, etc.)
            import traceback
            error = f"Post-processing error: {str(e)}"
            self.log(f"Error processing {att_name}: {error}\n{traceback.format_exc()}", 'error')
            self.db.update_attachment_status(att_id, 'failed',
                error_message=error,
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            if creator_id and total_files:
                async with self._results_lock:
                    self._download_progress['completed'] += 1
                    self._download_progress['failed'] += 1
                    self._update_download_status(creator_id, total_files)
            return DownloadResult(success=False, error=error)
async def _download_stream_with_ffmpeg(self, att: Dict, url: str, dest_path: Path,
creator_id: int = None, total_files: int = None) -> DownloadResult:
"""Download streaming formats (m3u8/HLS, mpd/DASH) using ffmpeg"""
import subprocess
import re
import tempfile
import aiohttp
att_id = att['id']
att_name = att.get('name', 'unknown')
stream_type = 'HLS' if '.m3u8' in url else 'DASH' if '.mpd' in url else 'stream'
self.log(f"Downloading {stream_type} stream: {att_name}", 'info')
# Track as active download
if creator_id and total_files:
self._active_downloads[att_id] = {
'name': att_name,
'size': att.get('file_size'),
'progress': 0
}
self._update_download_status(creator_id, total_files)
temp_m3u8_path = None
segment_urls = []
try:
# For HLS with CloudFront signed URLs, we need to modify the playlist
# to include signed params on each segment URL (ffmpeg doesn't carry them)
input_source = url
use_protocol_whitelist = False
if '.m3u8' in url and ('Key-Pair-Id=' in url or 'Policy=' in url):
self.log("Processing CloudFront signed HLS playlist", 'debug')
# Extract signed params from URL
params_match = re.search(r'\?(.+)$', url)
if params_match:
signed_params = '?' + params_match.group(1)
# Get base URL for constructing absolute segment URLs
# Fansly: https://cdn3.fansly.com/new/{account}/{media}/{media}.m3u8?...
base_url_match = re.match(r'(https://[^/]+/new/\d+/\d+)', url)
if base_url_match:
base_url = base_url_match.group(1)
# Determine variant path (e.g., media-1) from URL or default
# The URL might be master.m3u8 or already a variant like media-1/stream.m3u8
if '/media-' in url:
# Already a variant URL
variant_match = re.search(r'(/media-\d+/)', url)
variant_path = variant_match.group(1) if variant_match else '/media-1/'
else:
# Master playlist - fetch it to find the highest quality variant
variant_path = None
# Fetch the playlist content
async with aiohttp.ClientSession() as session:
# If this is a master playlist, fetch it and pick best variant
if 'stream.m3u8' not in url and variant_path is None:
master_url = url
self.log(f"Fetching master playlist to find best variant", 'debug')
async with session.get(master_url) as master_resp:
if master_resp.status == 200:
master_content = await master_resp.text()
# Parse master playlist for variant streams
# Look for lines like: media-1/stream.m3u8, media-5/stream.m3u8
# and #EXT-X-STREAM-INF with RESOLUTION=WxH
best_variant = None
best_resolution = 0
current_bandwidth = 0
for line in master_content.splitlines():
line = line.strip()
if line.startswith('#EXT-X-STREAM-INF:'):
# Extract resolution if available
res_match = re.search(r'RESOLUTION=(\d+)x(\d+)', line)
bw_match = re.search(r'BANDWIDTH=(\d+)', line)
if res_match:
w, h = int(res_match.group(1)), int(res_match.group(2))
current_bandwidth = w * h
elif bw_match:
current_bandwidth = int(bw_match.group(1))
else:
current_bandwidth = 0
elif line and not line.startswith('#'):
# This is a variant URI line
vm = re.search(r'(media-\d+)/stream\.m3u8', line)
if vm:
if current_bandwidth > best_resolution:
best_resolution = current_bandwidth
best_variant = vm.group(1)
current_bandwidth = 0
if best_variant:
variant_path = f'/{best_variant}/'
self.log(f"Selected best variant: {best_variant} (resolution score: {best_resolution})", 'info')
else:
variant_path = '/media-1/'
self.log(f"No variants found in master playlist, defaulting to media-1", 'warning')
else:
variant_path = '/media-1/'
self.log(f"Failed to fetch master playlist: HTTP {master_resp.status}, defaulting to media-1", 'warning')
variant_url = f"{base_url}{variant_path}stream.m3u8{signed_params}"
self.log(f"Fetching variant playlist: {variant_path}stream.m3u8", 'debug')
elif 'stream.m3u8' not in url:
variant_url = f"{base_url}{variant_path}stream.m3u8{signed_params}"
self.log(f"Fetching variant playlist: {variant_path}stream.m3u8", 'debug')
else:
variant_url = url
async with session.get(variant_url) as resp:
if resp.status == 200:
playlist_content = await resp.text()
# Normalize CRLF to LF (Windows -> Unix line endings)
playlist_content = playlist_content.replace('\r\n', '\n').replace('\r', '\n')
# Replace relative segment URLs with absolute signed URLs
# Matches: segment-0.ts, segment-123.ts, etc.
# Capture base_url and variant_path in closure
local_base = base_url
local_variant = variant_path
local_params = signed_params
def replace_segment(match):
segment = match.group(1)
return f"{local_base}{local_variant}{segment}{local_params}"
modified_content = re.sub(
r'^(segment-\d+\.ts)$',
replace_segment,
playlist_content,
flags=re.MULTILINE
)
# Extract signed segment URLs and total duration for direct download
playlist_duration = 0.0
for seg_line in modified_content.splitlines():
seg_line = seg_line.strip()
if seg_line.startswith('#EXTINF:'):
try:
playlist_duration += float(seg_line.split(':')[1].split(',')[0])
except (ValueError, IndexError):
pass
elif seg_line and not seg_line.startswith('#') and seg_line.startswith('https://'):
segment_urls.append(seg_line)
self.log(f"Extracted {len(segment_urls)} segment URLs, playlist duration: {playlist_duration:.2f}s", 'info')
else:
self.log(f"Failed to fetch variant playlist: HTTP {resp.status}", 'warning')
if segment_urls:
# Direct segment download approach - avoids ffmpeg HLS networking
# which stalls on large 4K streams with CloudFront signed URLs
import shutil
import os as _os
temp_dir = tempfile.mkdtemp(prefix='hls_segments_')
try:
total_segments = len(segment_urls)
self.log(f"Downloading {total_segments} HLS segments directly", 'info')
segment_paths = {}
completed_count = 0
total_bytes = 0
progress_lock = asyncio.Lock()
seg_timeout = aiohttp.ClientTimeout(total=120, connect=15, sock_read=60)
connector = aiohttp.TCPConnector(limit=5)
async with aiohttp.ClientSession(timeout=seg_timeout, connector=connector) as dl_session:
semaphore = asyncio.Semaphore(5)
async def _dl_segment(idx, seg_url):
nonlocal completed_count, total_bytes
seg_path = _os.path.join(temp_dir, f'segment-{idx:05d}.ts')
async with semaphore:
for attempt in range(3):
try:
async with dl_session.get(seg_url) as seg_resp:
if seg_resp.status != 200:
raise Exception(f"HTTP {seg_resp.status}")
with open(seg_path, 'wb') as sf:
async for chunk in seg_resp.content.iter_chunked(65536):
sf.write(chunk)
break
except Exception as e:
if attempt == 2:
raise Exception(f"Segment {idx}/{total_segments} failed after 3 attempts: {e}")
self.log(f"Segment {idx} attempt {attempt+1} failed: {e}, retrying", 'warning')
try:
_os.unlink(seg_path)
except OSError:
pass
await asyncio.sleep(2 ** attempt)
seg_size = _os.path.getsize(seg_path)
segment_paths[idx] = seg_path
async with progress_lock:
completed_count += 1
total_bytes += seg_size
# Update progress every ~5%
update_interval = max(1, total_segments // 20)
if completed_count % update_interval == 0 or completed_count == total_segments:
pct = completed_count / total_segments
est_total = int(total_bytes / pct) if pct > 0.01 else 0
if att_id in self._active_downloads:
self._active_downloads[att_id]['progress'] = total_bytes
self._active_downloads[att_id]['size'] = est_total
if creator_id and total_files:
self._update_download_status(creator_id, total_files)
self.log(f"Segments: {completed_count}/{total_segments} ({self._format_bytes(total_bytes)})", 'debug')
# Download all segments with limited concurrency
tasks = [_dl_segment(i, u) for i, u in enumerate(segment_urls)]
await asyncio.gather(*tasks)
self.log(f"All {total_segments} segments downloaded ({self._format_bytes(total_bytes)})", 'info')
# Create concat list for ffmpeg
concat_path = _os.path.join(temp_dir, 'concat.txt')
with open(concat_path, 'w') as cf:
for i in range(total_segments):
cf.write(f"file '{segment_paths[i]}'\n")
# Remux segments to MP4 with ffmpeg (local files only, no network)
self.log(f"Remuxing {total_segments} segments to MP4...", 'info')
remux_cmd = [
'ffmpeg', '-y',
'-f', 'concat',
'-safe', '0',
'-i', concat_path,
'-c', 'copy',
'-bsf:a', 'aac_adtstoasc',
'-avoid_negative_ts', 'make_zero',
'-fflags', '+genpts',
]
# Trim to exact playlist duration to remove HLS segment padding
if playlist_duration > 0:
remux_cmd.extend(['-t', f'{playlist_duration:.6f}'])
remux_cmd.extend([
'-progress', 'pipe:1', '-nostats',
str(dest_path)
])
process = await asyncio.create_subprocess_exec(
*remux_cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
# Track remux progress
remux_last_update = 0
duration_us = int(playlist_duration * 1_000_000) if playlist_duration > 0 else 0
while True:
line = await process.stdout.readline()
if not line:
break
line_str = line.decode('utf-8', errors='replace').strip()
if line_str.startswith('out_time_us=') and duration_us > 0:
try:
current_us = int(line_str.split('=')[1])
if current_us > 0:
pct = min(current_us / duration_us, 1.0)
if pct - remux_last_update >= 0.10:
remux_last_update = pct
self.log(f"Remuxing: {int(pct * 100)}%", 'info')
if att_id in self._active_downloads:
self._active_downloads[att_id]['progress'] = total_bytes
self._active_downloads[att_id]['size'] = total_bytes
self._active_downloads[att_id]['name'] = f"{att_name} (remuxing {int(pct * 100)}%)"
if creator_id and total_files:
self._update_download_status(creator_id, total_files)
except (ValueError, ZeroDivisionError):
pass
_, stderr_data = await process.communicate()
# Restore original name
if att_id in self._active_downloads:
self._active_downloads[att_id]['name'] = att_name
if process.returncode != 0:
error_msg = stderr_data.decode('utf-8', errors='replace')[-2000:]
raise Exception(f"ffmpeg remux failed: {error_msg}")
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
else:
# Standard ffmpeg approach for non-CloudFront HLS or DASH streams
# Parse total duration from playlist for progress tracking
total_duration_us = 0
if use_protocol_whitelist and temp_m3u8_path:
try:
with open(temp_m3u8_path, 'r') as f:
for line in f:
if line.startswith('#EXTINF:'):
duration_str = line.split(':')[1].split(',')[0]
total_duration_us += int(float(duration_str) * 1_000_000)
except Exception as e:
self.log(f"Could not parse playlist duration: {e}", 'debug')
# Build ffmpeg command
cmd = ['ffmpeg', '-y']
if use_protocol_whitelist:
cmd.extend(['-protocol_whitelist', 'file,http,https,tcp,tls'])
# For DASH/HLS with CloudFront signed params, extract them from the URL
# and pass as cookies so ffmpeg carries auth to ALL requests (manifest + segments)
if '?' in input_source and 'CloudFront-' in input_source:
from urllib.parse import urlparse as _urlparse, parse_qs as _parse_qs, urlunparse as _urlunparse
parsed_url = _urlparse(input_source)
domain = parsed_url.hostname
params = _parse_qs(parsed_url.query)
# Extract CloudFront params and build cookies
cf_cookies = []
remaining_params = []
for key, values in params.items():
val = values[0] if values else ''
if key.startswith('CloudFront-'):
cf_cookies.append(f"{key}={val}; path=/; domain={domain};\r\n")
else:
remaining_params.append(f"{key}={val}")
if cf_cookies:
cookie_str = ''.join(cf_cookies)
cmd.extend(['-cookies', cookie_str])
# Rebuild URL without CloudFront params
clean_query = '&'.join(remaining_params)
input_source = _urlunparse(parsed_url._replace(query=clean_query))
self.log(f"Passing {len(cf_cookies)} CloudFront cookies for {stream_type} stream", 'debug')
cmd.extend(['-i', input_source])
if total_duration_us > 0:
cmd.extend(['-progress', 'pipe:1', '-nostats'])
cmd.extend([
'-c', 'copy',
'-bsf:a', 'aac_adtstoasc',
'-movflags', '+faststart',
str(dest_path)
])
process = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stderr_data = b''
if total_duration_us > 0 and creator_id and total_files:
last_update = 0
stall_timeout = 120
while True:
try:
line = await asyncio.wait_for(process.stdout.readline(), timeout=stall_timeout)
except asyncio.TimeoutError:
self.log(f"ffmpeg stalled for {stall_timeout}s downloading {att_name}, killing", 'warning')
process.kill()
await process.wait()
raise Exception(f"ffmpeg stalled (no output for {stall_timeout}s)")
if not line:
break
line_str = line.decode('utf-8', errors='replace').strip()
if line_str.startswith('out_time_us='):
try:
current_us = int(line_str.split('=')[1])
if current_us > 0:
progress_pct = min(current_us / total_duration_us, 1.0)
try:
actual_size = dest_path.stat().st_size if dest_path.exists() else 0
except OSError:
actual_size = 0
estimated_total = int(actual_size / progress_pct) if progress_pct > 0.01 else 0
if progress_pct - last_update >= 0.02:
last_update = progress_pct
if att_id in self._active_downloads:
self._active_downloads[att_id]['progress'] = actual_size
self._active_downloads[att_id]['size'] = estimated_total
self._update_download_status(creator_id, total_files)
except (ValueError, ZeroDivisionError):
pass
_, stderr_data = await process.communicate()
else:
_, stderr_data = await process.communicate()
if process.returncode != 0:
error_msg = stderr_data.decode('utf-8', errors='replace')[-2000:]
raise Exception(f"ffmpeg failed: {error_msg}")
# Verify file was created
if not dest_path.exists():
raise Exception("Output file was not created")
file_size = dest_path.stat().st_size
if file_size == 0:
dest_path.unlink()
raise Exception("Output file is empty")
self.log(f"Downloaded {stream_type} stream: {att_name} ({self._format_bytes(file_size)})", 'info')
# Calculate file hash
file_hash = await self._compute_file_hash_async(dest_path)
# Update database
self.db.update_attachment_status(
att_id, 'completed',
local_path=str(dest_path),
local_filename=dest_path.name,
file_hash=file_hash,
file_size=file_size,
downloaded_at=datetime.now().isoformat()
)
# Get dimensions from the downloaded file
try:
probe_cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
'-show_entries', 'stream=width,height', '-of', 'csv=p=0', str(dest_path)]
result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
if result.returncode == 0 and result.stdout.strip():
parts = result.stdout.strip().split(',')
if len(parts) == 2:
width, height = int(parts[0]), int(parts[1])
self.db.update_attachment(att_id, {'width': width, 'height': height})
self.log(f"Video dimensions: {width}x{height}", 'debug')
except Exception as e:
self.log(f"Could not get video dimensions: {e}", 'debug')
# Update progress
if creator_id and total_files:
if att_id in self._active_downloads:
del self._active_downloads[att_id]
async with self._results_lock:
self._download_progress['completed'] += 1
self._download_progress['success'] += 1
self._update_download_status(creator_id, total_files)
return DownloadResult(
success=True,
file_path=str(dest_path),
file_hash=file_hash,
file_size=file_size
)
except Exception as e:
error = f"Stream download failed: {str(e)}"
self.log(f"Error downloading {att_name}: {error}", 'error')
self.db.update_attachment_status(att_id, 'failed',
error_message=error,
download_attempts=att.get('download_attempts', 0) + 1,
last_attempt=datetime.now().isoformat()
)
if creator_id and total_files:
if att_id in self._active_downloads:
del self._active_downloads[att_id]
async with self._results_lock:
self._download_progress['completed'] += 1
self._download_progress['failed'] += 1
self._update_download_status(creator_id, total_files)
return DownloadResult(success=False, error=error)
finally:
# Clean up temp playlist file
if temp_m3u8_path:
try:
import os
os.unlink(temp_m3u8_path)
except Exception:
pass
    async def _download_pending_embeds(self, creator_id: int, base_path: Path, creator: Dict) -> Dict:
        """Download pending embedded videos using yt-dlp.

        Iterates every embed row marked pending for this creator, downloads
        it into a per-post directory, and records success/failure in the DB.
        Per-embed errors are caught so one bad URL never aborts the batch.

        Args:
            creator_id: Database id of the creator whose embeds to fetch.
            base_path: Root directory for downloaded paid content.
            creator: Creator dict (reads 'platform' and 'username').

        Returns:
            Dict with 'downloaded' and 'failed' counts.
        """
        # NOTE(review): imported lazily — presumably to avoid an import
        # cycle or defer yt-dlp cost; confirm before hoisting to top level.
        from .embed_downloader import EmbedDownloader
        pending = self.db.get_pending_embeds(creator_id=creator_id)
        if not pending:
            return {'downloaded': 0, 'failed': 0}
        self.log(f"Downloading {len(pending)} embeds for {creator['username']}", 'info')
        downloader = EmbedDownloader(log_callback=self.log_callback)
        quality = self.config.get('embed_quality', 'best')
        downloaded = 0
        failed = 0
        for embed in pending:
            try:
                post = self.db.get_post(embed['post_id'])
                if not post:
                    # Orphaned embed (parent post missing) — nothing to
                    # anchor the output path to, so skip it.
                    continue
                # Build output directory:
                # <base>/<platform>/<username>/<YYYY-MM-DD>/<title_postid>/
                post_date = post.get('published_at', '')[:10] or 'unknown-date'
                post_title = self._sanitize_filename(post.get('title') or '')[:50]
                post_id_short = post.get('post_id', 'unknown')[:12]
                post_dir = f"{post_title}_{post_id_short}" if post_title else post_id_short
                output_dir = base_path / creator['platform'] / self._sanitize_filename(creator['username']) / post_date / post_dir
                # Mark in-flight before starting so the UI/DB reflects progress.
                self.db.update_embed_status(embed['id'], 'downloading')
                result = await downloader.download(embed['url'], output_dir, quality=quality)
                if result['success']:
                    self.db.update_embed_status(embed['id'], 'completed',
                        local_path=result.get('file_path'),
                        local_filename=result.get('filename'),
                        file_size=result.get('file_size'),
                        duration=result.get('duration'),
                        title=result.get('title'),
                        downloaded_at=datetime.now().isoformat()
                    )
                    downloaded += 1
                else:
                    self.db.update_embed_status(embed['id'], 'failed',
                        error_message=result.get('error'),
                        download_attempts=embed.get('download_attempts', 0) + 1
                    )
                    failed += 1
            except Exception as e:
                # Record the failure (with an incremented attempt counter)
                # and keep going with the remaining embeds.
                self.log(f"Error downloading embed {embed['url']}: {e}", 'error')
                self.db.update_embed_status(embed['id'], 'failed',
                    error_message=str(e),
                    download_attempts=embed.get('download_attempts', 0) + 1
                )
                failed += 1
        return {'downloaded': downloaded, 'failed': failed}
async def retry_failed_downloads(self, attachment_ids: List[int] = None) -> Dict:
"""Retry failed downloads"""
if attachment_ids:
# Retry specific attachments
pending = []
for att_id in attachment_ids:
att = self.db.get_attachment(att_id)
if att and att['status'] == 'failed':
pending.append(att)
else:
# Get all eligible failed downloads
max_attempts = self.config.get('retry_max_attempts', 3)
pending = self.db.get_failed_downloads(max_attempts=max_attempts)
if not pending:
return {'downloaded': 0, 'failed': 0, 'skipped': 0}
self.log(f"Retrying {len(pending)} failed downloads", 'info')
# Reset status to pending
for att in pending:
self.db.update_attachment_status(att['id'], 'pending')
# Group by creator
by_creator = {}
for att in pending:
cid = att['creator_db_id']
if cid not in by_creator:
by_creator[cid] = []
by_creator[cid].append(att)
total_downloaded = 0
total_failed = 0
total_skipped = 0
for creator_id, atts in by_creator.items():
result = await self.download_pending_for_creator(creator_id)
total_downloaded += result.get('downloaded', 0)
total_failed += result.get('failed', 0)
total_skipped += result.get('skipped', 0)
return {
'downloaded': total_downloaded,
'failed': total_failed,
'skipped': total_skipped
}
def _build_file_path(self, base_path: Path, creator: Dict, post: Dict, attachment: Dict) -> Path:
"""Build destination file path following directory structure"""
# /paid-content/onlyfans/creatorname/2024-01-15/Post-Title_abc123/001_originalname.jpg
platform = creator['platform']
username = self._sanitize_filename(creator['username'])
# Date directory
post_date = post.get('published_at', '')[:10] or 'unknown-date'
# Post directory - always use post_id for consistency
post_id = post.get('post_id', 'unknown')
post_dir = post_id
# Filename - for Fansly use just the media ID (already unique)
# For other platforms, use index prefix to avoid collisions
index = attachment.get('attachment_index', 0) + 1
original_name = attachment.get('name', '')
if original_name:
# Sanitize the original filename
sanitized_name = self._sanitize_filename(original_name)
# Fansly media IDs are unique, no index needed
if platform == 'fansly':
filename = sanitized_name
else:
# Add index prefix for other platforms
filename = f"{index:03d}_{sanitized_name}"
else:
# Fallback to index + extension if no name
ext = attachment.get('extension') or 'bin'
if not ext.startswith('.'):
ext = '.' + ext
if platform == 'fansly':
filename = f"attachment_{index}{ext}"
else:
filename = f"{index:03d}{ext}"
return base_path / platform / username / post_date / post_dir / filename
def _sanitize_filename(self, name: str) -> str:
"""Sanitize string for use in filename/directory"""
if not name:
return 'unnamed'
# Remove/replace invalid characters
name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
name = re.sub(r'\s+', '-', name.strip())
return name or 'unnamed'
def _build_message_file_path(self, base_path: Path, creator: Dict, message: Dict, attachment: Dict) -> Path:
"""Build destination file path for message attachments"""
# /paid-content/onlyfans/creatorname/messages/2024-01-15/001_originalname.jpg
platform = creator['platform']
username = self._sanitize_filename(creator['username'])
msg_date = (message.get('sent_at') or '')[:10] or 'unknown-date'
index = attachment.get('attachment_index', 0) + 1
original_name = attachment.get('name', '')
if original_name:
sanitized_name = self._sanitize_filename(original_name)
filename = f"{index:03d}_{sanitized_name}"
else:
ext = attachment.get('extension') or 'bin'
if not ext.startswith('.'):
ext = '.' + ext
filename = f"{index:03d}{ext}"
return base_path / platform / username / 'messages' / msg_date / filename
    async def _sync_messages_for_creator(self, creator: Dict, client, platform: str) -> int:
        """
        Sync direct messages for a creator.

        Fetches the DM thread from the platform API, upserts every message
        and its attachments into the database, and returns how many messages
        were new. Any error is logged and reported as zero new messages so a
        messaging failure never aborts the surrounding sync.

        Args:
            creator: Creator dict from database
            client: OnlyFansClient or FanslyDirectClient instance
            platform: 'onlyfans' or 'fansly'
        Returns:
            Count of new messages
        """
        creator_id = creator['id']
        new_messages = 0
        try:
            self.log(f"Syncing messages for {creator['username']} ({platform})", 'info')
            if platform == 'onlyfans':
                of_user_id = creator.get('creator_id', '')
                if not of_user_id:
                    return 0
                messages = await client.get_messages(of_user_id)
            elif platform == 'fansly':
                # Fansly messages live in chat groups: find the group whose
                # partner account matches this creator.
                chat_list = await client.get_chat_list()
                creator_account_id = creator.get('creator_id', '')
                group_id = None
                for chat in chat_list:
                    if str(chat.get('partner_account_id')) == str(creator_account_id):
                        group_id = chat['group_id']
                        break
                if not group_id:
                    self.log(f"No chat group found for {creator['username']}", 'debug')
                    return 0
                messages = await client.get_messages(group_id, creator_account_id)
            else:
                # Messaging is only implemented for the two platforms above.
                return 0
            if not messages:
                self.log(f"No messages for {creator['username']}", 'debug')
                return 0
            self.log(f"Processing {len(messages)} messages for {creator['username']}", 'info')
            for msg in messages:
                msg_data = msg.to_dict()
                # upsert_message returns (db_id, is_new); only new rows count.
                msg_db_id, is_new = self.db.upsert_message(creator_id, msg_data)
                if is_new:
                    new_messages += 1
                # Upsert attachments for this message
                for idx, attachment in enumerate(msg.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    self.db.upsert_message_attachment(msg_db_id, att_data)
            self.log(f"Synced {len(messages)} messages ({new_messages} new) for {creator['username']}", 'info')
            return new_messages
        except Exception as e:
            # Best-effort boundary: log and report zero rather than raise.
            self.log(f"Error syncing messages for {creator['username']}: {e}", 'error')
            return 0
def _extract_embeds(self, content: str) -> List[Tuple[str, str, str]]:
"""Extract embedded video URLs from post content"""
if not content:
return []
embeds = []
for pattern, platform in self.EMBED_PATTERNS:
for match in re.finditer(pattern, content):
url = match.group(0)
video_id = match.group(1)
embeds.append((url, platform, video_id))
return embeds
async def _compute_file_hash_async(self, file_path: Path) -> Optional[str]:
"""Compute SHA256 hash of file asynchronously"""
if not file_path.exists():
return None
sha256 = hashlib.sha256()
async with aiofiles.open(file_path, 'rb') as f:
while chunk := await f.read(65536):
sha256.update(chunk)
return sha256.hexdigest()
def _format_bytes(self, size: int) -> str:
"""Format bytes to human readable string"""
if size is None:
return "0 B"
for unit in ['B', 'KB', 'MB', 'GB']:
if abs(size) < 1024.0:
return f"{size:.1f} {unit}"
size /= 1024.0
return f"{size:.1f} TB"
@staticmethod
def _get_platform_display_name(creator: Dict) -> str:
"""Get clean platform display name for notifications."""
_PLATFORM_NAMES = {
'onlyfans_direct': 'OnlyFans',
'onlyfans': 'OnlyFans',
'fansly_direct': 'Fansly',
'fansly': 'Fansly',
'pornhub': 'Pornhub',
'youtube': 'YouTube',
'twitch': 'Twitch',
'coomer': 'Coomer',
'kemono': 'Kemono',
}
key = creator.get('service_id') or creator.get('platform') or 'Unknown'
return _PLATFORM_NAMES.get(key, key.replace('_', ' ').title())
    def _send_creator_notification(self, creator: Dict, new_posts: int, downloaded: int,
                                   downloaded_file_info: List[Dict], scheduled: bool = False,
                                   new_messages: int = 0):
        """Send push notification and create DB record for a creator sync.

        Only fires for scheduled syncs that produced downloads or new
        messages; manual syncs never notify. When push is disabled (or no
        notifier is configured) a DB notification record is still created.
        If the creator has a tagged-user filter, counts are recomputed from
        recently synced rows so only matching posts are reported.

        Args:
            creator: Creator dict with id, username, service_id, platform
            new_posts: Number of new posts found
            downloaded: Number of files downloaded
            downloaded_file_info: List of dicts with file info
            scheduled: Whether this is a scheduled sync
            new_messages: Number of new messages found
        """
        if not (scheduled and (downloaded > 0 or new_messages > 0)):
            return
        # If creator has a tagged-user filter, recount to only include matching posts
        filter_tagged = creator.get('filter_tagged_users', '') or ''
        filter_tagged = filter_tagged.strip()
        if filter_tagged and filter_tagged != '[]':
            try:
                import json as _json
                filter_users = _json.loads(filter_tagged)
                if isinstance(filter_users, list) and filter_users:
                    placeholders = ','.join(['?'] * len(filter_users))
                    with self.db.unified_db.get_connection() as conn:
                        cursor = conn.cursor()
                        # Count new posts that have at least one matching tagged user
                        # ("new" approximated as created within the last hour).
                        cursor.execute(f"""
                            SELECT COUNT(DISTINCT p.id)
                            FROM paid_content_posts p
                            JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
                            WHERE p.creator_id = ?
                            AND tu.username IN ({placeholders})
                            AND p.created_at >= datetime('now', '-1 hour')
                        """, (creator['id'], *filter_users))
                        filtered_new_posts = cursor.fetchone()[0]
                        # Count downloaded attachments from matching posts
                        cursor.execute(f"""
                            SELECT COUNT(DISTINCT a.id)
                            FROM paid_content_attachments a
                            JOIN paid_content_posts p ON a.post_id = p.id
                            JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
                            WHERE p.creator_id = ?
                            AND tu.username IN ({placeholders})
                            AND a.status = 'downloaded'
                            AND a.updated_at >= datetime('now', '-1 hour')
                        """, (creator['id'], *filter_users))
                        filtered_downloaded = cursor.fetchone()[0]
                    self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, "
                             f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug')
                    new_posts = filtered_new_posts
                    downloaded = filtered_downloaded
                    # Skip notification entirely if nothing matches the filter
                    if downloaded == 0 and new_messages == 0:
                        return
            except Exception as e:
                # A broken filter should not suppress the notification.
                self.log(f"Error applying notification filter: {e}", 'debug')
        self.log(f"Push check: scheduled={scheduled}, new_posts={new_posts}, downloaded={downloaded}, "
                 f"notifier={self.notifier is not None}, push_enabled={self.config.get('push_notifications_enabled')}", 'debug')
        if not (self.notifier and self.config.get('push_notifications_enabled')):
            # Still create DB notification even if push is disabled
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            platform = self._get_platform_display_name(creator)
            if downloaded > 0:
                title = f"💎 {downloaded} File{'s' if downloaded != 1 else ''} Downloaded"
            else:
                title = f"💬 {new_messages} New Message{'s' if new_messages != 1 else ''}"
            msg_lines = [f"📱 Platform: {platform}", f"📄 Creator: {creator['username']}"]
            if new_posts > 0:
                msg_lines.append(f"📝 Posts: {new_posts}")
            if new_messages > 0:
                msg_lines.append(f"💬 Messages: {new_messages}")
            msg_lines.append(f"\n{timestamp}")
            message = "\n".join(msg_lines)
            self.db.create_notification(
                notification_type='new_messages' if downloaded == 0 else 'new_content',
                creator_id=creator['id'],
                title=title,
                message=message,
                download_count=new_posts,
                file_count=downloaded,
                media_files=downloaded_file_info[:5] if downloaded_file_info else None
            )
            return
        # Count images vs videos vs audio and collect media files for preview
        import random
        from pathlib import Path
        image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
        video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.m4v'}
        audio_extensions = {'.mp3', '.wav', '.flac', '.aac', '.m4a', '.ogg', '.wma'}
        image_files = []
        video_files = []
        image_count = 0
        video_count = 0
        audio_count = 0
        for file_info in downloaded_file_info:
            file_path = file_info.get('file_path') or file_info.get('local_path')
            if file_path:
                path = Path(file_path)
                ext = path.suffix.lower()
                # Prefer content_type from file_info (set by scraper) over extension
                content_type = file_info.get('content_type', '').lower()
                if content_type == 'audio':
                    audio_count += 1
                elif content_type == 'image':
                    image_count += 1
                    if path.exists():
                        image_files.append(str(path))
                elif content_type == 'video':
                    video_count += 1
                    if path.exists():
                        video_files.append(str(path))
                elif ext in audio_extensions:
                    audio_count += 1
                elif ext in image_extensions:
                    image_count += 1
                    if path.exists():
                        image_files.append(str(path))
                elif ext in video_extensions:
                    video_count += 1
                    if path.exists():
                        video_files.append(str(path))
        # Select preview: prefer images, but extract frame from video if no images
        image_path = None
        temp_frame_path = None
        if image_files:
            image_path = random.choice(image_files)
        elif video_files and hasattr(self.notifier, '_extract_random_video_frame'):
            selected_video = random.choice(video_files)
            self.log(f"Extracting frame from video for notification preview: {Path(selected_video).name}", 'debug')
            temp_frame_path = self.notifier._extract_random_video_frame(selected_video)
            if temp_frame_path:
                image_path = temp_frame_path
                self.log(f"Successfully extracted video frame for preview", 'debug')
        # Build title with counts
        title_parts = []
        if image_count > 0:
            title_parts.append(f"📸 {image_count} Image{'s' if image_count != 1 else ''}")
        if video_count > 0:
            title_parts.append(f"🎬 {video_count} Video{'s' if video_count != 1 else ''}")
        if audio_count > 0:
            title_parts.append(f"🎵 {audio_count} Audio")
        if downloaded > 0 and not title_parts:
            title_parts.append(f"💎 {downloaded} File{'s' if downloaded != 1 else ''}")
        if title_parts:
            title = " + ".join(title_parts) + " Downloaded"
        elif new_messages > 0:
            title = f"💬 {new_messages} New Message{'s' if new_messages != 1 else ''}"
        else:
            title = f"💎 {downloaded} File{'s' if downloaded != 1 else ''} Downloaded"
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        platform = self._get_platform_display_name(creator)
        msg_lines = [f"📱 Platform: {platform}", f"📄 Creator: {creator['username']}"]
        if new_posts > 0:
            msg_lines.append(f"📝 Posts: {new_posts}")
        if new_messages > 0:
            msg_lines.append(f"💬 Messages: {new_messages}")
        msg_lines.append(f"\n{timestamp}")
        push_message = "\n".join(msg_lines)
        try:
            # Messages-only notifications are sent at lower priority (-1).
            self.notifier.send_notification(
                title=title,
                message=push_message,
                priority=-1 if downloaded == 0 else 0,
                image_path=image_path
            )
        except Exception as e:
            self.log(f"Failed to send push notification: {e}", 'warning')
        # Clean up temp frame file if created
        if temp_frame_path:
            try:
                Path(temp_frame_path).unlink(missing_ok=True)
            except Exception:
                pass
        # Create notification record in database
        self.db.create_notification(
            notification_type='new_messages' if downloaded == 0 else 'new_content',
            creator_id=creator['id'],
            title=title,
            message=push_message,
            download_count=new_posts,
            file_count=downloaded,
            media_files=downloaded_file_info[:5] if downloaded_file_info else None
        )
async def search_creators(self, service_id: str, query: str, platform: str = None) -> List[Dict]:
"""Search for creators on a service"""
client = self._get_client(service_id)
return await client.search_creators(query, platform)
async def add_creator(self, service_id: str, platform: str, creator_id: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a new creator to track"""
# Check if already exists
existing = self.db.get_creator_by_api_id(service_id, platform, creator_id)
if existing:
return {'success': False, 'error': 'Creator already tracked', 'creator': existing}
# Handle YouTube channels
if service_id == 'youtube':
return await self._add_youtube_creator(creator_id, auto_download, download_embeds)
# Handle Twitch channels
if service_id == 'twitch':
return await self._add_twitch_creator(creator_id, auto_download, download_embeds)
# Handle Fansly Direct
if service_id == 'fansly_direct':
return await self._add_fansly_direct_creator(creator_id, auto_download, download_embeds)
# Handle OnlyFans Direct
if service_id == 'onlyfans_direct':
return await self._add_onlyfans_direct_creator(creator_id, auto_download, download_embeds)
# Handle Pornhub
if service_id == 'pornhub':
return await self._add_pornhub_creator(creator_id, auto_download, download_embeds)
# Handle XHamster
if service_id == 'xhamster':
return await self._add_xhamster_creator(creator_id, auto_download, download_embeds)
# Handle TikTok
if service_id == 'tiktok':
return await self._add_tiktok_creator(creator_id, auto_download, download_embeds)
# Handle Instagram
if service_id == 'instagram':
return await self._add_instagram_creator(creator_id, auto_download, download_embeds)
# Handle Soundgasm
if service_id == 'soundgasm':
return await self._add_soundgasm_creator(creator_id, auto_download, download_embeds)
# Handle BestEyeCandy
if service_id == 'besteyecandy':
return await self._add_besteyecandy_creator(creator_id, auto_download, download_embeds)
# Handle Bellazon
if service_id == 'bellazon':
return await self._add_bellazon_creator(creator_id, auto_download, download_embeds)
# Handle Snapchat
if service_id == 'snapchat':
return await self._add_snapchat_creator(creator_id, auto_download, download_embeds)
# Handle Reddit
if service_id == 'reddit':
return await self._add_reddit_creator(creator_id, auto_download, download_embeds)
# Handle Coppermine galleries
if service_id == 'coppermine':
return await self._add_coppermine_creator(creator_id, auto_download, download_embeds)
# Handle XenForo forums (HQCelebCorner, PicturePub, etc.)
if service_id in self.XENFORO_FORUMS:
return await self._add_xenforo_creator(service_id, creator_id, auto_download, download_embeds)
# Fetch creator info from API
client = self._get_client(service_id)
creator_info = await client.get_creator(platform, creator_id)
if not creator_info:
return {'success': False, 'error': f'Creator not found on {service_id}'}
# Add to database
creator_data = creator_info.to_dict()
creator_data['auto_download'] = 1 if auto_download else 0
creator_data['download_embeds'] = 1 if download_embeds else 0
db_id = self.db.add_creator(creator_data)
return {
'success': True,
'creator': {
'id': db_id,
**creator_data
}
}
async def _add_youtube_creator(self, channel_id_or_url: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a YouTube channel as a creator"""
youtube = self._get_youtube_client()
if not youtube.is_available():
return {'success': False, 'error': 'yt-dlp not available'}
# Extract channel ID from URL if necessary
if 'youtube.com' in channel_id_or_url or 'youtu.be' in channel_id_or_url:
extracted_id = youtube.extract_channel_id(channel_id_or_url)
if extracted_id:
channel_id = extracted_id
channel_url = channel_id_or_url
else:
return {'success': False, 'error': 'Could not extract channel ID from URL'}
else:
channel_id = channel_id_or_url
channel_url = youtube.normalize_channel_url(channel_id)
# Fetch channel info
creator_info = await youtube.get_creator(channel_url)
if not creator_info:
return {'success': False, 'error': 'YouTube channel not found'}
# Add to database
creator_data = creator_info.to_dict()
creator_data['auto_download'] = 1 if auto_download else 0
creator_data['download_embeds'] = 1 if download_embeds else 0
db_id = self.db.add_creator(creator_data)
return {
'success': True,
'creator': {
'id': db_id,
**creator_data
}
}
async def _add_pornhub_creator(self, creator_id_str: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a Pornhub creator (pornstar/channel/user/model)"""
pornhub = self._get_pornhub_client()
if not pornhub.is_available():
return {'success': False, 'error': 'yt-dlp not available'}
# creator_id_str is in 'type/name' format from URL parser
creator_url = pornhub.normalize_creator_url(creator_id_str)
# Fetch creator info
creator_info = await pornhub.get_creator(creator_url)
if not creator_info:
return {'success': False, 'error': 'Pornhub creator not found'}
# Add to database
creator_data = creator_info.to_dict()
creator_data['auto_download'] = 1 if auto_download else 0
creator_data['download_embeds'] = 1 if download_embeds else 0
db_id = self.db.add_creator(creator_data)
return {
'success': True,
'creator': {
'id': db_id,
**creator_data
}
}
async def _add_twitch_creator(self, channel_name_or_url: str,
auto_download: bool = True, download_embeds: bool = True) -> Dict:
"""Add a Twitch channel as a creator"""
twitch = self._get_twitch_client()
if not twitch.is_available():
return {'success': False, 'error': 'yt-dlp not available'}
# Extract channel name from URL if necessary
if 'twitch.tv' in channel_name_or_url:
extracted_name = twitch.extract_channel_name(channel_name_or_url)
if extracted_name:
channel_name = extracted_name
else:
return {'success': False, 'error': 'Could not extract channel name from URL'}
else:
channel_name = channel_name_or_url.lower()
# Fetch channel info
channel_url = f"https://www.twitch.tv/{channel_name}/clips"
creator_info = await twitch.get_creator(channel_url)
if not creator_info:
return {'success': False, 'error': 'Twitch channel not found or has no clips'}
# Add to database
creator_data = creator_info.to_dict()
creator_data['auto_download'] = 1 if auto_download else 0
creator_data['download_embeds'] = 1 if download_embeds else 0
db_id = self.db.add_creator(creator_data)
return {
'success': True,
'creator': {
'id': db_id,
**creator_data
}
}
def _get_fansly_direct_client(self) -> Optional[FanslyDirectClient]:
"""Get or create Fansly Direct client"""
if self._fansly_direct_client is None:
# Get auth token from fansly_direct service
service = self.db.get_service('fansly_direct')
auth_token = service.get('session_cookie') if service else None
if not auth_token:
self.log("Fansly Direct auth token not configured", 'warning')
return None
self._fansly_direct_client = FanslyDirectClient(
auth_token=auth_token,
log_callback=self.log_callback
)
return self._fansly_direct_client
def _apply_auto_tag_rules(self, post_db_id: int, is_new_post: bool):
"""Apply auto-tag rules to a post (only for new posts)"""
if not is_new_post:
return
try:
self.db.apply_auto_tags_to_post(post_db_id)
except Exception:
pass # Don't let auto-tagging errors break sync
def _get_or_create_ppv_tag(self) -> Optional[Dict]:
"""Get or create the PPV tag for locked content"""
# Try to get existing PPV tag
tag = self.db.get_tag_by_slug('ppv')
if tag:
return tag
# Create PPV tag with a distinct color (orange/gold for premium content)
tag_id = self.db.create_tag(
name='PPV',
color='#f59e0b', # Amber/orange color
description='Pay-per-view content requiring manual import'
)
if tag_id:
return self.db.get_tag(tag_id)
return None
    async def _add_fansly_direct_creator(
        self,
        username: str,
        auto_download: bool = True,
        download_embeds: bool = True
    ) -> Dict:
        """Add a Fansly creator via direct API.

        Looks the account up with the authenticated Fansly client, caches
        the avatar/banner images locally, and inserts the creator row.

        Args:
            username: Fansly handle to add.
            auto_download: Whether new posts should be auto-downloaded.
            download_embeds: Whether embedded videos should be downloaded.

        Returns:
            {'success': True, 'creator': {...}} on success, otherwise
            {'success': False, 'error': ...} (with 'creator' included when
            the account is already tracked).
        """
        client = self._get_fansly_direct_client()
        if not client:
            return {'success': False, 'error': 'Fansly auth token not configured. Please set it in Settings.'}
        # Check if creator with this username already exists
        existing = self.db.get_creator_by_api_id('fansly_direct', 'fansly', username)
        if existing:
            return {'success': False, 'error': 'Creator already tracked', 'creator': existing}
        # Fetch account info from Fansly API
        account = await client.get_account_info(username)
        if not account:
            return {'success': False, 'error': f'Fansly account not found: {username}'}
        # Cache profile images locally; fall back to the remote URL below if
        # caching returns None.
        fansly_creator_id = account.get('account_id') or username
        cached_avatar = await self._cache_profile_image(account.get('avatar_url'), 'fansly', fansly_creator_id, 'avatar') if account.get('avatar_url') else None
        cached_banner = await self._cache_profile_image(account.get('banner_url'), 'fansly', fansly_creator_id, 'banner') if account.get('banner_url') else None
        # Create creator data
        creator_data = {
            'service_id': 'fansly_direct',
            'platform': 'fansly',
            'creator_id': fansly_creator_id,
            'username': account.get('username') or username,
            'display_name': account.get('display_name'),
            'profile_image_url': cached_avatar or account.get('avatar_url'),
            'banner_image_url': cached_banner or account.get('banner_url'),
            'auto_download': 1 if auto_download else 0,
            'download_embeds': 1 if download_embeds else 0,
        }
        db_id = self.db.add_creator(creator_data)
        return {
            'success': True,
            'creator': {
                'id': db_id,
                **creator_data
            }
        }
async def _sync_fansly_direct_creator(
self,
creator: Dict,
download: bool = True,
scheduled: bool = False,
date_from: str = None,
date_to: str = None,
days_back: int = None
) -> SyncResult:
"""
Sync a Fansly creator via direct API.
Args:
creator: Creator dict from database
download: Whether to download files after syncing
scheduled: If True, create notifications (for scheduled syncs only)
date_from: Only fetch posts after this date (ISO format)
date_to: Only fetch posts before this date (ISO format)
days_back: Fetch posts from the last N days
"""
creator_id = creator['id']
self.log(f"Syncing Fansly creator via direct API: {creator['username']}", 'info')
# Register active sync for polling-based updates
sync_data = {
'username': creator['username'],
'platform': 'fansly',
'service': 'fansly_direct',
'status': 'Fetching posts...',
'phase': 'fetching',
'failed': 0
}
self._register_active_sync(creator_id, sync_data)
# Emit WebSocket event
self._emit_event('paid_content_sync_started', {
'creator_id': creator_id,
**sync_data
})
try:
client = self._get_fansly_direct_client()
if not client:
error = "Fansly auth token not configured"
self._unregister_active_sync(creator_id)
return SyncResult(success=False, error=error)
# Fetch and update creator profile (display name, avatar, banner, bio, etc.)
try:
account_info = await client.get_account_info(creator['username'])
if account_info:
profile_updates = {}
if account_info.get('display_name'):
profile_updates['display_name'] = account_info['display_name']
if account_info.get('avatar_url'):
cached = await self._cache_profile_image(account_info['avatar_url'], 'fansly', creator['creator_id'], 'avatar')
profile_updates['profile_image_url'] = cached or account_info['avatar_url']
if account_info.get('banner_url'):
cached = await self._cache_profile_image(account_info['banner_url'], 'fansly', creator['creator_id'], 'banner')
profile_updates['banner_image_url'] = cached or account_info['banner_url']
if account_info.get('bio'):
profile_updates['bio'] = account_info['bio']
if account_info.get('location'):
profile_updates['location'] = account_info['location']
if account_info.get('external_links'):
profile_updates['external_links'] = account_info['external_links']
if profile_updates:
self.db.update_creator(creator_id, profile_updates)
self.log(f"Updated creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
except Exception as e:
self.log(f"Failed to update creator profile: {e}", 'warning')
# Determine date filter
# Priority: explicit date_from/date_to > days_back > scheduled default (3 days) > last_post_date
since_date = date_from
until_date = date_to
if days_back:
from datetime import timedelta
since_date = (datetime.now() - timedelta(days=days_back)).isoformat()
elif not since_date:
from datetime import timedelta
if scheduled:
# Scheduled syncs only check last 3 days for efficiency
since_date = (datetime.now() - timedelta(days=3)).isoformat()
else:
# Manual syncs use incremental from last post date
since_date = creator.get('last_post_date')
# Progress callback
def progress_callback(page: int, total_posts: int):
self._update_active_sync(creator_id, {
'status': f'Fetched {total_posts} posts (page {page})...',
'posts_fetched': total_posts,
'page': page
})
self._emit_event('paid_content_sync_progress', {
'creator_id': creator_id,
'username': creator['username'],
'status': f'Fetched {total_posts} posts (page {page})...',
'phase': 'fetching',
'posts_fetched': total_posts,
'page': page
})
# Fetch posts from Fansly
posts = await client.get_posts(
username=creator['username'],
since_date=since_date,
until_date=until_date,
progress_callback=progress_callback
)
if not posts:
self.log(f"No new posts for {creator['username']}", 'debug')
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
# Still download any pending attachments
downloaded = 0
failed = 0
if download and creator.get('auto_download', True):
pending_count = self.db.get_pending_attachment_count(creator_id)
if pending_count > 0:
self.log(f"Downloading {pending_count} pending attachments", 'info')
self._update_active_sync(creator_id, {
'status': f'Downloading {pending_count} pending files...',
'phase': 'downloading',
'total_files': pending_count
})
result = await self.download_pending_for_creator(creator_id)
downloaded = result.get('downloaded', 0)
failed = result.get('failed', 0)
self._unregister_active_sync(creator_id)
self._emit_event('paid_content_sync_completed', {
'creator_id': creator_id,
'username': creator['username'],
'new_posts': 0,
'new_attachments': 0,
'downloaded': downloaded,
'failed': failed
})
return SyncResult(success=True, new_posts=0, new_attachments=0,
downloaded_files=downloaded, failed_files=failed)
self.log(f"Found {len(posts)} posts for {creator['username']}", 'info')
# Update status
self._update_active_sync(creator_id, {
'status': f'Processing {len(posts)} posts...',
'phase': 'processing',
'total_posts': len(posts)
})
new_posts = 0
new_attachments = 0
for i, post in enumerate(posts):
# Update progress periodically
if (i + 1) % 10 == 0:
self._update_active_sync(creator_id, {
'status': f'Processing post {i + 1}/{len(posts)}...',
'phase': 'processing'
})
# Insert/update post in database
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
if post_db_id:
if is_new_post:
new_posts += 1
# Check if post has PPV content (attachments without download URL)
has_ppv = any(att.download_url is None for att in post.attachments)
# Insert attachments
for idx, attachment in enumerate(post.attachments):
att_data = attachment.to_dict()
att_data['attachment_index'] = idx
# Mark PPV attachments as unavailable so they don't enter the download queue
if attachment.download_url is None:
att_data['status'] = 'unavailable'
if self.db.upsert_attachment(post_db_id, att_data):
new_attachments += 1
# Tag as PPV if it has locked content
if has_ppv:
ppv_tag = self._get_or_create_ppv_tag()
if ppv_tag:
self.db.add_tag_to_post(post_db_id, ppv_tag['id'])
self._apply_auto_tag_rules(post_db_id, is_new_post)
# Update pinned posts in DB (handles posts outside date range too)
if hasattr(client, '_last_pinned_posts') and client._last_pinned_posts:
self.db.update_pinned_posts(creator_id, client._last_pinned_posts)
# Sync messages
new_messages = await self._sync_messages_for_creator(creator, client, 'fansly')
# Update creator stats
latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
self.db.update_creator(creator_id, {
'last_checked': datetime.now().isoformat(),
'last_post_date': latest_post_date or creator.get('last_post_date'),
'post_count': self.db.get_creator_post_count(creator_id)
})
# Download if enabled
downloaded = 0
failed = 0
downloaded_file_info = []
if download and creator.get('auto_download', True):
result = await self.download_pending_for_creator(creator_id)
downloaded = result.get('downloaded', 0)
failed = result.get('failed', 0)
downloaded_file_info = result.get('downloaded_file_info', [])
# Unregister from active syncs
self._unregister_active_sync(creator_id)
# Emit completed event
self._emit_event('paid_content_sync_completed', {
'creator_id': creator_id,
'username': creator['username'],
'new_posts': new_posts,
'new_attachments': new_attachments,
'downloaded': downloaded,
'failed': failed
})
# Send push notification for new downloads or messages
self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled, new_messages=new_messages)
return SyncResult(
success=True,
new_posts=new_posts,
new_attachments=new_attachments,
downloaded_files=downloaded,
failed_files=failed,
downloaded_file_info=downloaded_file_info
)
except Exception as e:
self.log(f"Error syncing Fansly creator {creator['username']}: {e}", 'error')
self._unregister_active_sync(creator_id)
self._emit_event('paid_content_sync_error', {
'creator_id': creator_id,
'username': creator['username'],
'error': str(e)
})
return SyncResult(success=False, error=str(e))
# =========================================================================
# OnlyFans Direct
# =========================================================================
def _get_onlyfans_direct_client(self) -> Optional[OnlyFansClient]:
    """Get or create the cached OnlyFans Direct client.

    Credentials live in the ``onlyfans_direct`` service row: the
    ``session_cookie`` column holds a JSON blob (cookies plus an optional
    ``signing_url``).

    Returns:
        A configured OnlyFansClient, or None when credentials are missing
        or malformed. Errors are logged, never raised.
    """
    # Fix: dropped the redundant function-local `import json` — the module
    # already imports json at the top of the file.
    if self._onlyfans_direct_client is None:
        service = self.db.get_service('onlyfans_direct')
        if not service or not service.get('session_cookie'):
            self.log("OnlyFans Direct credentials not configured", 'warning')
            return None
        # Auth config is stored as JSON in session_cookie
        raw = service['session_cookie']
        try:
            auth_config = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            self.log("OnlyFans Direct credentials: invalid JSON in session_cookie", 'error')
            return None
        if not auth_config.get('sess'):
            # The 'sess' cookie is the minimum required auth material
            self.log("OnlyFans Direct: 'sess' cookie not set", 'warning')
            return None
        signing_url = auth_config.get('signing_url')
        self._onlyfans_direct_client = OnlyFansClient(
            auth_config=auth_config,
            signing_url=signing_url,
            log_callback=self.log_callback,
        )
    return self._onlyfans_direct_client
async def _add_onlyfans_direct_creator(
    self,
    username: str,
    auto_download: bool = True,
    download_embeds: bool = True,
) -> Dict:
    """Add an OnlyFans creator via direct API"""
    api = self._get_onlyfans_direct_client()
    if not api:
        return {'success': False, 'error': 'OnlyFans credentials not configured. Please set them in Settings.'}
    # Refuse duplicates: bail out if this creator is already in the database
    duplicate = self.db.get_creator_by_api_id('onlyfans_direct', 'onlyfans', username)
    if duplicate:
        return {'success': False, 'error': 'Creator already tracked', 'creator': duplicate}
    # Resolve the profile on OnlyFans itself
    info = await api.get_user_info(username)
    if not info:
        return {'success': False, 'error': f'OnlyFans user not found: {username}'}
    # Cache avatar/banner locally so we serve them ourselves instead of
    # hot-linking the CDN; fall back to the remote URL if caching fails.
    remote_id = info.get('user_id') or username
    avatar_src = info.get('avatar_url')
    banner_src = info.get('banner_url')
    cached_avatar = await self._cache_profile_image(avatar_src, 'onlyfans', remote_id, 'avatar') if avatar_src else None
    cached_banner = await self._cache_profile_image(banner_src, 'onlyfans', remote_id, 'banner') if banner_src else None
    record = {
        'service_id': 'onlyfans_direct',
        'platform': 'onlyfans',
        'creator_id': remote_id,
        'username': info.get('username') or username,
        'display_name': info.get('display_name'),
        'profile_image_url': cached_avatar or avatar_src,
        'banner_image_url': cached_banner or banner_src,
        'bio': info.get('bio'),
        'joined_date': info.get('join_date'),
        'auto_download': 1 if auto_download else 0,
        'download_embeds': 1 if download_embeds else 0,
    }
    row_id = self.db.add_creator(record)
    return {
        'success': True,
        'creator': {
            'id': row_id,
            **record,
        },
    }
async def _sync_onlyfans_direct_creator(
    self,
    creator: Dict,
    download: bool = True,
    scheduled: bool = False,
    date_from: str = None,
    date_to: str = None,
    days_back: int = None,
) -> SyncResult:
    """
    Sync an OnlyFans creator via direct API.
    Follows the exact same pattern as _sync_fansly_direct_creator.

    Args:
        creator: Creator row from the database ('id', 'username', 'creator_id', ...).
        download: When True and the creator has auto_download set, pending
            attachments are downloaded after the metadata sync.
        scheduled: Scheduler-driven run; narrows the default fetch window to
            3 days and is forwarded to the notification helper.
        date_from: Explicit lower ISO-date bound for the post fetch.
        date_to: Explicit upper ISO-date bound for the post fetch.
        days_back: Overrides date_from with now() - days_back.

    Returns:
        SyncResult with new-post/attachment counts and download stats, or
        success=False plus an error message on failure. Active-sync state is
        always unregistered before returning.
    """
    creator_id = creator['id']
    self.log(f"Syncing OnlyFans creator via direct API: {creator['username']}", 'info')
    # Register active sync (polled by the UI; mirrored over WebSocket below)
    sync_data = {
        'username': creator['username'],
        'platform': 'onlyfans',
        'service': 'onlyfans_direct',
        'status': 'Fetching posts...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data,
    })
    try:
        client = self._get_onlyfans_direct_client()
        if not client:
            error = "OnlyFans credentials not configured"
            self._unregister_active_sync(creator_id)
            return SyncResult(success=False, error=error)
        # Fetch and update creator profile (best-effort; sync continues on failure)
        try:
            user_info = await client.get_user_info(creator['username'])
            if user_info:
                profile_updates = {}
                if user_info.get('display_name'):
                    profile_updates['display_name'] = user_info['display_name']
                if user_info.get('avatar_url'):
                    # Cache images locally; fall back to remote URL if caching fails
                    cached = await self._cache_profile_image(user_info['avatar_url'], 'onlyfans', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or user_info['avatar_url']
                if user_info.get('banner_url'):
                    cached = await self._cache_profile_image(user_info['banner_url'], 'onlyfans', creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or user_info['banner_url']
                if user_info.get('bio'):
                    profile_updates['bio'] = user_info['bio']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update creator profile: {e}", 'warning')
        # Determine date filter.
        # Priority: explicit date_from/date_to > days_back > scheduled default (3 days).
        since_date = date_from
        until_date = date_to
        if days_back:
            from datetime import timedelta
            since_date = (datetime.now() - timedelta(days=days_back)).isoformat()
        elif not since_date:
            from datetime import timedelta
            if scheduled:
                since_date = (datetime.now() - timedelta(days=3)).isoformat()
            # Manual resync: fetch all posts (no date filter) so old
            # data from previous sources (e.g. Coomer) gets overwritten
        # We need user_id for the posts endpoint; get it from creator or fetch it
        of_user_id = creator.get('creator_id', '')
        if not of_user_id or of_user_id == creator['username']:
            # Fetch user info to get numeric ID
            # NOTE(review): assumes get_user_info returns a dict containing
            # 'user_id' (consistent with usage in _add_onlyfans_direct_creator)
            info = await client.get_user_info(creator['username'])
            if info:
                of_user_id = info['user_id']
            else:
                self._unregister_active_sync(creator_id)
                return SyncResult(success=False, error=f"Could not resolve user ID for {creator['username']}")
        # Progress callback forwarded into the client's pagination loop
        def progress_callback(page: int, total_posts: int):
            self._update_active_sync(creator_id, {
                'status': f'Fetched {total_posts} posts (page {page})...',
                'posts_fetched': total_posts,
                'page': page,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {total_posts} posts (page {page})...',
                'phase': 'fetching',
                'posts_fetched': total_posts,
                'page': page,
            })
        # Fetch posts
        posts = await client.get_posts(
            user_id=of_user_id,
            username=creator['username'],
            since_date=since_date,
            until_date=until_date,
            progress_callback=progress_callback,
        )
        if not posts:
            # Nothing new — but previously-queued attachments may still need
            # downloading, so handle them before finishing the sync.
            self.log(f"No new posts for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            # Still download pending attachments
            downloaded = 0
            failed = 0
            if download and creator.get('auto_download', True):
                pending_count = self.db.get_pending_attachment_count(creator_id)
                if pending_count > 0:
                    self.log(f"Downloading {pending_count} pending attachments", 'info')
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {pending_count} pending files...',
                        'phase': 'downloading',
                        'total_files': pending_count,
                    })
                    result = await self.download_pending_for_creator(creator_id)
                    downloaded = result.get('downloaded', 0)
                    failed = result.get('failed', 0)
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': downloaded,
                'failed': failed,
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0,
                              downloaded_files=downloaded, failed_files=failed)
        self.log(f"Found {len(posts)} posts for {creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing',
            'total_posts': len(posts),
        })
        new_posts = 0
        new_attachments = 0
        for i, post in enumerate(posts):
            # Throttled progress update every 10 posts
            if (i + 1) % 10 == 0:
                self._update_active_sync(creator_id, {
                    'status': f'Processing post {i + 1}/{len(posts)}...',
                    'phase': 'processing',
                })
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                # Check for PPV content: locked media has no download URL
                # (or is flagged as a preview)
                has_ppv = any(att.download_url is None or getattr(att, 'is_preview', False) for att in post.attachments)
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    # Mark PPV attachments as unavailable so they don't enter the download queue
                    if attachment.download_url is None:
                        att_data['status'] = 'unavailable'
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1
                if has_ppv:
                    ppv_tag = self._get_or_create_ppv_tag()
                    if ppv_tag:
                        self.db.add_tag_to_post(post_db_id, ppv_tag['id'])
                self._apply_auto_tag_rules(post_db_id, is_new_post)
        # Sync messages
        new_messages = await self._sync_messages_for_creator(creator, client, 'onlyfans')
        # Update creator stats (keep prior last_post_date if none of the
        # fetched posts carried a published_at)
        latest_post_date = max(
            (p.published_at for p in posts if p.published_at), default=None
        ) if posts else None
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id),
        })
        # Upgrade DRM preview frames from Coomer (actual video clips)
        try:
            upgraded = await self._upgrade_drm_from_coomer(creator_id, creator['username'], 'onlyfans')
            if upgraded > 0:
                self.log(f"Upgraded {upgraded} preview frames to video from Coomer", 'info')
        except Exception as e:
            self.log(f"Coomer fallback check failed (non-critical): {e}", 'warning')
        # Download if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self.download_pending_for_creator(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed,
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled, new_messages=new_messages)
        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info,
        )
    except Exception as e:
        # Any failure: log, tear down the active-sync entry, surface the
        # error over WebSocket, and return a failed SyncResult.
        self.log(f"Error syncing OnlyFans creator {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e),
        })
        return SyncResult(success=False, error=str(e))
async def _upgrade_drm_from_coomer(self, creator_id: int, username: str, platform: str = 'onlyfans') -> int:
    """Replace DRM preview frame JPGs with actual preview video clips from Coomer.

    OnlyFans DRM videos only yield a still-frame JPG via direct API. Coomer archives
    the actual free preview video clip. This method finds completed "video" attachments
    that are actually image files (preview frames) and replaces them with the real
    video from Coomer.

    Only checks once per day per creator to avoid redundant API calls.

    Args:
        creator_id: Local DB id of the creator.
        username: Creator handle used to look posts up on Coomer.
        platform: Coomer service name the posts live under (default 'onlyfans').

    Returns:
        The number of upgraded attachments (0 when skipped or nothing matched).
    """
    # Check cooldown — only query Coomer once per day per creator
    creator = self.db.get_creator(creator_id)
    if creator and creator.get('last_coomer_check'):
        from datetime import timedelta
        try:
            last_check = datetime.fromisoformat(creator['last_coomer_check'])
            # last_coomer_check is stored tz-aware (see bottom of this method);
            # a naive stored value raises TypeError here and we fall through.
            if datetime.now().astimezone() - last_check < timedelta(hours=24):
                self.log(f"Coomer fallback: skipping (last checked {creator['last_coomer_check']})", 'debug')
                return 0
        except (ValueError, TypeError):
            pass  # Invalid timestamp, proceed with check
    # Revert failed Coomer upgrades back to original preview frame state.
    # server_path is preserved as /onlyfans/{media_id} during upgrade, so we can
    # reconstruct the original name/extension. Setting status to 'skipped' lets the
    # next OF sync refresh the download_url and reset to pending automatically.
    with self.db.unified_db.get_connection() as conn:
        cursor = conn.cursor()
        # '\_' with ESCAPE matches a literal underscore (not the LIKE wildcard)
        cursor.execute("""
            SELECT a.id, a.name, a.server_path, p.post_id
            FROM paid_content_attachments a
            JOIN paid_content_posts p ON a.post_id = p.id
            WHERE p.creator_id = ?
              AND a.file_type = 'video'
              AND a.status = 'failed'
              AND a.name LIKE '%\\_source.mp4' ESCAPE '\\'
        """, (creator_id,))
        failed_coomer = [dict(row) for row in cursor.fetchall()]
    if failed_coomer:
        reverted = 0
        for att in failed_coomer:
            # Reconstruct original preview frame name from preserved server_path
            # e.g. /onlyfans/3878367393 -> 3878367393.jpg
            media_id = att['server_path'].rsplit('/', 1)[-1] if att['server_path'] else ''
            original_name = f"{media_id}.jpg" if media_id else att['name']
            self.db.update_attachment(att['id'], {
                'name': original_name,
                'extension': 'jpg',
                'status': 'skipped',
                'download_url': None,
                'local_path': None,
                'local_filename': None,
                'file_hash': None,
                'perceptual_hash': None,
                'download_attempts': 0,
                'error_message': 'Coomer CDN unavailable - will restore on next sync',
            })
            reverted += 1
        self.log(f"Reverted {reverted} failed Coomer downloads to preview frames", 'info')
    # Find preview frame attachments: file_type='video' but extension is an image format
    # These are DRM videos where we could only download the preview frame
    with self.db.unified_db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT a.id, a.name, a.local_path, a.extension, p.post_id, p.id as post_db_id
            FROM paid_content_attachments a
            JOIN paid_content_posts p ON a.post_id = p.id
            WHERE p.creator_id = ?
              AND a.file_type = 'video'
              AND a.status = 'completed'
              AND lower(a.extension) IN ('jpg', 'jpeg', 'png', 'webp', 'gif')
        """, (creator_id,))
        preview_frames = [dict(row) for row in cursor.fetchall()]
    if not preview_frames:
        return 0
    self.log(f"Found {len(preview_frames)} DRM preview frames to check against Coomer", 'info')
    # Get Coomer client
    try:
        coomer_client = self._get_client('coomer')
    except Exception as e:
        self.log(f"Could not initialize Coomer client: {e}", 'warning')
        return 0
    # Group by post_id to avoid duplicate API calls
    posts_by_id = {}
    for frame in preview_frames:
        post_id = str(frame['post_id'])
        if post_id not in posts_by_id:
            posts_by_id[post_id] = []
        posts_by_id[post_id].append(frame)
    upgraded = 0
    checked = 0
    for post_id, frames in posts_by_id.items():
        checked += 1
        if checked % 10 == 0:
            self.log(f"Coomer fallback: checked {checked}/{len(posts_by_id)} posts...", 'info')
        try:
            coomer_post = await coomer_client.get_post(platform, username, post_id)
            if not coomer_post:
                continue
            # Find video attachments in the Coomer post
            video_atts = [
                att for att in coomer_post.attachments
                if att.file_type == 'video' and att.extension in ('mp4', 'mov', 'webm', 'mkv', 'm4v')
            ]
            if not video_atts:
                continue
            # Match preview frames to Coomer videos
            # If there's one video and one frame, it's a direct match
            # If multiple, match by index order
            for i, frame in enumerate(frames):
                if i >= len(video_atts):
                    break
                coomer_att = video_atts[i]
                download_url = coomer_att.download_url
                if not download_url and coomer_att.server_path:
                    download_url = coomer_client.get_attachment_url(coomer_att.server_path)
                if not download_url:
                    continue
                # Delete old preview frame JPG from disk
                if frame['local_path']:
                    old_path = Path(frame['local_path'])
                    if old_path.exists():
                        try:
                            old_path.unlink()
                            self.log(f"Deleted preview frame: {old_path.name}", 'debug')
                        except OSError as e:
                            self.log(f"Could not delete preview frame {old_path}: {e}", 'warning')
                # Update attachment to point to Coomer video.
                # Preserve server_path (/onlyfans/{media_id}) so the OF sync can
                # match and restore the preview frame if the Coomer download fails.
                self.db.update_attachment(frame['id'], {
                    'download_url': download_url,
                    'name': coomer_att.name,
                    'extension': coomer_att.extension or 'mp4',
                    'status': 'pending',
                    'local_path': None,
                    'local_filename': None,
                    'file_hash': None,
                    'perceptual_hash': None,
                    'download_attempts': 0,
                    'error_message': None,
                })
                upgraded += 1
        except Exception as e:
            # Per-post failures are non-fatal; move on to the next post
            self.log(f"Coomer fallback error for post {post_id}: {e}", 'debug')
            continue
    if upgraded > 0:
        self.log(f"Upgraded {upgraded} preview frames to video from Coomer", 'info')
    # Update last check timestamp so we don't re-check for 24h
    self.db.update_creator(creator_id, {
        'last_coomer_check': datetime.now().astimezone().isoformat()
    })
    return upgraded
def _compute_perceptual_hash(self, file_path: Path) -> Optional[str]:
"""
Compute perceptual hash for an image file.
Uses dhash (difference hash) which is effective for detecting
visually similar images even with minor modifications.
Returns hex string or None if not an image or error.
"""
try:
# Only process images
suffix = file_path.suffix.lower()
if suffix not in {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.heic', '.heif', '.avif'}:
return None
try:
import imagehash
from PIL import Image
except ImportError:
self.log("imagehash/PIL not available for perceptual hashing", 'debug')
return None
with Image.open(file_path) as img:
# Convert to RGB if necessary
if img.mode != 'RGB':
img = img.convert('RGB')
# Compute dhash with 16x16 = 256 bits
phash = str(imagehash.dhash(img, hash_size=16))
return phash
except Exception as e:
self.log(f"Failed to compute perceptual hash for {file_path.name}: {e}", 'debug')
return None
def _hamming_distance(self, hash1: str, hash2: str) -> int:
"""Calculate hamming distance between two hex hash strings"""
if not hash1 or not hash2 or len(hash1) != len(hash2):
return 999 # Return high value for invalid comparison
try:
# Convert hex to integers and XOR
h1 = int(hash1, 16)
h2 = int(hash2, 16)
xor = h1 ^ h2
# Count set bits (differences)
return bin(xor).count('1')
except (ValueError, TypeError):
return 999
def _check_perceptual_duplicate(self, file_path: Path, phash: str, att_id: int) -> Optional[Dict]:
"""
Check if file is a perceptual duplicate of existing content.
Returns dict with duplicate info if found, None otherwise.
"""
if not phash:
return None
# Check if perceptual detection is enabled
if not self.config.get('perceptual_duplicate_detection', True):
return None
threshold = self.config.get('perceptual_threshold', 12)
# Get all existing perceptual hashes
existing = self.db.get_attachments_with_phash()
for existing_att in existing:
if existing_att['id'] == att_id:
continue # Skip self
existing_phash = existing_att.get('perceptual_hash')
if not existing_phash:
continue
distance = self._hamming_distance(phash, existing_phash)
if distance <= threshold:
return {
'attachment_id': existing_att['id'],
'local_path': existing_att.get('local_path'),
'distance': distance,
'creator': existing_att.get('username'),
'post_title': existing_att.get('post_title')
}
return None
def _extract_dimensions(self, file_path: Path, file_type: str) -> Tuple[Optional[int], Optional[int], Optional[int]]:
"""
Extract dimensions (width, height) and duration (for videos) from a file.
Returns (width, height, duration) tuple. Duration is in seconds for videos, None for images.
"""
width, height, duration = None, None, None
try:
if file_type == 'image' and HAS_PIL:
with Image.open(file_path) as img:
width, height = img.size
elif file_type in ('video', 'audio'):
# Use ffprobe to get dimensions (video) and duration (video/audio)
cmd = [
'ffprobe', '-v', 'quiet',
'-print_format', 'json',
'-show_streams', '-show_format',
]
if file_type == 'video':
cmd += ['-select_streams', 'v:0']
cmd.append(str(file_path))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode == 0:
import json
data = json.loads(result.stdout)
if file_type == 'video' and data.get('streams'):
stream = data['streams'][0]
width = stream.get('width')
height = stream.get('height')
if data.get('format'):
duration_str = data['format'].get('duration')
if duration_str:
duration = int(float(duration_str))
except Exception as e:
self.log(f"Error extracting dimensions from {file_path.name}: {e}", 'warning')
return width, height, duration
def _generate_thumbnail(self, file_path: Path, file_type: str, max_size: tuple = (300, 300)) -> Optional[bytes]:
    """Produce a JPEG thumbnail for an image or video attachment.

    The on-disk extension overrides the recorded file_type (DRM preview
    frames are JPGs recorded as 'video'), and image files PIL cannot read
    are retried through ffmpeg in case they are really videos with an
    image extension. Returns JPEG bytes, or None on failure.
    """
    if not HAS_PIL:
        return None
    try:
        # Check actual file extension — preview frames are jpg but file_type may say 'video'
        ext = file_path.suffix.lower().lstrip('.')
        if ext in {'jpg', 'jpeg', 'png', 'webp', 'gif', 'bmp', 'tiff'}:
            file_type = 'image'
        if file_type == 'image':
            # Native size not supported for images - fall back to large
            data = self._generate_image_thumbnail(file_path, max_size or (800, 800))
            if data:
                return data
            # PIL failed — file may be a video with image extension (e.g. Instagram stories)
            self.log(f"PIL failed for {file_path.name}, trying ffmpeg (may be video with image extension)", 'debug')
            return self._generate_video_thumbnail(file_path, max_size, seek_time='00:00:01')
        if file_type == 'video':
            # Platform-specific seek times:
            # - Pornhub: 10s to skip intro branding
            # - OnlyFans/Fansly: first frame (no intro logos)
            # - Default: 5s
            location = str(file_path)
            if '/pornhub/' in location:
                seek = '00:00:10'
            elif '/onlyfans/' in location or '/fansly/' in location:
                seek = '00:00:00'
            else:
                seek = None
            return self._generate_video_thumbnail(file_path, max_size, seek_time=seek)
    except Exception as e:
        self.log(f"Error generating thumbnail for {file_path.name}: {e}", 'warning')
        return None
def _generate_image_thumbnail(self, file_path: Path, max_size: tuple = (300, 300)) -> Optional[bytes]:
"""Generate thumbnail for image file."""
try:
with Image.open(file_path) as img:
# Convert to RGB if necessary (handles RGBA, P mode, etc.)
if img.mode in ('RGBA', 'P', 'LA'):
img = img.convert('RGB')
elif img.mode != 'RGB':
img = img.convert('RGB')
img.thumbnail(max_size, Image.Resampling.LANCZOS)
buffer = BytesIO()
img.save(buffer, format='JPEG', quality=85, optimize=True)
return buffer.getvalue()
except Exception as e:
self.log(f"Error generating image thumbnail: {e}", 'warning')
return None
def _generate_video_thumbnail(self, file_path: Path, max_size: tuple = (300, 300), seek_time: str = None) -> Optional[bytes]:
"""Generate thumbnail for video file using ffmpeg.
Args:
seek_time: Override seek position (e.g. '00:00:30'). If None, defaults to 5 seconds
to skip intro logos (e.g. Pornhub/Modelhub branding).
"""
try:
# Create temp file for thumbnail
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
tmp_path = tmp.name
# Use ffmpeg to extract frame at specified time
# -ss before -i for fast input seeking (keyframe-based)
# Default to 5 seconds to skip platform intro logos
# max_size=None means native resolution (no scaling)
scale_args = ['-vf', f'scale={max_size[0]}:{max_size[1]}:force_original_aspect_ratio=decrease'] if max_size else []
cmd = [
'ffmpeg', '-y',
'-ss', seek_time or '00:00:05',
'-i', str(file_path),
'-vframes', '1',
*scale_args,
'-f', 'image2',
tmp_path
]
result = subprocess.run(cmd, capture_output=True, timeout=30)
# Retry without seeking if first attempt failed or produced no output
# (ffmpeg returns 0 even when seeking past end of short videos)
if result.returncode != 0 or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) < 100:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
# Try without seeking (for very short videos)
cmd = [
'ffmpeg', '-y',
'-i', str(file_path),
'-vframes', '1',
*scale_args,
'-f', 'image2',
tmp_path
]
result = subprocess.run(cmd, capture_output=True, timeout=30)
if result.returncode == 0 and os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 100:
with open(tmp_path, 'rb') as f:
thumbnail_data = f.read()
os.unlink(tmp_path)
return thumbnail_data
# Clean up temp file if it exists
if os.path.exists(tmp_path):
os.unlink(tmp_path)
except subprocess.TimeoutExpired:
self.log(f"Timeout generating video thumbnail for {file_path.name}", 'warning')
except Exception as e:
self.log(f"Error generating video thumbnail: {e}", 'warning')
return None
async def _download_youtube_thumbnail(self, video_id: str, max_size: tuple = (300, 300)) -> Optional[bytes]:
    """Fetch the best available YouTube thumbnail for a video.

    Candidate URLs are tried from highest to lowest quality. YouTube
    serves a tiny placeholder for missing sizes, so responses under
    ~1 KB are skipped. The winning image is resized to ``max_size``
    and re-encoded as JPEG bytes; returns None if PIL is unavailable
    or every candidate fails.
    """
    if not HAS_PIL:
        self.log("PIL not available for thumbnail processing", 'warning')
        return None
    # YouTube thumbnail URLs in order of quality preference
    candidates = [
        f"https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg",  # 1280x720
        f"https://i.ytimg.com/vi/{video_id}/sddefault.jpg",  # 640x480
        f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",  # 480x360
        f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg",  # 320x180
    ]
    try:
        async with aiohttp.ClientSession() as session:
            for candidate in candidates:
                try:
                    async with session.get(candidate, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                        if resp.status != 200:
                            continue
                        raw = await resp.read()
                        # YouTube returns a tiny placeholder for missing thumbnails
                        if len(raw) < 1000:
                            continue
                        # Resize to standard thumbnail size
                        with Image.open(BytesIO(raw)) as img:
                            if img.mode != 'RGB':
                                img = img.convert('RGB')
                            img.thumbnail(max_size, Image.Resampling.LANCZOS)
                            buffer = BytesIO()
                            img.save(buffer, format='JPEG', quality=85, optimize=True)
                            return buffer.getvalue()
                except Exception:
                    continue  # try the next quality tier
            self.log(f"Could not download YouTube thumbnail for {video_id}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error downloading YouTube thumbnail: {e}", 'warning')
        return None
def backfill_dimensions(self, batch_size: int = 100, log_callback: Callable = None) -> Dict:
    """
    Scan completed attachments with missing dimensions and extract them from files.

    Processes the backlog in batches of ``batch_size``. Attachments whose
    file is missing or unreadable are counted but not updated in the DB.

    Fix: rows that fail extraction previously stayed "missing dimensions"
    and were returned again by every query, so a batch of permanently-failing
    rows made the ``while True`` loop spin forever. A seen-id guard now
    skips rows already handled this run and terminates when a batch yields
    nothing new.

    Args:
        batch_size: Number of rows fetched per DB query.
        log_callback: Optional logger override for this run.

    Returns:
        Stats dict: total_missing, processed, updated, failed, not_found.
    """
    if log_callback:
        self.log_callback = log_callback
    stats = {
        'total_missing': self.db.count_attachments_missing_dimensions(),
        'processed': 0,
        'updated': 0,
        'failed': 0,
        'not_found': 0
    }
    self.log(f"Starting dimension backfill: {stats['total_missing']} attachments missing dimensions", 'info')
    seen_ids = set()  # guards against re-processing rows that cannot be fixed
    while True:
        batch = self.db.get_attachments_missing_dimensions(limit=batch_size)
        attachments = [a for a in batch if a['id'] not in seen_ids]
        if not attachments:
            break  # nothing new in this batch — done (or only unfixable rows remain)
        for att in attachments:
            seen_ids.add(att['id'])
            stats['processed'] += 1
            local_path = att.get('local_path')
            if not local_path:
                stats['not_found'] += 1
                continue
            file_path = Path(local_path)
            if not file_path.exists():
                stats['not_found'] += 1
                self.log(f"File not found: {local_path}", 'debug')
                continue
            try:
                width, height, duration = self._extract_dimensions(file_path, att.get('file_type', ''))
                if width and height:
                    updates = {'width': width, 'height': height}
                    if duration:
                        updates['duration'] = duration
                    self.db.update_attachment(att['id'], updates)
                    stats['updated'] += 1
                    self.log(f"Updated dimensions for attachment {att['id']}: {width}x{height}", 'debug')
                else:
                    stats['failed'] += 1
                    self.log(f"Could not extract dimensions from {file_path.name}", 'debug')
            except Exception as e:
                stats['failed'] += 1
                self.log(f"Error processing attachment {att['id']}: {e}", 'warning')
        # Log progress
        remaining = stats['total_missing'] - stats['processed']
        self.log(f"Progress: {stats['processed']}/{stats['total_missing']} processed, {stats['updated']} updated, {remaining} remaining", 'info')
    self.log(f"Dimension backfill complete: {stats['updated']} updated, {stats['failed']} failed, {stats['not_found']} files not found", 'info')
    return stats
async def backfill_truncated_content(self, batch_size: int = 50, creator_id: int = None) -> Dict:
"""
Find posts with truncated content (ending with '..') from Coomer/Kemono
and re-fetch full content from the individual post endpoint.
Args:
batch_size: Number of posts to process per batch
creator_id: Optional - limit to a specific creator
Returns:
Stats dict with counts
"""
stats = {
'total_truncated': 0,
'updated': 0,
'failed': 0,
'skipped': 0,
}
# Find posts with truncated or missing content
# Coomer OnlyFans posts store the truncated text in 'title' and have empty 'content'
# Also catch posts ending with '..' (truncated substring)
with self.db.unified_db.get_connection() as conn:
cursor = conn.cursor()
query = """
SELECT p.id, p.post_id, p.content, p.title, p.creator_id,
c.platform, c.creator_id as api_creator_id, c.service_id, c.username
FROM paid_content_posts p
JOIN paid_content_creators c ON p.creator_id = c.id
WHERE c.service_id IN ('coomer', 'kemono')
AND (
(p.content IS NULL OR p.content = '') AND p.title IS NOT NULL AND p.title != ''
OR p.content LIKE '%..'
OR p.title LIKE '%..'
)
"""
params = []
if creator_id:
query += " AND p.creator_id = ?"
params.append(creator_id)
query += " ORDER BY p.id"
cursor.execute(query, params)
truncated_posts = [dict(row) for row in cursor.fetchall()]
stats['total_truncated'] = len(truncated_posts)
self.log(f"Found {len(truncated_posts)} posts with truncated content", 'info')
if not truncated_posts:
return stats
# Group by service_id to use the right client
from collections import defaultdict
by_service = defaultdict(list)
for post in truncated_posts:
by_service[post['service_id']].append(post)
for service_id, posts in by_service.items():
try:
client = self._get_client(service_id)
except Exception as e:
self.log(f"Could not get client for {service_id}: {e}", 'error')
stats['failed'] += len(posts)
continue
for i, post in enumerate(posts):
try:
full_post = await client.get_post(
post['platform'],
post['api_creator_id'],
post['post_id']
)
new_content = None
if full_post:
# Use full content from API if available
new_content = full_post.content
# If API content is also empty, try the full post title
if not new_content and full_post.title:
new_content = full_post.title
# If API didn't help, at least copy title to content locally
if not new_content and post.get('title'):
new_content = post['title']
if new_content and new_content != (post.get('content') or ''):
updates = {'content': new_content}
# Clear the title if it was just the truncated content
# (OnlyFans posts don't have real titles)
if post.get('title') and new_content.startswith(post['title'].rstrip('.')):
updates['title'] = None
self.db.update_post(post['id'], updates)
stats['updated'] += 1
self.log(
f"Updated content for post {post['post_id']} ({post['username']})",
'debug'
)
else:
stats['skipped'] += 1
except Exception as e:
stats['failed'] += 1
self.log(f"Error fetching full content for post {post['post_id']}: {e}", 'warning')
# Progress logging
if (i + 1) % 25 == 0:
self.log(
f"Content backfill progress: {i + 1}/{len(posts)} for {service_id} "
f"({stats['updated']} updated, {stats['failed']} failed)",
'info'
)
self.log(
f"Content backfill complete: {stats['updated']} updated, "
f"{stats['failed']} failed, {stats['skipped']} skipped "
f"(of {stats['total_truncated']} truncated)",
'info'
)
return stats