9844 lines
451 KiB
Python
9844 lines
451 KiB
Python
"""
|
|
Main scraper orchestrating content download from Coomer/Kemono
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import re
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from datetime import datetime
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
from typing import Callable, Dict, List, Optional, Tuple
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
import aiofiles
|
|
|
|
try:
|
|
from PIL import Image
|
|
HAS_PIL = True
|
|
except ImportError:
|
|
HAS_PIL = False
|
|
|
|
from modules.base_module import LoggingMixin, DeferredDownloadsMixin
|
|
from modules.activity_status import get_activity_manager
|
|
from .api_client import PaidContentAPIClient
|
|
from .db_adapter import PaidContentDBAdapter
|
|
from .youtube_client import YouTubeClient
|
|
from .twitch_client import TwitchClient
|
|
from .fansly_direct_client import FanslyDirectClient
|
|
from .onlyfans_client import OnlyFansClient
|
|
from .pornhub_client import PornhubClient
|
|
from .xhamster_client import XHamsterClient
|
|
from .tiktok_client import TikTokClient
|
|
from .instagram_adapter import InstagramAdapter
|
|
from .soundgasm_client import SoundgasmClient, format_tag_display
|
|
from .bellazon_client import BellazonClient
|
|
from .besteyecandy_client import BestEyeCandyClient
|
|
from .snapchat_client import SnapchatPaidContentClient
|
|
from .reddit_client import RedditClient
|
|
from .xenforo_forum_client import XenForoForumClient
|
|
from .coppermine_client import CoppermineClient
|
|
from .models import Post, Attachment, SyncResult, DownloadResult
|
|
|
|
|
|
class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
|
|
"""
|
|
Main scraper for Paid Content feature
|
|
|
|
Responsibilities:
|
|
- Sync creators (fetch new posts)
|
|
- Download attachments
|
|
- Download embedded videos
|
|
- File organization
|
|
- Duplicate detection
|
|
- Progress reporting
|
|
"""
|
|
|
|
# Regex patterns for extracting embedded URLs from post content.
# Each entry is a (pattern, platform_name) pair; every pattern has exactly one
# capture group (presumably the platform's video ID - confirm against
# _extract_embeds, which is not visible here).
EMBED_PATTERNS = [
    (r'https?://(?:www\.)?youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})', 'youtube'),
    (r'https?://youtu\.be/([a-zA-Z0-9_-]{11})', 'youtube'),
    (r'https?://(?:www\.)?vimeo\.com/(\d+)', 'vimeo'),
    (r'https?://(?:www\.)?dailymotion\.com/video/([a-zA-Z0-9]+)', 'dailymotion'),
    (r'https?://(?:www\.)?twitch\.tv/videos/(\d+)', 'twitch'),
]

# XenForo-based forum configs (service_id → settings).
# The keys double as the set of service IDs that sync_creator routes to the
# XenForo sync path; 'base_url' and 'cookie_path' are handed to
# XenForoForumClient by _get_xenforo_client.
XENFORO_FORUMS = {
    'hqcelebcorner': {
        'base_url': 'https://www.hqcelebcorner.net',
        'cookie_path': '/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
    },
    'picturepub': {
        'base_url': 'https://picturepub.net',
        'cookie_path': '/opt/media-downloader/cookies/forum_cookies_PicturePub.json',
    },
}
|
|
|
|
def __init__(self, unified_db, event_emitter=None, notifier=None, log_callback=None, websocket_manager=None, app_state=None):
    """Store collaborators, create empty client slots, and load settings.

    Args:
        unified_db: Database handle; wrapped in a PaidContentDBAdapter and
            also passed to the activity manager.
        event_emitter: Optional event emitter, stored as-is.
        notifier: Optional notifier, stored as-is.
        log_callback: Optional callback handed to the logging mixin.
        websocket_manager: Optional manager used by _emit_event.
        app_state: Optional shared state object; receives an
            ``active_paid_content_syncs`` dict if it lacks one.
    """
    # Logger and deferred-download mixin state are initialised first.
    self._init_logger('PaidContent', log_callback, default_module='Scraper')
    self._init_deferred_downloads()

    # --- Injected collaborators -------------------------------------------
    self.unified_db = unified_db
    self.db = PaidContentDBAdapter(unified_db)
    self.event_emitter = event_emitter
    self.notifier = notifier
    self.websocket_manager = websocket_manager
    self.app_state = app_state

    # --- Lazily created clients (all start out empty) ---------------------
    # Coomer/Kemono-style API clients, keyed by service id
    self._clients: Dict[str, PaidContentAPIClient] = {}
    # XenForo forum clients (hqcelebcorner, picturepub, etc.), keyed by service id
    self._xenforo_clients: Dict[str, XenForoForumClient] = {}

    # yt-dlp backed clients
    self._youtube_client: Optional[YouTubeClient] = None
    self._twitch_client: Optional[TwitchClient] = None
    self._pornhub_client: Optional[PornhubClient] = None
    self._xhamster_client: Optional[XHamsterClient] = None
    # TikTok uses yt-dlp + gallery-dl
    self._tiktok_client: Optional[TikTokClient] = None
    # Reddit uses gallery-dl
    self._reddit_client: Optional[RedditClient] = None

    # Direct platform clients
    self._fansly_direct_client = None
    self._onlyfans_direct_client = None

    # Remaining site-specific clients
    self._soundgasm_client: Optional[SoundgasmClient] = None  # Soundgasm + Liltsome
    self._bellazon_client: Optional[BellazonClient] = None
    self._besteyecandy_client: Optional[BestEyeCandyClient] = None
    self._snapchat_client: Optional[SnapchatPaidContentClient] = None
    self._coppermine_client: Optional[CoppermineClient] = None

    # --- Persisted configuration -------------------------------------------
    self.config = self.db.get_config()
    self.max_concurrent_downloads = self.config.get('max_concurrent_downloads', 3)
    self.log(f"Max concurrent downloads: {self.max_concurrent_downloads}", 'info')

    # Database-backed progress tracking; works across processes
    # (scheduler, API, etc.)
    self.activity_manager = get_activity_manager(unified_db)

    # app_state is kept for backwards compatibility; activity_manager is preferred.
    if self.app_state:
        if not hasattr(self.app_state, 'active_paid_content_syncs'):
            self.app_state.active_paid_content_syncs = {}

    self.log(f"Scraper initialized", 'info')
|
|
|
|
def _is_creator_syncing(self, creator_id: int) -> bool:
|
|
"""Check if a creator already has an active sync running."""
|
|
task_id = f"paid_content_sync_{creator_id}"
|
|
try:
|
|
all_tasks = self.activity_manager.get_active_background_tasks()
|
|
return any(
|
|
t.get('task_id') == task_id and t.get('task_type') == 'paid_content_sync'
|
|
for t in all_tasks
|
|
)
|
|
except Exception:
|
|
return False
|
|
|
|
def _register_active_sync(self, creator_id: int, data: Dict):
|
|
"""Register an active sync task for polling-based status updates.
|
|
|
|
Uses activity_manager for database-backed tracking that works across processes.
|
|
"""
|
|
task_id = f"paid_content_sync_{creator_id}"
|
|
creator_name = data.get('username') or data.get('creator', 'Unknown')
|
|
self.log(f"Registering active sync for creator {creator_id} ({creator_name})", 'info')
|
|
|
|
# Use activity_manager for database-backed tracking (works across processes)
|
|
self.activity_manager.start_background_task(
|
|
task_id=task_id,
|
|
task_type="paid_content_sync",
|
|
display_name=f"Sync: {creator_name}",
|
|
status="Starting",
|
|
extra_data={
|
|
**data,
|
|
'creator_id': creator_id,
|
|
'started_at': datetime.now().isoformat()
|
|
}
|
|
)
|
|
|
|
# Also update app_state for backwards compatibility (API process only)
|
|
if self.app_state:
|
|
if not hasattr(self.app_state, 'active_paid_content_syncs'):
|
|
self.app_state.active_paid_content_syncs = {}
|
|
self.app_state.active_paid_content_syncs[creator_id] = {
|
|
**data,
|
|
'creator_id': creator_id,
|
|
'started_at': datetime.now().isoformat()
|
|
}
|
|
|
|
self.log(f"Registered sync for creator {creator_id}", 'info')
|
|
|
|
def _is_permanent_error(self, error: str) -> bool:
|
|
"""
|
|
Check if an error is permanent (should not be retried) vs retriable.
|
|
|
|
Permanent errors (don't retry):
|
|
- HTTP 500, 502, 503 server errors
|
|
- HTTP 404 not found
|
|
- HTTP 403 forbidden
|
|
- HTTP 410 gone
|
|
|
|
Retriable errors (auto-retry later):
|
|
- Timeouts / stalls
|
|
- Partial downloads
|
|
- Connection resets
|
|
- Network errors
|
|
"""
|
|
error_lower = error.lower()
|
|
|
|
# Permanent HTTP errors
|
|
permanent_patterns = [
|
|
'http 500', 'http 502', 'http 503', 'http 504',
|
|
'http 404', 'http 403', 'http 410',
|
|
'500 internal', '502 bad gateway', '503 service',
|
|
'404 not found', '403 forbidden', '410 gone',
|
|
'no video url', 'invalid url'
|
|
]
|
|
|
|
for pattern in permanent_patterns:
|
|
if pattern in error_lower:
|
|
return True
|
|
|
|
return False
|
|
|
|
def _update_active_sync(self, creator_id: int, updates: Dict):
|
|
"""Update an active sync task's status.
|
|
|
|
Uses activity_manager for database-backed tracking that works across processes.
|
|
"""
|
|
task_id = f"paid_content_sync_{creator_id}"
|
|
|
|
# Build status message from updates
|
|
status = updates.get('status', 'Running')
|
|
phase = updates.get('phase', '')
|
|
progress_current = updates.get('progress')
|
|
progress_total = updates.get('total_files')
|
|
|
|
# Merge updates into existing extra_data to preserve persistent fields
|
|
# (username, platform, service, etc. set during _register_active_sync)
|
|
existing_extra = {}
|
|
try:
|
|
tasks = self.activity_manager.get_active_background_tasks()
|
|
for t in tasks:
|
|
if t.get('task_id') == task_id:
|
|
existing_extra = t.get('extra_data', {}) or {}
|
|
break
|
|
except Exception:
|
|
pass
|
|
|
|
merged_extra = {**existing_extra, **updates, 'updated_at': datetime.now().isoformat()}
|
|
|
|
# Update activity_manager (database-backed, works across processes)
|
|
self.activity_manager.update_background_task(
|
|
task_id=task_id,
|
|
detailed_status=status,
|
|
progress_current=progress_current,
|
|
progress_total=progress_total,
|
|
extra_data=merged_extra
|
|
)
|
|
|
|
# Also update app_state for backwards compatibility (API process only)
|
|
if self.app_state and hasattr(self.app_state, 'active_paid_content_syncs'):
|
|
if creator_id in self.app_state.active_paid_content_syncs:
|
|
self.app_state.active_paid_content_syncs[creator_id].update(updates)
|
|
self.app_state.active_paid_content_syncs[creator_id]['updated_at'] = datetime.now().isoformat()
|
|
|
|
def _unregister_active_sync(self, creator_id: int):
|
|
"""Remove an active sync task when complete.
|
|
|
|
Uses activity_manager for database-backed tracking that works across processes.
|
|
"""
|
|
task_id = f"paid_content_sync_{creator_id}"
|
|
self.log(f"Unregistering active sync for creator {creator_id}", 'info')
|
|
|
|
# Stop in activity_manager (database-backed)
|
|
self.activity_manager.stop_background_task(task_id)
|
|
|
|
# Also clear from app_state for backwards compatibility
|
|
if self.app_state and hasattr(self.app_state, 'active_paid_content_syncs'):
|
|
self.app_state.active_paid_content_syncs.pop(creator_id, None)
|
|
|
|
self.log(f"Unregistered sync for creator {creator_id}", 'info')
|
|
|
|
def _update_download_status(self, creator_id: int, total_files: int):
|
|
"""Update the sync status with current download progress"""
|
|
active_count = len(self._active_downloads)
|
|
|
|
if active_count == 0:
|
|
status = f"Downloaded {self._download_progress['completed']}/{total_files} files"
|
|
elif active_count == 1:
|
|
# Single file - show detailed progress
|
|
dl = list(self._active_downloads.values())[0]
|
|
progress_str = self._format_bytes(dl['progress'])
|
|
if dl['size']:
|
|
pct = int(dl['progress'] / dl['size'] * 100)
|
|
total_str = self._format_bytes(dl['size'])
|
|
status = f"Downloading: {dl['name'][:40]} ({progress_str}/{total_str} - {pct}%)"
|
|
else:
|
|
status = f"Downloading: {dl['name'][:40]} ({progress_str})"
|
|
else:
|
|
# Multiple concurrent downloads - show summary
|
|
total_progress = sum(d['progress'] for d in self._active_downloads.values())
|
|
total_size = sum(d['size'] or 0 for d in self._active_downloads.values())
|
|
progress_str = self._format_bytes(total_progress)
|
|
|
|
if total_size > 0:
|
|
total_str = self._format_bytes(total_size)
|
|
status = f"Downloading {active_count} files: {progress_str}/{total_str}"
|
|
else:
|
|
status = f"Downloading {active_count} files: {progress_str}"
|
|
|
|
# Build list of active downloads for UI
|
|
active_list = [
|
|
{
|
|
'name': d['name'],
|
|
'size': d['size'],
|
|
'progress': d['progress']
|
|
}
|
|
for d in self._active_downloads.values()
|
|
]
|
|
|
|
self._update_active_sync(creator_id, {
|
|
'phase': 'downloading',
|
|
'status': status,
|
|
'active_downloads': active_list,
|
|
'active_count': active_count,
|
|
'downloaded': self._download_progress['success'],
|
|
'failed': self._download_progress['failed'],
|
|
'total_files': total_files,
|
|
'progress': self._download_progress['completed']
|
|
})
|
|
|
|
def _emit_event(self, event_type: str, data: Dict):
|
|
"""Emit WebSocket event for real-time updates.
|
|
|
|
When websocket_manager is not available (e.g., running from scheduler),
|
|
status updates are still available via polling /dashboard/active-syncs
|
|
which uses activity_manager for database-backed cross-process tracking.
|
|
"""
|
|
if self.websocket_manager:
|
|
try:
|
|
self.log(f"Emitting WebSocket event: {event_type}", 'debug')
|
|
self.websocket_manager.broadcast_sync({
|
|
'type': event_type,
|
|
'data': {
|
|
**data,
|
|
'timestamp': datetime.now().isoformat()
|
|
}
|
|
})
|
|
except Exception as e:
|
|
self.log(f"Failed to emit event {event_type}: {e}", 'warning')
|
|
# Note: When websocket is unavailable, status updates are still tracked
|
|
# via activity_manager and available through polling endpoints
|
|
|
|
def _get_client(self, service_id: str) -> PaidContentAPIClient:
    """Return the cached API client for a service, building it on first use.

    The service row is read from the DB adapter to obtain the session cookie
    and base URL; both are None when the service is unknown.
    """
    if service_id in self._clients:
        return self._clients[service_id]

    service = self.db.get_service(service_id)
    if service:
        session_cookie = service.get('session_cookie')
        base_url = service.get('base_url')
    else:
        session_cookie = None
        base_url = None

    self.log(f"Creating API client for {service_id}: base_url={base_url}, has_cookie={session_cookie is not None}", 'debug')

    new_client = PaidContentAPIClient(
        service_id,
        session_cookie=session_cookie,
        base_url=base_url,
        log_callback=self.log_callback,
    )
    self._clients[service_id] = new_client
    return self._clients[service_id]
|
|
|
|
def _get_youtube_client(self) -> YouTubeClient:
    """Return the shared YouTube client, creating it on first call.

    On creation, the YouTube Data API key is looked up in the ``ytdlp``
    scraper's ``settings_json``; any failure there is logged at debug level
    and the client is built without a key. A warning is logged when the
    new client reports it is not available.
    """
    if self._youtube_client is not None:
        return self._youtube_client

    # Best-effort lookup of the optional API key
    api_key = None
    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT settings_json FROM scrapers WHERE id = ?", ('ytdlp',))
            row = cursor.fetchone()
            raw_settings = row[0] if row else None
            if raw_settings:
                api_key = json.loads(raw_settings).get('youtube_api_key') or None
    except Exception as e:
        self.log(f"Could not load YouTube API key from settings: {e}", 'debug')

    self._youtube_client = YouTubeClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
        api_key=api_key,
    )
    if not self._youtube_client.is_available():
        self.log("yt-dlp not found, YouTube support will be disabled", 'warning')
    return self._youtube_client
|
|
|
|
def _get_twitch_client(self) -> TwitchClient:
    """Return the shared Twitch client, creating it on first call.

    Logs a warning when the freshly created client reports it is unavailable.
    """
    if self._twitch_client is not None:
        return self._twitch_client

    self._twitch_client = TwitchClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    if not self._twitch_client.is_available():
        self.log("yt-dlp not found, Twitch support will be disabled", 'warning')
    return self._twitch_client
|
|
|
|
def _get_pornhub_client(self) -> PornhubClient:
    """Return the shared Pornhub client, creating it on first call.

    Logs a warning when the freshly created client reports it is unavailable.
    """
    if self._pornhub_client is not None:
        return self._pornhub_client

    self._pornhub_client = PornhubClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    if not self._pornhub_client.is_available():
        self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')
    return self._pornhub_client
|
|
|
|
def _get_xhamster_client(self) -> XHamsterClient:
    """Return the shared XHamster client, creating it on first call.

    Logs a warning when the freshly created client reports it is unavailable.
    """
    if self._xhamster_client is not None:
        return self._xhamster_client

    self._xhamster_client = XHamsterClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    if not self._xhamster_client.is_available():
        self.log("yt-dlp not found, XHamster support will be disabled", 'warning')
    return self._xhamster_client
|
|
|
|
def _get_tiktok_client(self) -> TikTokClient:
    """Return the shared TikTok client, creating it on first call.

    Logs a warning when the freshly created client reports it is unavailable.
    """
    if self._tiktok_client is not None:
        return self._tiktok_client

    self._tiktok_client = TikTokClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    if not self._tiktok_client.is_available():
        self.log("yt-dlp/gallery-dl not found, TikTok support will be disabled", 'warning')
    return self._tiktok_client
|
|
|
|
def _get_soundgasm_client(self) -> SoundgasmClient:
    """Return the shared Soundgasm + Liltsome client, creating it on first call."""
    if self._soundgasm_client is not None:
        return self._soundgasm_client
    self._soundgasm_client = SoundgasmClient(log_callback=self.log_callback)
    return self._soundgasm_client
|
|
|
|
def _get_bellazon_client(self) -> BellazonClient:
    """Return the shared Bellazon forum client, creating it on first call."""
    if self._bellazon_client is not None:
        return self._bellazon_client
    self._bellazon_client = BellazonClient(log_callback=self.log_callback)
    return self._bellazon_client
|
|
|
|
def _get_besteyecandy_client(self) -> BestEyeCandyClient:
    """Return the shared BestEyeCandy client, creating it on first call."""
    if self._besteyecandy_client is not None:
        return self._besteyecandy_client
    self._besteyecandy_client = BestEyeCandyClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    return self._besteyecandy_client
|
|
|
|
def _get_snapchat_client(self) -> SnapchatPaidContentClient:
    """Return the shared Snapchat client, creating it on first call."""
    if self._snapchat_client is not None:
        return self._snapchat_client
    self._snapchat_client = SnapchatPaidContentClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    return self._snapchat_client
|
|
|
|
def _get_reddit_client(self) -> RedditClient:
    """Return the shared Reddit client, creating it on first call."""
    if self._reddit_client is not None:
        return self._reddit_client
    self._reddit_client = RedditClient(
        unified_db=self.unified_db,
        log_callback=self.log_callback,
    )
    return self._reddit_client
|
|
|
|
def _get_xenforo_client(self, service_id: str) -> XenForoForumClient:
    """Return the cached XenForo client for a forum, building it on first use.

    The forum's base URL and cookie path come from ``XENFORO_FORUMS``; a
    service_id missing from that mapping raises KeyError.
    """
    if service_id in self._xenforo_clients:
        return self._xenforo_clients[service_id]

    forum = self.XENFORO_FORUMS[service_id]
    forum_client = XenForoForumClient(
        service_id=service_id,
        base_url=forum['base_url'],
        cookie_path=forum['cookie_path'],
        log_callback=self.log_callback,
    )
    self._xenforo_clients[service_id] = forum_client
    return self._xenforo_clients[service_id]
|
|
|
|
def _get_coppermine_client(self) -> CoppermineClient:
    """Return the shared Coppermine gallery client, creating it on first call."""
    if self._coppermine_client is not None:
        return self._coppermine_client
    self._coppermine_client = CoppermineClient(log_callback=self.log_callback)
    return self._coppermine_client
|
|
|
|
async def close(self):
    """Close every cached API client and run cleanup on the yt-dlp style clients.

    API clients in ``self._clients`` are awaited one after another and the
    cache is then emptied. The YouTube, Twitch, Pornhub, XHamster and TikTok
    clients get ``cleanup()`` called, in that order, if they were created.
    """
    for api_client in self._clients.values():
        await api_client.close()
    self._clients.clear()

    # Clients that expose cleanup() (e.g. for temp files), in fixed order
    cleanup_candidates = (
        self._youtube_client,
        self._twitch_client,
        self._pornhub_client,
        self._xhamster_client,
        self._tiktok_client,
    )
    for helper in cleanup_candidates:
        if helper:
            helper.cleanup()
|
|
|
|
async def _cache_profile_image(self, url: str, platform: str, creator_id: str, image_type: str) -> Optional[str]:
|
|
"""Download a profile image and cache it locally.
|
|
|
|
Args:
|
|
url: Remote image URL to download
|
|
platform: Platform name (instagram, onlyfans, etc.)
|
|
creator_id: Creator's ID on the platform
|
|
image_type: 'avatar' or 'banner'
|
|
|
|
Returns:
|
|
Local serving URL like /api/paid-content/cache/profile-image/instagram_username_avatar.jpg
|
|
or None on failure
|
|
"""
|
|
if not url:
|
|
return None
|
|
|
|
cache_dir = Path(__file__).parent.parent.parent / 'data' / 'cache' / 'profile_images'
|
|
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Determine extension from URL
|
|
ext = '.jpg' # default
|
|
parsed_path = urlparse(url).path.lower()
|
|
if '.png' in parsed_path:
|
|
ext = '.png'
|
|
elif '.webp' in parsed_path:
|
|
ext = '.webp'
|
|
elif '.gif' in parsed_path:
|
|
ext = '.gif'
|
|
|
|
# Sanitize creator_id for filename
|
|
safe_id = re.sub(r'[^\w.-]', '_', str(creator_id))
|
|
filename = f"{platform}_{safe_id}_{image_type}{ext}"
|
|
filepath = cache_dir / filename
|
|
|
|
try:
|
|
# Instagram needs curl_cffi due to CDN restrictions
|
|
if platform == 'instagram':
|
|
downloaded = await asyncio.to_thread(
|
|
self._download_with_instagram_session, url, filepath
|
|
)
|
|
if downloaded:
|
|
return f"/api/paid-content/cache/profile-image/{filename}"
|
|
return None
|
|
|
|
# All other platforms: use aiohttp
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
|
|
}
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as resp:
|
|
if resp.status == 200:
|
|
data = await resp.read()
|
|
filepath.write_bytes(data)
|
|
return f"/api/paid-content/cache/profile-image/{filename}"
|
|
except Exception as e:
|
|
self.log(f"Failed to cache {image_type} for {platform}/{creator_id}: {e}", 'debug')
|
|
|
|
return None
|
|
|
|
def _download_with_instagram_session(self, url: str, filepath: Path) -> bool:
    """Synchronously download an image through the ImgInn session (meant for a worker thread).

    Returns True only when the response status is 200 and the bytes were
    written to ``filepath``; any other status or exception is logged at
    debug level and yields False.
    """
    try:
        adapter = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
        ig_client = adapter._get_client()
        resp = ig_client.session.get(url, headers=ig_client._default_headers, timeout=30)
        if resp.status_code != 200:
            self.log(f"Instagram avatar download HTTP {resp.status_code}", 'debug')
            return False
        filepath.write_bytes(resp.content)
        return True
    except Exception as e:
        self.log(f"Instagram session download failed: {e}", 'debug')
    return False
|
|
|
|
async def sync_creator(self, creator_id: int, download: bool = True, scheduled: bool = False,
                       force_backfill: bool = False) -> SyncResult:
    """Sync a single creator - fetch new posts and optionally queue downloads

    Creators whose service has a dedicated handler (YouTube, Twitch, Fansly
    Direct, OnlyFans Direct, Pornhub, XHamster, TikTok, Instagram, Soundgasm,
    BestEyeCandy, Bellazon, Snapchat, Reddit, Coppermine, XenForo forums)
    are delegated to the matching ``_sync_*_creator`` coroutine. Every other
    service goes through the generic API-client path implemented below.

    Args:
        creator_id: The creator's database ID
        download: Whether to download files after syncing
        scheduled: If True, create notifications (for scheduled syncs only)
        force_backfill: If True, run Pullpush historical backfill even if not first sync.
            Forwarded only to the Reddit and Instagram handlers.

    Returns:
        SyncResult. ``success=False`` with an ``error`` message when the
        creator does not exist or the generic path raises; otherwise the
        counts of new posts/attachments and downloaded/failed files.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return SyncResult(success=False, error="Creator not found")

    # ------------------------------------------------------------------
    # Service-specific dispatch: each branch returns immediately, so the
    # generic path further down is never reached for these services.
    # ------------------------------------------------------------------

    # Handle YouTube channels separately
    if creator['service_id'] == 'youtube':
        return await self._sync_youtube_creator(creator, download, scheduled=scheduled)

    # Handle Twitch channels separately
    if creator['service_id'] == 'twitch':
        return await self._sync_twitch_creator(creator, download, scheduled=scheduled)

    # Handle Fansly Direct separately
    if creator['service_id'] == 'fansly_direct':
        return await self._sync_fansly_direct_creator(creator, download, scheduled=scheduled)

    # Handle OnlyFans Direct separately
    if creator['service_id'] == 'onlyfans_direct':
        return await self._sync_onlyfans_direct_creator(creator, download, scheduled=scheduled)

    # Handle Pornhub creators
    if creator['service_id'] == 'pornhub':
        return await self._sync_pornhub_creator(creator, download, scheduled=scheduled)

    # Handle XHamster creators
    if creator['service_id'] == 'xhamster':
        return await self._sync_xhamster_creator(creator, download, scheduled=scheduled)

    # Handle TikTok creators
    if creator['service_id'] == 'tiktok':
        return await self._sync_tiktok_creator(creator, download, scheduled=scheduled)

    # Handle Instagram creators (force_backfill is forwarded here as well)
    if creator['service_id'] == 'instagram':
        return await self._sync_instagram_creator(creator, download, scheduled=scheduled, force_backfill=force_backfill)

    # Handle Soundgasm creators
    if creator['service_id'] == 'soundgasm':
        return await self._sync_soundgasm_creator(creator, download, scheduled=scheduled)

    # Handle BestEyeCandy creators
    if creator['service_id'] == 'besteyecandy':
        return await self._sync_besteyecandy_creator(creator, download, scheduled=scheduled)

    # Handle Bellazon creators
    if creator['service_id'] == 'bellazon':
        return await self._sync_bellazon_creator(creator, download, scheduled=scheduled)

    # Handle Snapchat creators
    if creator['service_id'] == 'snapchat':
        return await self._sync_snapchat_creator(creator, download, scheduled=scheduled)

    # Handle Reddit subreddits
    if creator['service_id'] == 'reddit':
        return await self._sync_reddit_creator(creator, download, scheduled=scheduled,
                                               force_backfill=force_backfill)

    # Handle Coppermine gallery creators
    if creator['service_id'] == 'coppermine':
        return await self._sync_coppermine_creator(creator, download, scheduled=scheduled)

    # Handle XenForo forum creators (HQCelebCorner, PicturePub, etc.)
    if creator['service_id'] in self.XENFORO_FORUMS:
        return await self._sync_xenforo_creator(creator, download, scheduled=scheduled)

    # ------------------------------------------------------------------
    # Generic path: services served by PaidContentAPIClient (_get_client)
    # ------------------------------------------------------------------
    self.log(f"Syncing creator: {creator['username']} ({creator['platform']})", 'info')

    # Register active sync for polling-based updates
    sync_data = {
        'username': creator['username'],
        'platform': creator['platform'],
        'service': creator['service_id'],
        'status': 'Fetching posts...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)

    # Also emit WebSocket event (for clients that support it)
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data
    })

    # Everything from here on is inside one try block so that any failure
    # unregisters the sync and is reported as SyncResult(success=False).
    try:
        client = self._get_client(creator['service_id'])

        # Fetch and update creator profile (display name, avatar, banner).
        # A failure here is only logged; the post sync still proceeds.
        try:
            creator_info = await client.get_creator(creator['platform'], creator['creator_id'])
            if creator_info:
                profile_updates = {}
                if creator_info.display_name:
                    profile_updates['display_name'] = creator_info.display_name
                if creator_info.profile_image_url:
                    # Prefer the locally cached copy; fall back to the remote URL
                    cached = await self._cache_profile_image(creator_info.profile_image_url, creator['platform'], creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or creator_info.profile_image_url
                if creator_info.banner_image_url:
                    cached = await self._cache_profile_image(creator_info.banner_image_url, creator['platform'], creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or creator_info.banner_image_url
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update creator profile: {e}", 'warning')

        # Fetch posts since last check with progress callback
        since_date = creator.get('last_post_date')

        def progress_callback(page: int, total_posts: int):
            # Update polling-based status
            self._update_active_sync(creator_id, {
                'status': f'Fetched {total_posts} posts (page {page})...',
                'posts_fetched': total_posts,
                'page': page
            })
            # Also emit WebSocket event
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {total_posts} posts (page {page})...',
                'phase': 'fetching',
                'posts_fetched': total_posts,
                'page': page
            })

        posts = await client.get_all_creator_posts(
            creator['platform'],
            creator['creator_id'],
            since_date=since_date,
            progress_callback=progress_callback
        )

        # Early exit: nothing new was fetched. last_checked is still bumped
        # and previously pending attachments may still be downloaded.
        if not posts:
            self.log(f"No new posts for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})

            # Still download any pending attachments even if no new posts
            downloaded = 0
            failed = 0
            if download and creator.get('auto_download', True):
                pending_count = self.db.get_pending_attachment_count(creator_id)
                if pending_count > 0:
                    self.log(f"Downloading {pending_count} pending attachments for {creator['username']}", 'info')
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {pending_count} pending files...',
                        'phase': 'downloading',
                        'total_files': pending_count
                    })
                    result = await self.download_pending_for_creator(creator_id)
                    downloaded = result.get('downloaded', 0)
                    failed = result.get('failed', 0)

            # Unregister from active syncs
            self._unregister_active_sync(creator_id)
            # Emit completed event
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': downloaded,
                'failed': failed
            })
            # NOTE(review): unlike the main path below, this branch neither
            # sends a creator notification nor returns downloaded_file_info -
            # confirm that is intentional.
            return SyncResult(success=True, new_posts=0, new_attachments=0, downloaded_files=downloaded, failed_files=failed)

        self.log(f"Found {len(posts)} new posts for {creator['username']}", 'info')

        # Update polling status and emit processing event
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing',
            'total_posts': len(posts)
        })
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id,
            'username': creator['username'],
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing',
            'total_posts': len(posts)
        })

        new_posts = 0
        new_attachments = 0

        for i, post in enumerate(posts):
            # Fetch full post content (list endpoint only returns truncated 'substring')
            full_post = await client.get_post(creator['platform'], creator['creator_id'], post.post_id)
            if full_post:
                # Use full content from individual post endpoint
                post.content = full_post.content
                # Also update attachments if the full post has more details
                if full_post.attachments:
                    post.attachments = full_post.attachments

            # Update progress (only every 10th post)
            if (i + 1) % 10 == 0:
                self._update_active_sync(creator_id, {
                    'status': f'Processing post {i + 1}/{len(posts)}...',
                    'phase': 'processing'
                })

            # Insert/update post in database
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1

                # Insert attachments; the position within the post is stored
                # as attachment_index, and a truthy upsert result is counted
                # as a new attachment
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1

                # Extract and store embedded URLs
                embeds = self._extract_embeds(post.content)
                for embed_url, platform, video_id in embeds:
                    self.db.upsert_embed(post_db_id, {
                        'url': embed_url,
                        'platform': platform,
                        'video_id': video_id
                    })

                self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Update creator stats - find the actual newest post date (posts may not be sorted by date)
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            # Keep the previous value when no fetched post carries a date
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        # Download if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self.download_pending_for_creator(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

        # Unregister from active syncs
        self._unregister_active_sync(creator_id)

        # Emit completed event
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed
        })

        # Send push notification for new downloads
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        # Any failure in the generic path is reported, never re-raised
        self.log(f"Error syncing {creator['username']}: {e}", 'error')
        # Unregister from active syncs
        self._unregister_active_sync(creator_id)
        # Emit error event
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def sync_all_creators(self, enabled_only: bool = True, scheduled: bool = False, download: bool = True) -> Dict[int, SyncResult]:
    """Sync all creators, running one sync group per service concurrently.

    Creators are partitioned by ``service_id`` into a YouTube, a Pornhub and
    a Reddit group; every other service goes into a catch-all group labelled
    ``Coomer/Kemono``. Each non-empty group is handed to
    ``_sync_creator_group`` and the groups are awaited together.

    Args:
        enabled_only: Only sync enabled creators
        scheduled: If True, create notifications (for scheduled syncs only)
        download: If True, download files after syncing. If False, only sync metadata.

    Returns:
        Mapping of creator id to its SyncResult. Creators whose whole group
        failed (raised or was cancelled) are absent from the mapping; an
        empty dict is returned when there is nothing to sync.
    """
    creators = self.db.get_creators(enabled_only=enabled_only)

    # Separate creators by service type for parallel execution.
    # Insertion order of `groups` fixes the order in which tasks are created.
    catch_all = 'Coomer/Kemono'
    dedicated = {'youtube': 'YouTube', 'pornhub': 'Pornhub', 'reddit': 'Reddit'}
    groups: Dict[str, List[Dict]] = {'YouTube': [], 'Pornhub': [], 'Reddit': [], catch_all: []}
    for creator in creators:
        groups[dedicated.get(creator['service_id'], catch_all)].append(creator)

    youtube_creators = groups['YouTube']
    pornhub_creators = groups['Pornhub']
    reddit_creators = groups['Reddit']
    other_creators = groups[catch_all]

    self.log(f"Syncing {len(creators)} creators ({len(youtube_creators)} YouTube, {len(pornhub_creators)} Pornhub, {len(reddit_creators)} Reddit, {len(other_creators)} other)", 'info')

    # Run each non-empty service group in parallel
    tasks = [
        self._sync_creator_group(members, label, scheduled=scheduled, download=download)
        for label, members in groups.items()
        if members
    ]
    if not tasks:
        return {}

    # Wait for every group to complete; a failing group must not abort the others
    group_results = await asyncio.gather(*tasks, return_exceptions=True)

    # Merge results
    results = {}
    total_new_posts = 0
    total_new_files = 0

    for group_result in group_results:
        # With return_exceptions=True a cancelled group is returned as
        # CancelledError, which derives from BaseException (not Exception),
        # so the broader check is required before treating it as a dict.
        if isinstance(group_result, BaseException):
            self.log(f"Error in sync group: {group_result}", 'error')
            continue
        for creator_id, result in group_result.items():
            results[creator_id] = result
            total_new_posts += result.new_posts
            total_new_files += result.new_attachments

    # Note: Per-creator notifications are sent in individual sync methods,
    # so no aggregate notification is sent here (it would duplicate them).

    self.log(f"Sync complete: {total_new_posts} new posts, {total_new_files} new files", 'info')
    return results
|
|
|
|
async def _sync_creator_group(self, creators: List[Dict], group_name: str, scheduled: bool = False, download: bool = True) -> Dict[int, SyncResult]:
    """Sync the given creators one after another and collect their results.

    Progress is tracked through a ``TaskCheckpoint`` named after the group:
    creators the checkpoint already reports as completed are skipped, and
    each creator is marked completed once handled (including creators that
    are skipped because a sync for them is already running).

    Args:
        creators: Creator rows; 'id' and 'username' are read from each.
        group_name: Label used in log messages and in the checkpoint name.
        scheduled: Forwarded to ``sync_creator``.
        download: Forwarded to ``sync_creator``.

    Returns:
        Mapping of creator id to SyncResult for every creator that was
        actually synced in this call; a raised exception is recorded as a
        failed SyncResult.
    """
    self.log(f"Starting {group_name} sync group ({len(creators)} creators)", 'info')

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    tracker = TaskCheckpoint(f'paid_content:{group_name}', 'background')
    tracker.start(total_items=len(creators))
    if tracker.is_recovering():
        self.log(f"{group_name}: recovering — skipping already-synced creators", 'info')

    outcome = {}
    for entry in creators:
        db_id = entry['id']
        checkpoint_key = str(db_id)  # the checkpoint is addressed with string keys

        if tracker.is_completed(checkpoint_key):
            continue

        if not self._is_creator_syncing(db_id):
            tracker.set_current(checkpoint_key)
            try:
                sync_result = await self.sync_creator(db_id, download=download, scheduled=scheduled)
            except Exception as exc:
                self.log(f"Error syncing {entry['username']}: {exc}", 'error')
                sync_result = SyncResult(success=False, error=str(exc))
            outcome[db_id] = sync_result
        else:
            # Another sync for this creator is in flight; record nothing for it
            self.log(f"Skipping {entry['username']} — already syncing", 'info')

        tracker.mark_completed(checkpoint_key)

    tracker.finish()
    self.log(f"Completed {group_name} sync group", 'info')
    return outcome
|
|
|
|
async def _sync_youtube_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a YouTube channel - fetch new videos and optionally download them.

    The sync is registered as active and a 'paid_content_sync_started' event
    is emitted up front; every exit path unregisters the sync and emits
    either 'paid_content_sync_completed' or 'paid_content_sync_error'.

    Args:
        creator: Creator row. Reads 'id', 'username', 'creator_id' (channel
            ID or handle), 'last_post_date' and 'auto_download'.
        download: If True and the creator's 'auto_download' is not falsy,
            pending videos are downloaded after the metadata sync.
        scheduled: If True, only the last 3 days and at most 20 videos are
            considered; also forwarded to the creator notification.

    Returns:
        SyncResult with post/attachment/download counts. ``success=False``
        with an error message when yt-dlp is unavailable or the sync raises.
    """
    from datetime import timedelta

    creator_id = creator['id']
    self.log(f"Syncing YouTube channel: {creator['username']}", 'info')

    # Register active sync for polling-based updates
    sync_data = {
        'username': creator['username'],
        'platform': 'youtube',
        'service': 'youtube',
        'status': 'Fetching videos...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)

    # Emit WebSocket event
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data
    })

    try:
        youtube = self._get_youtube_client()
        if not youtube.is_available():
            error = "yt-dlp not available"
            self.log(f"Error syncing YouTube channel {creator['username']}: {error}", 'error')
            self._unregister_active_sync(creator_id)
            # A 'started' event has already gone out, so emit a terminal
            # event here as well instead of leaving listeners waiting.
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id,
                'username': creator['username'],
                'error': error
            })
            return SyncResult(success=False, error=error)

        # Build channel URL from creator_id (which stores the channel ID or handle)
        channel_url = youtube.normalize_channel_url(creator['creator_id'])

        # Fetch and update creator profile (display name, avatar, banner).
        # Best effort: a profile failure must not abort the video sync.
        try:
            profile_updates = {}

            # Get channel name from yt-dlp
            channel_info = await youtube.get_channel_info(channel_url)
            if channel_info and channel_info.get('channel_name'):
                profile_updates['display_name'] = channel_info['channel_name']

            # Get avatar by scraping the page (yt-dlp doesn't provide it)
            avatar_url = await youtube.get_channel_avatar(channel_url)
            if avatar_url:
                cached = await self._cache_profile_image(avatar_url, 'youtube', creator['creator_id'], 'avatar')
                profile_updates['profile_image_url'] = cached or avatar_url

            # Get banner by scraping the page
            banner_url = await youtube.get_channel_banner(channel_url)
            if banner_url:
                cached = await self._cache_profile_image(banner_url, 'youtube', creator['creator_id'], 'banner')
                profile_updates['banner_image_url'] = cached or banner_url

            # Get metadata (bio, joined date, location, external links) by scraping the page
            metadata = await youtube.get_channel_metadata(channel_url)
            if metadata:
                for field in ('bio', 'joined_date', 'location', 'external_links'):
                    if metadata.get(field):
                        profile_updates[field] = metadata[field]

            if profile_updates:
                self.db.update_creator(creator_id, profile_updates)
                self.log(f"Updated YouTube creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update YouTube creator profile: {e}", 'warning')

        # Fetch videos since last check with progress callback
        # Scheduled syncs only check last 3 days for efficiency
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')

        def progress_callback(count: int):
            status = f'Fetched {count} videos...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': count
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status,
                'phase': 'fetching',
                'posts_fetched': count
            })

        # Get known video IDs from DB so the members-only scan
        # doesn't re-fetch videos we already have
        known_video_ids = set()
        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                    (creator_id,)
                )
                known_video_ids = {row[0] for row in cursor.fetchall()}
        except Exception as e:
            # Best effort: fall back to an empty set, but leave a trace
            self.log(f"Could not load known YouTube video IDs for {creator['username']}: {e}", 'debug')

        # Get videos as Post objects
        # For scheduled syncs, limit to 20 videos max (recent content only)
        max_videos = 20 if scheduled else None
        posts = await youtube.get_posts(
            channel_url,
            since_date=since_date,
            max_videos=max_videos,
            progress_callback=progress_callback,
            known_video_ids=known_video_ids
        )

        if not posts:
            self.log(f"No new videos for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': 0,
                'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self.log(f"Found {len(posts)} new videos for {creator['username']}", 'info')

        # Update status
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} videos...',
            'phase': 'processing',
            'total_posts': len(posts)
        })

        new_posts = 0
        new_attachments = 0

        for post in posts:
            # Insert/update post in database
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue
            if is_new_post:
                new_posts += 1

            # Insert video attachment
            for idx, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = idx
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            # Apply auto tags from source (e.g. "Members Only" for subscriber-only videos)
            for tag_name in getattr(post, 'auto_tags', []):
                tag = self.db.get_tag_by_slug(tag_name.lower().replace(' ', '-'))
                if not tag:
                    tag_id = self.db.create_tag(tag_name, color='#8b5cf6', description='Auto-applied by YouTube sync')
                else:
                    tag_id = tag['id']
                if tag_id:
                    self.db.add_tag_to_post(post_db_id, tag_id)

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Update creator stats - find the actual newest post date (posts may not be sorted by date)
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        # Download videos if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_youtube_videos(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

        # Unregister from active syncs
        self._unregister_active_sync(creator_id)

        # Emit completed event
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed
        })

        # Send push notification for new downloads
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing YouTube channel {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _download_youtube_videos(self, creator_id: int) -> Dict:
    """Download pending YouTube videos using yt-dlp.

    Each pending attachment is downloaded into a fresh temporary directory
    and then moved to ``<base>/youtube/<username>/<YYYY-MM-DD>/``; the
    attachment row is updated to 'completed' or 'failed' accordingly.

    Args:
        creator_id: Database id of the creator whose pending attachments
            should be downloaded.

    Returns:
        Dict with 'downloaded' and 'failed' counts and 'downloaded_file_info'
        (one entry per downloaded file, used for notifications). An 'error'
        key is added when yt-dlp is not available.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    youtube = self._get_youtube_client()
    if not youtube.is_available():
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': [], 'error': 'yt-dlp not available'}

    total = len(pending)
    self.log(f"Downloading {total} YouTube videos for {creator['username']}", 'info')

    # Update status
    self._update_active_sync(creator_id, {
        'phase': 'downloading',
        'status': f'Downloading {total} videos...',
        'total_files': total,
        'downloaded': 0
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    quality = self.config.get('embed_quality', 'best')

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    for position, att in enumerate(pending, start=1):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                self.log(f"Post not found for attachment {att.get('id')}", 'warning')
                failed += 1
                continue

            # Build output directory
            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'youtube' / self._sanitize_filename(creator['username']) / post_date

            # Video URL is stored in download_url
            video_url = att.get('download_url')
            if not video_url:
                self.log(f"No download URL for attachment {att.get('id')}, {att.get('name')}", 'warning')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='No video URL'
                )
                failed += 1
                continue

            # Update status. The name may be stored as None, so normalise it
            # before slicing; otherwise the TypeError would be caught below
            # and a perfectly downloadable attachment would be marked failed.
            display_name = att.get('name') or ''
            self._update_active_sync(creator_id, {
                'status': f'Downloading video {position}/{total}: {display_name[:40]}...',
                'downloaded': downloaded
            })

            self.db.update_attachment_status(att['id'], 'downloading')

            # Download using yt-dlp to temp dir first (avoids mergerfs .part rename issues)
            tmp_dir = tempfile.mkdtemp(prefix='ytdlp_')
            try:
                result = await youtube.download_video(video_url, Path(tmp_dir), quality=quality)
                if result and result.get('success'):
                    reported_path = result.get('file_path')
                    tmp_file = Path(reported_path) if reported_path else None
                    if tmp_file is not None and tmp_file.exists():
                        # Move completed file to final destination
                        output_dir.mkdir(parents=True, exist_ok=True)
                        final_path = output_dir / tmp_file.name
                        shutil.move(str(tmp_file), str(final_path))
                        result['file_path'] = str(final_path)
                        result['file_size'] = final_path.stat().st_size
                    else:
                        # Nothing usable was produced and the temp dir is
                        # removed below, so recording this as 'completed'
                        # would leave a row pointing at a missing file.
                        result = {
                            **result,
                            'success': False,
                            'error': 'Downloaded file not found after yt-dlp finished'
                        }
            finally:
                shutil.rmtree(tmp_dir, ignore_errors=True)

            if not result:
                self.log(f"No result from yt-dlp for {att.get('name')}", 'warning')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='yt-dlp returned no result',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue

            if not result.get('success'):
                error = result.get('error', 'Unknown error')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=error,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                self.log(f"Failed to download {att.get('name', 'video')}: {error}", 'warning')
                continue

            # From here on the file is known to exist at its final location
            file_path = result['file_path']
            file_size = result.get('file_size', 0)

            # Download YouTube thumbnail instead of generating from video
            thumbnail_data = None
            video_id = post.get('post_id')  # post_id is the YouTube video ID
            if video_id:
                thumbnail_data = await self._download_youtube_thumbnail(video_id)
                if thumbnail_data:
                    self.log(f"Downloaded YouTube thumbnail for {att.get('name', 'video')} ({len(thumbnail_data)} bytes)", 'debug')

            # Extract video dimensions
            width, height, duration = self._extract_dimensions(Path(file_path), 'video')
            if width and height:
                self.log(f"Extracted dimensions for {att.get('name', 'video')}: {width}x{height}, {duration}s", 'debug')

            self.db.update_attachment_status(att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name,
                file_size=file_size,
                width=width,
                height=height,
                duration=duration,
                thumbnail_data=thumbnail_data,
                downloaded_at=datetime.now().isoformat()
            )

            # Update post as downloaded if all attachments are done
            self.db.mark_post_downloaded(att['post_id'])

            # Update creator stats
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)

            downloaded += 1
            self.log(f"Downloaded: {att.get('name', 'video')}", 'debug')

            # Collect file info for notifications
            downloaded_file_info.append({
                'file_path': file_path,
                'filename': Path(file_path).name,
                'source': creator['username'],
                'content_type': att.get('file_type', 'video')
            })

        except Exception as e:
            self.log(f"Error downloading YouTube video: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
async def _sync_pornhub_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Pornhub creator - fetch new videos and optionally download them.

    The sync is registered as active and a 'paid_content_sync_started' event
    is emitted up front; every exit path unregisters the sync and emits
    either 'paid_content_sync_completed' or 'paid_content_sync_error'.

    Args:
        creator: Creator row. Reads 'id', 'username', 'creator_id'
            (type/name), 'last_post_date' and 'auto_download'.
        download: If True and the creator's 'auto_download' is not falsy,
            pending videos are downloaded after the metadata sync.
        scheduled: If True, only the last 3 days and at most 20 videos are
            considered; also forwarded to the creator notification.

    Returns:
        SyncResult with post/attachment/download counts. ``success=False``
        with an error message when yt-dlp is unavailable or the sync raises.
    """
    from datetime import timedelta

    creator_id = creator['id']
    self.log(f"Syncing Pornhub creator: {creator['username']}", 'info')

    # Register active sync for polling-based updates
    sync_data = {
        'username': creator['username'],
        'platform': 'pornhub',
        'service': 'pornhub',
        'status': 'Fetching videos...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)

    # Emit WebSocket event
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data
    })

    try:
        pornhub = self._get_pornhub_client()
        if not pornhub.is_available():
            error = "yt-dlp not available"
            self.log(f"Error syncing Pornhub creator {creator['username']}: {error}", 'error')
            self._unregister_active_sync(creator_id)
            # A 'started' event has already gone out, so emit a terminal
            # event here as well instead of leaving listeners waiting.
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id,
                'username': creator['username'],
                'error': error
            })
            return SyncResult(success=False, error=error)

        # Build creator URL from creator_id (which stores type/name)
        creator_url = pornhub.normalize_creator_url(creator['creator_id'])

        # Fetch and update creator profile.
        # Best effort: a profile failure must not abort the video sync.
        try:
            profile_updates = {}

            # Get profile image
            avatar_url = await pornhub.get_profile_image(creator_url)
            if avatar_url:
                cached = await self._cache_profile_image(avatar_url, 'pornhub', creator['creator_id'], 'avatar')
                profile_updates['profile_image_url'] = cached or avatar_url

            # Get banner
            banner_url = await pornhub.get_profile_banner(creator_url)
            if banner_url:
                cached = await self._cache_profile_image(banner_url, 'pornhub', creator['creator_id'], 'banner')
                profile_updates['banner_image_url'] = cached or banner_url

            # Get bio
            bio = await pornhub.get_profile_bio(creator_url)
            if bio:
                profile_updates['bio'] = bio

            # Get joined/career start date
            joined_date = await pornhub.get_joined_date(creator_url)
            if joined_date:
                profile_updates['joined_date'] = joined_date

            if profile_updates:
                self.db.update_creator(creator_id, profile_updates)
                self.log(f"Updated Pornhub creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update Pornhub creator profile: {e}", 'warning')

        # Fetch videos since last check with progress callback
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')

        def progress_callback(count: int):
            status = f'Fetched {count} videos...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': count
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status,
                'phase': 'fetching',
                'posts_fetched': count
            })

        # Get videos as Post objects
        max_videos = 20 if scheduled else None
        posts = await pornhub.get_posts(
            creator_url,
            since_date=since_date,
            max_videos=max_videos,
            progress_callback=progress_callback
        )

        if not posts:
            self.log(f"No new videos for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': 0,
                'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self.log(f"Found {len(posts)} new videos for {creator['username']}", 'info')

        # Update status
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} videos...',
            'phase': 'processing',
            'total_posts': len(posts)
        })

        new_posts = 0
        new_attachments = 0

        for post in posts:
            # Insert/update post in database
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue
            if is_new_post:
                new_posts += 1

            # Insert video attachment
            for idx, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = idx
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Update creator stats
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        # Download videos if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_pornhub_videos(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

            # Post dates are backfilled during download (from yt-dlp metadata)
            # Update creator's last_post_date now that we have actual dates
            if downloaded > 0:
                try:
                    latest = result.get('latest_post_date')
                    if latest:
                        self.db.update_creator(creator_id, {'last_post_date': latest})
                except Exception as e:
                    # Non-fatal: the downloads themselves succeeded
                    self.log(f"Could not update last_post_date for {creator['username']}: {e}", 'warning')

        # Unregister from active syncs
        self._unregister_active_sync(creator_id)

        # Emit completed event
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed
        })

        # Send push notification for new downloads
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing Pornhub creator {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _download_pornhub_videos(self, creator_id: int) -> Dict:
    """Download pending Pornhub videos using yt-dlp.

    Each pending attachment is downloaded into
    ``<base>/pornhub/<username>/<YYYY-MM-DD or unknown-date>/``, a thumbnail
    is generated from the video file, and the attachment row is updated to
    'completed' or 'failed'. When the post has no 'published_at', it is
    backfilled from the 'timestamp' / 'upload_date' reported with the
    download result.

    Args:
        creator_id: Database id of the creator whose pending attachments
            should be downloaded.

    Returns:
        Dict with 'downloaded' and 'failed' counts, 'downloaded_file_info'
        (one entry per downloaded file) and 'latest_post_date' (newest
        backfilled date, or None). An 'error' key is added when yt-dlp is
        not available.
    """
    from datetime import timezone

    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': [], 'latest_post_date': None}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': [], 'latest_post_date': None}

    pornhub = self._get_pornhub_client()
    if not pornhub.is_available():
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': [], 'latest_post_date': None,
                'error': 'yt-dlp not available'}

    total = len(pending)
    self.log(f"Downloading {total} Pornhub videos for {creator['username']}", 'info')

    # Update status
    self._update_active_sync(creator_id, {
        'phase': 'downloading',
        'status': f'Downloading {total} videos...',
        'total_files': total,
        'downloaded': 0
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    quality = self.config.get('embed_quality', 'best')

    downloaded = 0
    failed = 0
    downloaded_file_info = []
    latest_post_date = None

    for position, att in enumerate(pending, start=1):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                self.log(f"Post not found for attachment {att.get('id')}", 'warning')
                failed += 1
                continue

            # Build output directory
            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'pornhub' / self._sanitize_filename(creator['username']) / post_date

            # Video URL is stored in download_url
            video_url = att.get('download_url')
            if not video_url:
                self.log(f"No download URL for attachment {att.get('id')}, {att.get('name')}", 'warning')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='No video URL'
                )
                failed += 1
                continue

            # Update status. The name may be stored as None, so normalise it
            # before slicing; otherwise the TypeError would be caught below
            # and a perfectly downloadable attachment would be marked failed.
            display_name = att.get('name') or ''
            self._update_active_sync(creator_id, {
                'status': f'Downloading video {position}/{total}: {display_name[:40]}...',
                'downloaded': downloaded
            })

            self.db.update_attachment_status(att['id'], 'downloading')

            # Download using yt-dlp
            result = await pornhub.download_video(video_url, output_dir, quality=quality)

            if not result:
                self.log(f"No result from yt-dlp for {att.get('name')}", 'warning')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message='yt-dlp returned no result',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue

            if not result.get('success'):
                error = result.get('error', 'Unknown error')
                self.db.update_attachment_status(att['id'], 'failed',
                    error_message=error,
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                self.log(f"Failed to download {att.get('name', 'video')}: {error}", 'warning')
                continue

            file_path = result.get('file_path')
            file_size = result.get('file_size', 0)

            # Extract video dimensions first (need duration for thumbnail seek)
            width, height, duration = None, None, None
            if file_path:
                width, height, duration = self._extract_dimensions(Path(file_path), 'video')
                if width and height:
                    self.log(f"Extracted dimensions for {att.get('name', 'video')}: {width}x{height}, {duration}s", 'debug')

            # Generate thumbnail from video file (Pornhub CDN thumbnails are just logos)
            # Seek to 10 seconds to skip intro branding
            thumbnail_data = None
            if file_path and Path(file_path).exists():
                seek_secs = 10
                # For clips no longer than the default seek point, seeking
                # to 10s lands at or past the end; use the midpoint instead.
                if isinstance(duration, (int, float)) and 0 < duration <= seek_secs:
                    seek_secs = int(duration // 2)
                seek_time = f'{seek_secs // 3600:02d}:{(seek_secs % 3600) // 60:02d}:{seek_secs % 60:02d}'
                thumbnail_data = self._generate_video_thumbnail(Path(file_path), seek_time=seek_time)
                if thumbnail_data:
                    self.log(f"Generated video thumbnail at {seek_time} for {att.get('name', 'video')}", 'debug')

            self.db.update_attachment_status(att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name if file_path else None,
                file_size=file_size,
                width=width,
                height=height,
                duration=duration,
                thumbnail_data=thumbnail_data,
                downloaded_at=datetime.now().isoformat()
            )

            # Update post date from yt-dlp metadata if it was missing
            # Prefer 'timestamp' (epoch) for full date+time, fallback to 'upload_date' (YYYYMMDD)
            timestamp = result.get('timestamp')
            upload_date = result.get('upload_date')
            if (timestamp or upload_date) and not post.get('published_at'):
                try:
                    if timestamp:
                        formatted_date = datetime.fromtimestamp(int(timestamp), tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')
                    elif len(upload_date) == 8 and upload_date.isdigit():
                        formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
                    else:
                        formatted_date = upload_date
                    self.db.update_post(att['post_id'], {'published_at': formatted_date})
                    self.log(f"Updated post date: {formatted_date}", 'debug')
                    # Track latest date for creator update
                    if not latest_post_date or formatted_date > latest_post_date:
                        latest_post_date = formatted_date
                except Exception as e:
                    self.log(f"Could not update post date: {e}", 'debug')

            # Update post as downloaded if all attachments are done
            self.db.mark_post_downloaded(att['post_id'])

            # Update creator stats
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)

            downloaded += 1
            self.log(f"Downloaded: {att.get('name', 'video')}", 'debug')

            # Collect file info for notifications
            if file_path:
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['username'],
                    'content_type': att.get('file_type', 'video')
                })

        except Exception as e:
            self.log(f"Error downloading Pornhub video: {e}", 'error')
            self.db.update_attachment_status(att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info, 'latest_post_date': latest_post_date}
|
|
|
|
# =========================================================================
|
|
# XHamster sync/download/add
|
|
# =========================================================================
|
|
|
|
async def _sync_xhamster_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync an XHamster creator: fetch new media posts and optionally download them.

    Args:
        creator: Creator record. Reads ``id``, ``username``, ``creator_id``,
            ``last_post_date`` and ``auto_download``.
        download: When True (and the creator's ``auto_download`` flag is not
            falsy) pending attachments are downloaded once posts are stored.
        scheduled: True for scheduled runs. Restricts the fetch to the last
            3 days and at most 20 items, and is forwarded to the
            notification helper.

    Returns:
        SyncResult with new post/attachment and download counts, or a failed
        SyncResult carrying the error message.
    """
    from datetime import timedelta

    creator_id = creator['id']
    self.log(f"Syncing XHamster creator: {creator['username']}", 'info')

    sync_data = {
        'username': creator['username'],
        'platform': 'xhamster',
        'service': 'xhamster',
        'status': 'Fetching media...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    try:
        xhamster = self._get_xhamster_client()
        if not xhamster.is_available():
            error = "yt-dlp not available"
            self._unregister_active_sync(creator_id)
            # A "started" event was already emitted; always follow it with a
            # terminal event, exactly as the exception path below does.
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id, 'username': creator['username'], 'error': error
            })
            return SyncResult(success=False, error=error)

        creator_url = xhamster.normalize_creator_url(creator['creator_id'])

        # Update profile (best effort: a failure here must not abort the sync)
        try:
            profile_updates = {}
            avatar_url = await xhamster.get_profile_image(creator_url)
            if avatar_url:
                cached = await self._cache_profile_image(avatar_url, 'xhamster', creator['creator_id'], 'avatar')
                profile_updates['profile_image_url'] = cached or avatar_url
            bio = await xhamster.get_profile_bio(creator_url)
            if bio:
                profile_updates['bio'] = bio
            if profile_updates:
                self.db.update_creator(creator_id, profile_updates)
                self.log(f"Updated XHamster creator profile: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update XHamster creator profile: {e}", 'warning')

        # Fetch videos: scheduled runs use a fixed 3-day window, manual runs
        # resume from the creator's stored last post date.
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')

        def progress_callback(count: int):
            status = f'Fetched {count} media items...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': count
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status,
                'phase': 'fetching',
                'posts_fetched': count
            })

        max_videos = 20 if scheduled else None
        posts = await xhamster.get_posts(
            creator_url,
            since_date=since_date,
            max_videos=max_videos,
            progress_callback=progress_callback
        )

        if not posts:
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self.log(f"Found {len(posts)} new media posts for {creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} media posts...',
            'phase': 'processing', 'total_posts': len(posts)
        })

        # Get or create "Short" tag for auto-tagging moments
        short_tag = self.db.get_tag_by_slug('short')
        short_tag_id = short_tag['id'] if short_tag else None
        if not short_tag_id:
            short_tag_id = self.db.create_tag('Short', '#8b5cf6', 'Short-form content (moments, clips)')

        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue
            if is_new_post:
                new_posts += 1
            for idx, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = idx
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            # Auto-tag shorts/moments: any attachment whose path or URL
            # contains "/moments/" marks the whole post as a short.
            if short_tag_id:
                is_short = any(
                    '/moments/' in (att.server_path or '') or '/moments/' in (att.download_url or '')
                    for att in post.attachments
                )
                if is_short:
                    self.db.add_tag_to_post(post_db_id, short_tag_id)

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # posts is non-empty here (guarded above), so no extra emptiness check.
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_xhamster_videos(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

            discovered_date = result.get('latest_post_date')
            if downloaded > 0 and discovered_date:
                try:
                    # Dates discovered during download belong to posts that
                    # had no publish date. Only advance the stored marker,
                    # never move it backwards.
                    # NOTE(review): assumes dates are ISO-8601 strings, as the
                    # downloader's own string comparison does - confirm.
                    current_mark = latest_post_date or creator.get('last_post_date')
                    if not current_mark or discovered_date > current_mark:
                        self.db.update_creator(creator_id, {'last_post_date': discovered_date})
                except Exception as e:
                    # Best effort, but leave a trace instead of failing silently.
                    self.log(f"Could not update last post date: {e}", 'debug')

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing XHamster creator {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _download_xhamster_videos(self, creator_id: int) -> Dict:
    """Download pending XHamster media (videos and images) for one creator.

    Attachments are processed sequentially with a short pause between
    items. Images go through the client's ``download_image``; everything
    else goes through ``download_video``. When a video's post has no
    publish date, the date reported by the download result is used to move
    the file out of ``unknown-date`` and to back-fill the post.

    Args:
        creator_id: Database id of the creator whose attachments to download.

    Returns:
        Dict with ``downloaded`` and ``failed`` counts. When downloads were
        attempted it also carries ``downloaded_file_info`` (entries used for
        notifications) and ``latest_post_date`` (newest back-filled publish
        date, or None). ``error`` is set when yt-dlp is unavailable.
    """
    from datetime import timezone

    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}

    xhamster = self._get_xhamster_client()
    if not xhamster.is_available():
        return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp not available'}

    self.log(f"Downloading {len(pending)} XHamster media files for {creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {len(pending)} media files...',
        'total_files': len(pending), 'downloaded': 0
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    quality = self.config.get('embed_quality', 'best')

    downloaded = 0
    failed = 0
    downloaded_file_info = []
    latest_post_date = None

    def mark_failed(att: Dict, message: str) -> None:
        """Record a failed attempt; tolerates a None ``download_attempts``."""
        self.db.update_attachment_status(att['id'], 'failed',
            error_message=message,
            download_attempts=(att.get('download_attempts') or 0) + 1,
            last_attempt=datetime.now().isoformat()
        )

    for i, att in enumerate(pending):
        try:
            # dict.get() only applies its default when the key is missing,
            # not when it is present with None, so fall back explicitly.
            file_type = att.get('file_type') or 'video'

            # Rate limit: delay between downloads to avoid Cloudflare blocks
            if i > 0:
                await asyncio.sleep(0.5 if file_type == 'image' else 2)

            post = self.db.get_post(att['post_id'])
            if not post:
                failed += 1
                continue

            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else None
            creator_dir = base_path / 'xhamster' / self._sanitize_filename(creator['username'])
            output_dir = creator_dir / (post_date or 'unknown-date')

            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                failed += 1
                continue

            display_name = (att.get('name') or '')[:40]
            self._update_active_sync(creator_id, {
                'status': f'Downloading {file_type} {i + 1}/{len(pending)}: {display_name}...',
                'downloaded': downloaded
            })

            self.db.update_attachment_status(att['id'], 'downloading')

            # --- Image download path ---
            if file_type == 'image':
                output_dir.mkdir(parents=True, exist_ok=True)
                # Fall back to "<attachment id>.jpg" when no name is stored.
                image_filename = att.get('name') or f'{att["id"]}.jpg'
                image_path = output_dir / image_filename

                result = await xhamster.download_image(download_url, image_path)

                if result and result.get('success'):
                    file_path = result['file_path']
                    file_size = result.get('file_size', 0)

                    width, height, duration = self._extract_dimensions(Path(file_path), 'image')

                    thumbnail_data = None
                    if Path(file_path).exists():
                        thumbnail_data = self._generate_image_thumbnail(Path(file_path))

                    self.db.update_attachment_status(att['id'], 'completed',
                        local_path=file_path,
                        local_filename=Path(file_path).name,
                        file_size=file_size,
                        width=width, height=height, duration=duration,
                        thumbnail_data=thumbnail_data,
                        downloaded_at=datetime.now().isoformat()
                    )

                    self.db.mark_post_downloaded(att['post_id'])
                    self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                    downloaded += 1

                    downloaded_file_info.append({
                        'file_path': file_path,
                        'filename': Path(file_path).name,
                        'source': creator['username'],
                        'content_type': 'image'
                    })
                else:
                    error = (result.get('error') or 'Unknown error') if result else 'Download returned no result'
                    mark_failed(att, error)
                    failed += 1

                continue

            # --- Video download path ---
            result = await xhamster.download_video(download_url, output_dir, quality=quality)

            if not result:
                mark_failed(att, 'Download returned no result')
                failed += 1
                continue

            if not result.get('success'):
                mark_failed(att, result.get('error') or 'Unknown error')
                failed += 1
                continue

            file_path = result.get('file_path')
            file_size = result.get('file_size', 0)

            # Extract date from download result: prefer the epoch timestamp,
            # otherwise a YYYYMMDD upload date, otherwise the raw value.
            timestamp = result.get('timestamp')
            upload_date = result.get('upload_date')
            formatted_date = None
            if timestamp or upload_date:
                try:
                    if timestamp:
                        formatted_date = datetime.fromtimestamp(int(timestamp), tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')
                    elif len(upload_date) == 8 and upload_date.isdigit():
                        formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
                    else:
                        formatted_date = upload_date
                except Exception as e:
                    # Best effort: an unparseable date just leaves the post undated.
                    self.log(f"Could not parse upload date: {e}", 'debug')

            # Move file to correct date directory if downloaded to unknown-date
            if formatted_date and file_path and not post_date:
                correct_date_dir = creator_dir / formatted_date[:10]
                if str(correct_date_dir) != str(output_dir):
                    try:
                        correct_date_dir.mkdir(parents=True, exist_ok=True)
                        new_path = correct_date_dir / Path(file_path).name
                        shutil.move(str(file_path), str(new_path))
                        file_path = str(new_path)
                        self.log(f"Moved to date directory: {formatted_date[:10]}", 'debug')
                    except Exception as e:
                        self.log(f"Failed to move file to date directory: {e}", 'debug')

            # Update post date (only for posts that had none) and remember
            # the newest back-filled date for the caller.
            if formatted_date and not post.get('published_at'):
                try:
                    self.db.update_post(att['post_id'], {'published_at': formatted_date})
                    if not latest_post_date or formatted_date > latest_post_date:
                        latest_post_date = formatted_date
                except Exception as e:
                    self.log(f"Could not update post date: {e}", 'debug')

            width, height, duration = None, None, None
            if file_path:
                width, height, duration = self._extract_dimensions(Path(file_path), 'video')

            thumbnail_data = None
            if file_path and Path(file_path).exists():
                seek_time = '00:00:10'
                thumbnail_data = self._generate_video_thumbnail(Path(file_path), seek_time=seek_time)

            self.db.update_attachment_status(att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name if file_path else None,
                file_size=file_size,
                width=width, height=height, duration=duration,
                thumbnail_data=thumbnail_data,
                downloaded_at=datetime.now().isoformat()
            )

            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
            downloaded += 1

            # Collect file info for notifications
            if file_path:
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['username'],
                    'content_type': file_type
                })

        except Exception as e:
            self.log(f"Error downloading XHamster media: {e}", 'error')
            mark_failed(att, str(e))
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info, 'latest_post_date': latest_post_date}
|
|
|
|
async def _add_xhamster_creator(self, creator_id_str: str,
                                auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Look up an XHamster creator and store it in the database.

    Args:
        creator_id_str: Creator identifier, normalized to a creator URL by
            the XHamster client.
        auto_download: Stored on the creator record as 1/0.
        download_embeds: Stored on the creator record as 1/0.

    Returns:
        ``{'success': True, 'creator': {...}}`` with the new database id, or
        ``{'success': False, 'error': ...}`` when yt-dlp is unavailable or
        the creator cannot be found.
    """
    client = self._get_xhamster_client()
    if not client.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}

    profile = await client.get_creator(client.normalize_creator_url(creator_id_str))
    if not profile:
        return {'success': False, 'error': 'XHamster creator not found'}

    # Flags are stored as integers on the creator record.
    record = profile.to_dict()
    record.update(
        auto_download=int(bool(auto_download)),
        download_embeds=int(bool(download_embeds)),
    )

    # Record keys are applied after 'id', so they take precedence on a clash.
    created = {'id': self.db.add_creator(record)}
    created.update(record)
    return {'success': True, 'creator': created}
|
|
|
|
# =========================================================================
|
|
# TikTok sync/download/add
|
|
# =========================================================================
|
|
|
|
async def _sync_tiktok_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a TikTok creator: fetch new videos and optionally download them.

    Args:
        creator: Creator record. Reads ``id``, ``username``, ``creator_id``,
            ``last_post_date`` and ``auto_download``.
        download: When True (and the creator's ``auto_download`` flag is not
            falsy) pending posts are downloaded once they are stored.
        scheduled: True for scheduled runs. Restricts the fetch to the last
            3 days and at most 20 videos, and is forwarded to the
            notification helper.

    Returns:
        SyncResult with new post/attachment and download counts, or a failed
        SyncResult carrying the error message.
    """
    from datetime import timedelta

    creator_id = creator['id']
    self.log(f"Syncing TikTok creator: {creator['username']}", 'info')

    sync_data = {
        'username': creator['username'],
        'platform': 'tiktok',
        'service': 'tiktok',
        'status': 'Fetching videos...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    try:
        tiktok = self._get_tiktok_client()
        if not tiktok.is_available():
            error = "yt-dlp/gallery-dl not available"
            self._unregister_active_sync(creator_id)
            # A "started" event was already emitted; always follow it with a
            # terminal event, exactly as the exception path below does.
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id, 'username': creator['username'], 'error': error
            })
            return SyncResult(success=False, error=error)

        creator_url = tiktok.normalize_creator_url(creator['creator_id'])

        # Update profile info (avatar, bio, display name); best effort only.
        try:
            profile_info = await tiktok.get_creator_info(creator_url)
            if profile_info:
                profile_updates = {}
                # Skip the display name when it merely repeats the creator id.
                if profile_info.get('creator_name') and profile_info['creator_name'] != creator.get('creator_id'):
                    profile_updates['display_name'] = profile_info['creator_name']
                if profile_info.get('profile_image_url'):
                    cached = await self._cache_profile_image(profile_info['profile_image_url'], 'tiktok', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or profile_info['profile_image_url']
                if profile_info.get('bio'):
                    profile_updates['bio'] = profile_info['bio']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated TikTok profile for @{creator['creator_id']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update TikTok profile: {e}", 'debug')

        # Fetch videos: scheduled runs use a fixed 3-day window, manual runs
        # resume from the creator's stored last post date.
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')

        def progress_callback(count: int):
            status = f'Fetched {count} videos...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': count
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status,
                'phase': 'fetching',
                'posts_fetched': count
            })

        max_videos = 20 if scheduled else None
        posts = await tiktok.get_posts(
            creator_url,
            since_date=since_date,
            max_videos=max_videos,
            progress_callback=progress_callback
        )

        if not posts:
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self.log(f"Found {len(posts)} new videos for @{creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing', 'total_posts': len(posts)
        })

        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue
            if is_new_post:
                new_posts += 1
            for idx, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = idx
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Update pinned posts from profile page data, when the client
        # recorded any during the fetch.
        pinned_posts = getattr(tiktok, '_last_pinned_posts', None)
        if pinned_posts:
            self.db.update_pinned_posts(creator_id, pinned_posts)

        # posts is non-empty here (guarded above), so no extra emptiness check.
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_tiktok_posts(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing TikTok creator @{creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _download_tiktok_posts(self, creator_id: int) -> Dict:
    """Download pending TikTok posts for one creator via the TikTok client.

    A download may yield several files (carousel photos). The first file
    that actually exists on disk completes the pending attachment; each
    further file is stored as an additional attachment of the same post.
    A download that reports success but leaves no file on disk is recorded
    as a failure rather than leaving the attachment in ``downloading``.

    Args:
        creator_id: Database id of the creator whose posts to download.

    Returns:
        Dict with ``downloaded`` and ``failed`` counts. When downloads were
        attempted it also carries ``downloaded_file_info`` (entries used for
        notifications). ``error`` is set when the client is unavailable.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0}

    tiktok = self._get_tiktok_client()
    if not tiktok.is_available():
        return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp/gallery-dl not available'}

    self.log(f"Downloading {len(pending)} TikTok posts for @{creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {len(pending)} posts...',
        'total_files': len(pending), 'downloaded': 0
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    def mark_failed(att: Dict, message: str) -> None:
        """Record a failed attempt; tolerates a None ``download_attempts``."""
        self.db.update_attachment_status(att['id'], 'failed',
            error_message=message,
            download_attempts=(att.get('download_attempts') or 0) + 1,
            last_attempt=datetime.now().isoformat()
        )

    for i, att in enumerate(pending):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                failed += 1
                continue

            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'tiktok' / self._sanitize_filename(creator['username']) / post_date

            video_url = att.get('download_url')
            if not video_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No video URL')
                failed += 1
                continue

            self._update_active_sync(creator_id, {
                'status': f'Downloading post {i + 1}/{len(pending)}...',
                'downloaded': downloaded
            })

            self.db.update_attachment_status(att['id'], 'downloading')
            result = await tiktok.download_video(video_url, output_dir, username=creator['username'])

            if not result or not result.get('success'):
                error = (result.get('error') or 'Download failed') if result else 'No result'
                mark_failed(att, error)
                failed += 1
                continue

            all_files = result.get('all_files') or []
            is_carousel = result.get('is_carousel', False)

            # Process all downloaded files (carousel photos or single video).
            # "Primary" is the first file that exists on disk, which is not
            # necessarily index 0 of the reported list.
            primary_recorded = False
            primary_size = None
            total_size = 0
            for file_idx, file_str in enumerate(all_files):
                fp = Path(file_str)
                if not fp.exists():
                    continue

                f_size = fp.stat().st_size
                ext = fp.suffix.lower()
                c_type = 'image' if ext in image_exts else 'video'

                # Skip duplicate files: gallery-dl sometimes returns the same
                # video twice (main + subdirectory copy) for non-carousel posts
                if primary_recorded and not is_carousel and primary_size and f_size == primary_size:
                    self.log(f"Skipping duplicate file {fp.name} (same size as primary)", 'debug')
                    try:
                        fp.unlink()
                    except OSError as e:
                        self.log(f"Could not remove duplicate file {fp.name}: {e}", 'debug')
                    continue

                w, h, dur = None, None, None
                thumb_data = None
                if c_type == 'video':
                    w, h, dur = self._extract_dimensions(fp, 'video')
                    thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
                else:
                    w, h, _ = self._extract_dimensions(fp, 'image')

                if not primary_recorded:
                    primary_recorded = True
                    primary_size = f_size
                    # Update the existing attachment with the first file
                    self.db.update_attachment_status(att['id'], 'completed',
                        local_path=str(fp),
                        local_filename=fp.name,
                        name=fp.name,
                        extension=ext,
                        file_size=f_size,
                        file_type=c_type,
                        width=w, height=h, duration=dur,
                        thumbnail_data=thumb_data,
                        downloaded_at=datetime.now().isoformat()
                    )
                else:
                    # Create additional attachments for carousel photos
                    # Use unique server_path per file to avoid upsert collisions
                    self.db.upsert_attachment(att['post_id'], {
                        'name': fp.name,
                        'file_type': c_type,
                        'extension': ext,
                        'server_path': f"{video_url}#slide_{file_idx}",
                        'download_url': video_url,
                        'status': 'completed',
                        'local_path': str(fp),
                        'local_filename': fp.name,
                        'file_size': f_size,
                        'width': w, 'height': h, 'duration': dur,
                        'thumbnail_data': thumb_data,
                        'downloaded_at': datetime.now().isoformat(),
                    })

                total_size += f_size
                downloaded_file_info.append({
                    'file_path': str(fp),
                    'filename': fp.name,
                    'source': creator['username'],
                    'content_type': c_type
                })

            if not primary_recorded:
                # Nothing usable on disk: do not leave the attachment stuck
                # in "downloading" or count the post as downloaded.
                mark_failed(att, 'Download reported success but produced no files')
                failed += 1
                continue

            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, total_size)
            downloaded += 1

        except Exception as e:
            self.log(f"Error downloading TikTok post: {e}", 'error')
            mark_failed(att, str(e))
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
async def _add_tiktok_creator(self, username: str,
                              auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Look up a TikTok creator and store it in the database.

    Args:
        username: TikTok username, normalized to a creator URL by the client.
        auto_download: Stored on the creator record as 1/0.
        download_embeds: Stored on the creator record as 1/0.

    Returns:
        ``{'success': True, 'creator': {...}}`` with the new database id, or
        ``{'success': False, 'error': ...}`` when the client is unavailable
        or either creator lookup returns nothing.
    """
    client = self._get_tiktok_client()
    if not client.is_available():
        return {'success': False, 'error': 'yt-dlp/gallery-dl not available'}

    profile_url = client.normalize_creator_url(username)

    # Raw profile info is fetched first; its bio is written separately below.
    raw_profile = await client.get_creator_info(profile_url)
    if not raw_profile:
        return {'success': False, 'error': 'TikTok creator not found'}

    creator_model = await client.get_creator(profile_url)
    if not creator_model:
        return {'success': False, 'error': 'TikTok creator not found'}

    # Flags are stored as integers on the creator record.
    record = creator_model.to_dict()
    record.update(
        auto_download=int(bool(auto_download)),
        download_embeds=int(bool(download_embeds)),
    )

    new_id = self.db.add_creator(record)

    # Bio is not part of the creator record, so it is stored in a second step.
    bio = raw_profile.get('bio')
    if bio:
        self.db.update_creator(new_id, {'bio': bio})

    # Record keys are applied after 'id', so they take precedence on a clash.
    created = {'id': new_id}
    created.update(record)
    return {'success': True, 'creator': created}
|
|
|
|
# =========================================================================
|
|
# Instagram sync/download/add
|
|
# =========================================================================
|
|
|
|
async def _sync_instagram_creator(self, creator: Dict, download: bool = True, scheduled: bool = False, force_backfill: bool = False) -> SyncResult:
|
|
"""Sync an Instagram creator - fetch new posts and optionally download them"""
|
|
from modules.instagram_rate_limiter import rate_limiter as ig_rate_limiter
|
|
|
|
creator_id = creator['id']
|
|
self.log(f"Syncing Instagram creator: {creator['username']}", 'info')
|
|
|
|
sync_data = {
|
|
'username': creator['username'],
|
|
'platform': 'instagram',
|
|
'service': 'instagram',
|
|
'status': 'Backfilling full timeline...' if force_backfill else 'Scanning profile...',
|
|
'phase': 'backfilling' if force_backfill else 'fetching',
|
|
}
|
|
self._register_active_sync(creator_id, sync_data)
|
|
self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
|
|
|
|
# Concurrent download loop — downloads attachments while scraping
|
|
# continues, so CDN URLs don't expire waiting for all posts to be fetched.
|
|
_dl_done = asyncio.Event()
|
|
_dl_results = {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}
|
|
|
|
async def _bg_download_loop():
|
|
await asyncio.sleep(15) # Let some posts queue up first
|
|
while not _dl_done.is_set():
|
|
r = await self._download_instagram_posts(creator_id, quiet=True)
|
|
_dl_results['downloaded'] += r.get('downloaded', 0)
|
|
_dl_results['failed'] += r.get('failed', 0)
|
|
_dl_results['downloaded_file_info'].extend(r.get('downloaded_file_info', []))
|
|
if r.get('downloaded', 0) == 0:
|
|
try:
|
|
await asyncio.wait_for(_dl_done.wait(), timeout=10)
|
|
except asyncio.TimeoutError:
|
|
pass
|
|
|
|
_dl_task = None
|
|
|
|
# Acquire operation lock (in thread to avoid blocking event loop)
|
|
await asyncio.to_thread(ig_rate_limiter.operation_lock.acquire)
|
|
try:
|
|
# Use ImgInn API adapter directly — paid content must NOT use the
|
|
# universal Instagram system (instagram_client / real IG API).
|
|
imginn_posts_adapter = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
|
|
|
|
# Fetch and update creator profile (avatar, bio, display name, etc.)
|
|
# Profile data comes from ImgInn HTML scraping, not Instagram API.
|
|
needs_profile = not creator.get('profile_image_url')
|
|
if not needs_profile and creator.get('updated_at'):
|
|
try:
|
|
last_update = datetime.fromisoformat(creator['updated_at'])
|
|
needs_profile = (datetime.now() - last_update).total_seconds() > 86400
|
|
except (ValueError, TypeError):
|
|
needs_profile = True
|
|
if needs_profile or not scheduled:
|
|
try:
|
|
profile_info = await imginn_posts_adapter.get_profile_info(creator['creator_id'])
|
|
if profile_info:
|
|
profile_updates = {}
|
|
if profile_info.get('avatar_url'):
|
|
cached = await self._cache_profile_image(profile_info['avatar_url'], 'instagram', creator['creator_id'], 'avatar')
|
|
profile_updates['profile_image_url'] = cached or profile_info['avatar_url']
|
|
if profile_info.get('bio'):
|
|
profile_updates['bio'] = profile_info['bio']
|
|
if profile_info.get('display_name'):
|
|
profile_updates['display_name'] = profile_info['display_name']
|
|
if profile_info.get('post_count'):
|
|
profile_updates['post_count'] = profile_info['post_count']
|
|
if profile_info.get('external_links'):
|
|
profile_updates['external_links'] = profile_info['external_links']
|
|
if profile_updates:
|
|
self.db.update_creator(creator_id, profile_updates)
|
|
self.log(f"Updated Instagram profile for {creator['username']}: {list(profile_updates.keys())}", 'info')
|
|
except Exception as e:
|
|
self.log(f"Failed to fetch Instagram profile info: {e}", 'warning')
|
|
|
|
new_posts = 0
|
|
new_attachments = 0
|
|
creator_url = InstagramAdapter.normalize_creator_url(creator['creator_id'])
|
|
|
|
# Start concurrent download task
|
|
if download and creator.get('auto_download', True):
|
|
_dl_task = asyncio.create_task(_bg_download_loop())
|
|
|
|
# Fetch posts — only new ones since the last sync (or full timeline if backfilling)
|
|
if creator.get('sync_posts', 1):
|
|
since_date = None if force_backfill else creator.get('last_post_date')
|
|
|
|
# Get known post IDs so the scroll can stop when it hits existing posts
|
|
# (for backfill, always load known IDs to skip duplicates)
|
|
known_post_ids = None
|
|
if since_date or force_backfill:
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
|
|
(creator_id,)
|
|
)
|
|
known_post_ids = {row[0] for row in cursor.fetchall()}
|
|
|
|
_use_auth = bool(creator.get('use_authenticated_api'))
|
|
_is_first_sync = not creator.get('last_post_date')
|
|
_backfill_phase = 'backfilling' if force_backfill else 'fetching'
|
|
_profile_post_count = creator.get('post_count') or 0
|
|
_is_paginating = force_backfill or (_use_auth and _is_first_sync)
|
|
|
|
def progress_callback(current, total=None):
|
|
if _is_paginating:
|
|
# current = total scanned, total = new posts found
|
|
new_count = total or 0
|
|
if _profile_post_count:
|
|
status = f'Scanned {current} of {_profile_post_count} posts ({new_count} new)...'
|
|
else:
|
|
status = f'Scanned {current} posts ({new_count} new)...'
|
|
elif total and total > 1:
|
|
status = f'Fetching full-res post {current}/{total}...'
|
|
else:
|
|
status = f'Fetched {current} posts...'
|
|
self._update_active_sync(creator_id, {
|
|
'status': status,
|
|
'phase': _backfill_phase,
|
|
'posts_fetched': current,
|
|
'progress': current,
|
|
'total_files': _profile_post_count if _is_paginating else total,
|
|
})
|
|
self._emit_event('paid_content_sync_progress', {
|
|
'creator_id': creator_id,
|
|
'username': creator['username'],
|
|
'status': status,
|
|
'phase': _backfill_phase,
|
|
'progress': current,
|
|
'total_files': _profile_post_count if _is_paginating else total,
|
|
'posts_fetched': current,
|
|
})
|
|
|
|
# Use ImgInn API adapter directly for posts — it has proper cursor-based
|
|
# pagination that fetches ALL posts. The instagram_client adapter (browser)
|
|
# only scrapes the first page of the HTML profile.
|
|
|
|
# Callback to upsert each post immediately as it's fetched,
|
|
# so downloads are queued before CDN URLs expire.
|
|
def post_callback(post):
|
|
nonlocal new_posts, new_attachments
|
|
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
|
|
if post_db_id:
|
|
if is_new_post:
|
|
new_posts += 1
|
|
for idx, attachment in enumerate(post.attachments):
|
|
att_data = attachment.to_dict()
|
|
att_data['attachment_index'] = idx
|
|
if self.db.upsert_attachment(post_db_id, att_data):
|
|
new_attachments += 1
|
|
self._apply_auto_tag_rules(post_db_id, is_new_post)
|
|
if post.tagged_users:
|
|
self.db.set_post_tagged_users(post_db_id, post.tagged_users)
|
|
|
|
posts = await imginn_posts_adapter.get_posts(
|
|
creator_url,
|
|
since_date=since_date,
|
|
progress_callback=progress_callback,
|
|
known_post_ids=known_post_ids,
|
|
post_callback=post_callback,
|
|
use_authenticated_api=_use_auth,
|
|
paginate_all=force_backfill or (_use_auth and _is_first_sync),
|
|
)
|
|
|
|
if posts:
|
|
self.log(f"Processed {len(posts)} new posts for @{creator['username']}", 'info')
|
|
# Update post count and notify frontend
|
|
current_count = self.db.get_creator_post_count(creator_id)
|
|
self.db.update_creator(creator_id, {'post_count': current_count})
|
|
self._emit_event('paid_content_creator_updated', {
|
|
'creator_id': creator_id,
|
|
'post_count': current_count,
|
|
})
|
|
|
|
latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
|
|
self.db.update_creator(creator_id, {
|
|
'last_checked': datetime.now().isoformat(),
|
|
'last_post_date': latest_post_date or creator.get('last_post_date'),
|
|
'post_count': self.db.get_creator_post_count(creator_id)
|
|
})
|
|
else:
|
|
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
|
|
|
|
# Update pinned posts from Instagram API data
|
|
if hasattr(imginn_posts_adapter, '_last_pinned_posts') and imginn_posts_adapter._last_pinned_posts:
|
|
self.db.update_pinned_posts(creator_id, imginn_posts_adapter._last_pinned_posts)
|
|
else:
|
|
self.log(f"Skipping posts for @{creator['username']} (disabled in sync settings)", 'info')
|
|
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
|
|
|
|
# --- Fetch stories ---
|
|
if creator.get('sync_stories', 1):
|
|
try:
|
|
self._update_active_sync(creator_id, {
|
|
'status': 'Fetching stories...',
|
|
'phase': 'fetching_stories',
|
|
})
|
|
|
|
# Get known story post_ids for incremental sync
|
|
story_known_ids = None
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT post_id FROM paid_content_posts WHERE creator_id = ? AND post_id LIKE 'story_%'",
|
|
(creator_id,)
|
|
)
|
|
story_known_ids = {row[0] for row in cursor.fetchall()}
|
|
|
|
# Use FastDL directly for stories — it hits the Instagram API
|
|
# and returns stable pk-based IDs. ImgInn was causing duplicate
|
|
# stories because its AQ-code IDs differ from FastDL's.
|
|
fastdl_file_map = {}
|
|
story_posts = []
|
|
self._update_active_sync(creator_id, {'status': 'Fetching stories...', 'phase': 'fetching_stories'})
|
|
try:
|
|
story_posts, fastdl_file_map = await self._fetch_stories_via_fastdl(creator['username'])
|
|
if story_known_ids and story_posts:
|
|
story_posts = [p for p in story_posts if p.post_id not in story_known_ids]
|
|
if story_posts:
|
|
self.log(f"FastDL found {len(story_posts)} new stories for @{creator['username']}", 'info')
|
|
except Exception as e:
|
|
self.log(f"FastDL story fetch failed for @{creator['username']}: {e}", 'warning')
|
|
|
|
if story_posts:
|
|
self.log(f"Found {len(story_posts)} new stories for @{creator['username']}", 'info')
|
|
|
|
# Get or create the "Story" tag
|
|
story_tag = self.db.get_tag_by_slug('story')
|
|
if not story_tag:
|
|
story_tag_id = self.db.create_tag('Story', color='#8b5cf6')
|
|
else:
|
|
story_tag_id = story_tag['id']
|
|
|
|
base_path = Path(self.config.get('base_download_path', '/paid-content'))
|
|
|
|
for post in story_posts:
|
|
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
|
|
if post_db_id:
|
|
if is_new_post:
|
|
new_posts += 1
|
|
post_all_completed = True
|
|
for idx, attachment in enumerate(post.attachments):
|
|
att_data = attachment.to_dict()
|
|
att_data['attachment_index'] = idx
|
|
|
|
# If FastDL pre-downloaded this file, move it directly
|
|
temp_file = fastdl_file_map.get(attachment.name)
|
|
if temp_file and Path(temp_file).exists():
|
|
published_at = post.published_at or ''
|
|
post_date = published_at[:10] if published_at else 'unknown-date'
|
|
output_dir = base_path / 'instagram' / self._sanitize_filename(creator['username']) / post_date
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
dest_path = output_dir / attachment.name
|
|
|
|
shutil.move(str(temp_file), str(dest_path))
|
|
|
|
file_size = dest_path.stat().st_size if dest_path.exists() else 0
|
|
width, height, duration = None, None, None
|
|
if attachment.file_type == 'video':
|
|
width, height, duration = self._extract_dimensions(dest_path, 'video')
|
|
else:
|
|
width, height, _ = self._extract_dimensions(dest_path, 'image')
|
|
|
|
thumbnail_data = None
|
|
if dest_path.exists():
|
|
thumbnail_data = self._generate_thumbnail(dest_path, attachment.file_type or 'image')
|
|
|
|
# Compute file_hash for future dedup (stories often lack pk)
|
|
try:
|
|
story_file_hash = hashlib.md5(dest_path.read_bytes()).hexdigest()
|
|
except Exception:
|
|
story_file_hash = None
|
|
|
|
att_data['local_path'] = str(dest_path)
|
|
att_data['local_filename'] = attachment.name
|
|
att_data['file_size'] = file_size
|
|
att_data['file_hash'] = story_file_hash
|
|
att_data['width'] = width
|
|
att_data['height'] = height
|
|
att_data['duration'] = duration
|
|
att_data['status'] = 'completed'
|
|
att_data['downloaded_at'] = datetime.now().isoformat()
|
|
att_data['thumbnail_data'] = thumbnail_data
|
|
else:
|
|
post_all_completed = False
|
|
|
|
if self.db.upsert_attachment(post_db_id, att_data):
|
|
new_attachments += 1
|
|
|
|
# Mark post as downloaded if all attachments were pre-downloaded by FastDL
|
|
if post_all_completed and fastdl_file_map:
|
|
self.db.mark_post_downloaded(post_db_id)
|
|
|
|
self._apply_auto_tag_rules(post_db_id, is_new_post)
|
|
if story_tag_id:
|
|
self.db.add_tag_to_post(post_db_id, story_tag_id)
|
|
|
|
# Update post count after stories
|
|
current_count = self.db.get_creator_post_count(creator_id)
|
|
self.db.update_creator(creator_id, {'post_count': current_count})
|
|
|
|
# Clean up FastDL temp directory
|
|
fastdl_temp = Path('/opt/media-downloader/temp/paid_content_fastdl')
|
|
if fastdl_temp.exists():
|
|
shutil.rmtree(fastdl_temp, ignore_errors=True)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error fetching stories for @{creator['username']}: {e}", 'warning')
|
|
# Non-fatal — posts were already synced successfully
|
|
else:
|
|
self.log(f"Skipping stories for @{creator['username']} (disabled in sync settings)", 'info')
|
|
|
|
# --- Fetch highlights ---
|
|
if creator.get('sync_highlights', 1):
|
|
try:
|
|
self._update_active_sync(creator_id, {
|
|
'status': 'Fetching highlights...',
|
|
'phase': 'fetching_highlights',
|
|
})
|
|
|
|
# Get known highlight post_ids for incremental sync
|
|
highlight_known_ids = None
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT post_id FROM paid_content_posts WHERE creator_id = ? AND post_id LIKE 'highlight_%'",
|
|
(creator_id,)
|
|
)
|
|
highlight_known_ids = {row[0] for row in cursor.fetchall()}
|
|
|
|
def highlight_progress(current, total=None):
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Fetching highlight {current}/{total}...' if total else 'Fetching highlights...',
|
|
'phase': 'fetching_highlights',
|
|
})
|
|
|
|
highlight_posts = await imginn_posts_adapter.get_highlights(
|
|
creator_url,
|
|
known_post_ids=highlight_known_ids,
|
|
progress_callback=highlight_progress,
|
|
)
|
|
|
|
if highlight_posts:
|
|
self.log(f"Found {len(highlight_posts)} new highlights for @{creator['username']}", 'info')
|
|
|
|
# Get or create the "Highlight" tag
|
|
highlight_tag = self.db.get_tag_by_slug('highlight')
|
|
if not highlight_tag:
|
|
highlight_tag_id = self.db.create_tag('Highlight', color='#f59e0b')
|
|
else:
|
|
highlight_tag_id = highlight_tag['id']
|
|
|
|
for post in highlight_posts:
|
|
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
|
|
if post_db_id:
|
|
if is_new_post:
|
|
new_posts += 1
|
|
for idx, attachment in enumerate(post.attachments):
|
|
att_data = attachment.to_dict()
|
|
att_data['attachment_index'] = idx
|
|
if self.db.upsert_attachment(post_db_id, att_data):
|
|
new_attachments += 1
|
|
self._apply_auto_tag_rules(post_db_id, is_new_post)
|
|
if highlight_tag_id:
|
|
self.db.add_tag_to_post(post_db_id, highlight_tag_id)
|
|
|
|
# Update post count after highlights
|
|
current_count = self.db.get_creator_post_count(creator_id)
|
|
self.db.update_creator(creator_id, {'post_count': current_count})
|
|
|
|
except Exception as e:
|
|
self.log(f"Error fetching highlights for @{creator['username']}: {e}", 'warning')
|
|
# Non-fatal — posts were already synced successfully
|
|
|
|
# Refresh failed highlight URLs — re-fetch API for fresh CDN links
|
|
try:
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute("""
|
|
SELECT a.id, a.name, a.download_url
|
|
FROM paid_content_attachments a
|
|
JOIN paid_content_posts p ON a.post_id = p.id
|
|
WHERE p.creator_id = ? AND p.post_id LIKE 'highlight_%'
|
|
AND a.status = 'failed'
|
|
""", (creator_id,))
|
|
failed_highlights = [dict(row) for row in cursor.fetchall()]
|
|
|
|
if failed_highlights:
|
|
self.log(f"Refreshing URLs for {len(failed_highlights)} failed highlight attachments", 'info')
|
|
updated_urls = await imginn_posts_adapter.refresh_failed_highlight_urls(failed_highlights)
|
|
for att_id, new_url in updated_urls.items():
|
|
self.db.update_attachment(att_id, {
|
|
'download_url': new_url,
|
|
'status': 'pending',
|
|
'error_message': None,
|
|
})
|
|
if updated_urls:
|
|
self.log(f"Reset {len(updated_urls)} highlight attachments with fresh URLs", 'info')
|
|
else:
|
|
self.log(f"No fresh URLs available yet for failed highlights (CDN URLs still expired at source)", 'info')
|
|
except Exception as e:
|
|
self.log(f"Error refreshing failed highlight URLs: {e}", 'warning')
|
|
else:
|
|
self.log(f"Skipping highlights for @{creator['username']} (disabled in sync settings)", 'info')
|
|
|
|
# Stop concurrent download loop and collect its results
|
|
_dl_done.set()
|
|
if _dl_task:
|
|
await _dl_task
|
|
|
|
downloaded = _dl_results['downloaded']
|
|
failed = _dl_results['failed']
|
|
downloaded_file_info = list(_dl_results['downloaded_file_info'])
|
|
|
|
# Final download sweep for remaining attachments
|
|
# (stories, highlights, or posts queued near the end)
|
|
if download and creator.get('auto_download', True):
|
|
result = await self._download_instagram_posts(creator_id)
|
|
downloaded += result.get('downloaded', 0)
|
|
failed += result.get('failed', 0)
|
|
downloaded_file_info.extend(result.get('downloaded_file_info', []))
|
|
|
|
self._unregister_active_sync(creator_id)
|
|
self._emit_event('paid_content_sync_completed', {
|
|
'creator_id': creator_id, 'username': creator['username'],
|
|
'new_posts': new_posts, 'new_attachments': new_attachments,
|
|
'downloaded': downloaded, 'failed': failed
|
|
})
|
|
self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)
|
|
|
|
return SyncResult(
|
|
success=True, new_posts=new_posts, new_attachments=new_attachments,
|
|
downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
|
|
)
|
|
|
|
except Exception as e:
|
|
_dl_done.set()
|
|
if _dl_task:
|
|
try:
|
|
await _dl_task
|
|
except Exception:
|
|
pass
|
|
self.log(f"Error syncing Instagram creator @{creator['username']}: {e}", 'error')
|
|
self._unregister_active_sync(creator_id)
|
|
self._emit_event('paid_content_sync_error', {
|
|
'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
|
|
})
|
|
return SyncResult(success=False, error=str(e))
|
|
finally:
|
|
ig_rate_limiter.operation_lock.release()
|
|
|
|
async def _fetch_stories_via_fastdl(self, username: str, creator_id: Optional[int] = None) -> Tuple[List, Dict]:
    """Fetch Instagram stories for a user by running the FastDL subprocess wrapper.

    The wrapper script is launched with the current interpreter, receives its
    JSON configuration on stdin, and is expected to print a JSON object whose
    ``pending_downloads`` list describes every story it found.

    Args:
        username: Instagram username whose stories are fetched.
        creator_id: Optional database id of the creator. When supplied, a story
            is additionally skipped if the database shows a differently named
            story attachment already stored at the same local path. When
            omitted (the default) that path-based check is not performed;
            callers must pass the id for it to take effect.

    Returns:
        Tuple of (list of Post objects, dict mapping attachment_name -> temp_file_path).
        The dict only contains entries whose pre-downloaded file exists on disk.

    Raises:
        asyncio.TimeoutError: The subprocess did not finish within the timeout
            (it is killed before the error is raised).
        RuntimeError: The subprocess exited with a non-zero return code.
        json.JSONDecodeError: The subprocess succeeded but printed invalid JSON.
    """
    timeout_seconds = 300

    temp_dir = Path('/opt/media-downloader/temp/paid_content_fastdl')
    temp_dir.mkdir(parents=True, exist_ok=True)

    config = {
        'username': username,
        'content_type': 'stories',
        'temp_dir': str(temp_dir),
        'days_back': 3,
        'max_downloads': 50,
        'headless': True,
        'db_path': '/opt/media-downloader/database/media_downloader.db',
        'defer_database': True,
    }

    wrapper_path = '/opt/media-downloader/wrappers/fastdl_subprocess_wrapper.py'
    self.log(f"Running FastDL subprocess for @{username} stories...", 'info')

    def discard_temp_file(path: str) -> None:
        # Best-effort removal of a pre-downloaded file that will not be kept.
        if not path:
            return
        try:
            Path(path).unlink()
        except OSError:
            pass

    process = await asyncio.create_subprocess_exec(
        sys.executable, wrapper_path,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(
            process.communicate(input=json.dumps(config).encode()),
            timeout=timeout_seconds,
        )
    except asyncio.TimeoutError:
        # wait_for() only cancels communicate(); the child keeps running
        # unless it is killed and reaped explicitly.
        try:
            process.kill()
        except ProcessLookupError:
            pass  # already exited between the timeout and the kill
        await process.wait()
        raise asyncio.TimeoutError(
            f"FastDL subprocess timed out after {timeout_seconds}s"
        ) from None

    if stderr:
        for line in stderr.decode(errors='replace').strip().splitlines():
            self.log(f"[FastDL] {line}", 'debug')

    if process.returncode != 0:
        error_msg = 'FastDL subprocess failed'
        if stdout:
            # The wrapper may report a structured error on stdout.
            try:
                err_result = json.loads(stdout.decode().strip())
                error_msg = err_result.get('message', error_msg)
            except (json.JSONDecodeError, UnicodeDecodeError):
                pass
        raise RuntimeError(error_msg)

    result = json.loads(stdout.decode().strip())
    pending_downloads = result.get('pending_downloads', [])

    if not pending_downloads:
        return [], {}

    posts = []
    file_map = {}

    for item in pending_downloads:
        media_id = item.get('media_id', '')
        url = item.get('url', '')
        post_date = item.get('post_date')
        filename = item.get('filename', '')
        file_path = item.get('file_path', '')

        # Determine extension and file type from CDN URL or FastDL filename
        cdn_path = url.split('?')[0] if url else ''
        is_video = '.mp4' in cdn_path or '.mp4' in filename
        ext = '.mp4' if is_video else '.jpg'
        file_type = 'video' if is_video else 'image'

        # Use Instagram pk (stable numeric primary key) as story_id.
        # Falls back to media_id if pk not available.
        metadata = item.get('metadata') or {}
        pk = metadata.get('pk', '')
        story_id = pk if pk else media_id

        # When pk is missing (browser fallback), check the file hash against
        # existing attachments to prevent duplicates with different ID formats.
        if not pk and file_path and Path(file_path).exists():
            file_hash = hashlib.md5(Path(file_path).read_bytes()).hexdigest()
            existing = self.db.check_duplicate_hash(file_hash)
            if existing:
                existing_name = existing.get('name', '')
                self.log(f"Story {media_id} is duplicate of existing {existing_name} (same file hash), skipping", 'info')
                discard_temp_file(file_path)
                continue

        att_name = f"story_{story_id}{ext}"

        # Also check by stored local path - catches duplicates where neither
        # story had file_hash populated (e.g. pk-based vs CDN-based IDs).
        # The query is scoped to one creator, so it needs the creator's id.
        if creator_id is not None:
            dup_name = None
            try:
                with self.unified_db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute("""
                        SELECT a.id, a.name FROM paid_content_attachments a
                        JOIN paid_content_posts p ON a.post_id = p.id
                        WHERE p.creator_id = ? AND p.service_id = 'instagram'
                        AND a.name LIKE 'story_%' AND a.local_path IS NOT NULL
                        AND a.local_path = (
                            SELECT a2.local_path FROM paid_content_attachments a2
                            JOIN paid_content_posts p2 ON a2.post_id = p2.id
                            WHERE p2.creator_id = ? AND a2.name = ? AND a2.status = 'completed'
                            LIMIT 1
                        )
                        AND a.name != ?
                        LIMIT 1
                    """, (creator_id, creator_id, att_name, att_name))
                    dup_row = cursor.fetchone()
                    if dup_row:
                        dup_name = dup_row['name']
            except Exception as e:
                # Best-effort check: a failure must not drop the story,
                # but it should be visible rather than silently ignored.
                self.log(f"Story path-duplicate check failed for {att_name}: {e}", 'debug')

            if dup_name is not None:
                self.log(f"Story {media_id} maps to same path as existing {dup_name}, skipping", 'info')
                discard_temp_file(file_path)
                continue

        attachment = Attachment(
            name=att_name,
            file_type=file_type,
            extension=ext,
            server_path=f"https://www.instagram.com/stories/{username}/",
            download_url=url,
        )

        post = Post(
            post_id=f'story_{story_id}',
            service_id='instagram',
            platform='instagram',
            creator_id=username,
            title=f'Story by @{username}',
            content='',
            published_at=post_date,
            attachments=[attachment],
        )
        posts.append(post)

        # Track pre-downloaded file for direct move
        if file_path and Path(file_path).exists():
            file_map[att_name] = file_path

    self.log(f"FastDL returned {len(posts)} stories ({len(file_map)} pre-downloaded)", 'info')
    return posts, file_map
|
|
|
|
async def _download_instagram_posts(self, creator_id: int, quiet: bool = False) -> Dict:
    """Download pending Instagram media from CDN URLs.

    Each pending attachment of the creator is downloaded through
    ``InstagramAdapter.download_media`` into
    ``<base_download_path>/instagram/<username>/<YYYY-MM-DD or unknown-date>/``
    and its database row is updated with the outcome.

    Args:
        creator_id: Database id of the creator whose pending attachments
            are processed.
        quiet: When True, no active-sync status updates or
            ``paid_content_sync_progress`` events are produced.

    Returns:
        Dict with the integer counts ``downloaded`` and ``failed`` and the
        list ``downloaded_file_info`` (one dict per saved file with
        ``file_path``, ``filename``, ``source`` and ``content_type``).
        All three keys are always present.
    """
    video_suffixes = ('.mp4', '.mov', '.webm', '.avi')

    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    instagram = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
    total = len(pending)

    def report_progress(status, downloaded_count, progress=None):
        # Mirror the same payload to the active-sync registry and the event bus.
        if quiet:
            return
        payload = {
            'status': status,
            'phase': 'downloading',
            'downloaded': downloaded_count,
            'total_files': total,
        }
        if progress is not None:
            payload['progress'] = progress
        self._update_active_sync(creator_id, payload)
        self._emit_event('paid_content_sync_progress', {
            'creator_id': creator_id, 'username': creator['username'], **payload,
        })

    def record_failure(att, message):
        # `download_attempts` may be present but None for a row that was never
        # attempted; treat that as zero instead of raising TypeError.
        self.db.update_attachment_status(
            att['id'], 'failed',
            error_message=message,
            download_attempts=(att.get('download_attempts') or 0) + 1,
            last_attempt=datetime.now().isoformat(),
        )

    self.log(f"Downloading {total} Instagram media for @{creator['username']}", 'info')
    report_progress(f'Downloading {total} media files...', 0)

    base_path = Path(self.config.get('base_download_path', '/paid-content'))

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    for i, att in enumerate(pending):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                failed += 1
                continue

            # Files are grouped per publication day (first 10 chars of the timestamp).
            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'instagram' / self._sanitize_filename(creator['username']) / post_date

            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                failed += 1
                continue

            report_progress(f'Downloading media {i + 1}/{total}...', downloaded, progress=i + 1)

            # Determine output filename from attachment name or generate one.
            # The extension falls back to .jpg when missing *or* None.
            filename = (
                att.get('name')
                or att.get('local_filename')
                or f"media_{i}{att.get('extension') or '.jpg'}"
            )
            output_path = Path(output_dir) / filename

            self.db.update_attachment_status(att['id'], 'downloading')
            result = await instagram.download_media(download_url, output_path)

            if not result or not result.get('success'):
                error = result.get('error', 'Download failed') if result else 'No result'
                record_failure(att, error)
                failed += 1
                continue

            file_path = result.get('file_path')
            file_size = result.get('file_size', 0)

            # Classify by the saved file's suffix; anything else counts as an image.
            content_type = 'image'
            if file_path and Path(file_path).suffix.lower() in video_suffixes:
                content_type = 'video'

            width, height, duration = None, None, None
            if file_path and content_type == 'video':
                width, height, duration = self._extract_dimensions(Path(file_path), 'video')
            elif file_path:
                width, height, _ = self._extract_dimensions(Path(file_path), 'image')

            thumbnail_data = None
            if file_path and Path(file_path).exists():
                thumbnail_data = self._generate_thumbnail(Path(file_path), content_type)

            self.db.update_attachment_status(
                att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name if file_path else None,
                file_size=file_size,
                width=width, height=height, duration=duration,
                thumbnail_data=thumbnail_data,
                downloaded_at=datetime.now().isoformat(),
            )

            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
            downloaded += 1

            if file_path:
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['username'],
                    'content_type': content_type,
                })

        except Exception as e:
            # One bad attachment must not abort the remaining downloads.
            self.log(f"Error downloading Instagram post: {e}", 'error')
            record_failure(att, str(e))
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
async def _add_instagram_creator(self, username: str,
                                 auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Add an Instagram creator.

    Looks the creator up through ``InstagramAdapter``, enriches the record
    with whatever the profile lookup returned (locally cached avatar, bio,
    display name, post count, external links) and stores it via
    ``self.db.add_creator``.

    Args:
        username: Instagram username to add.
        auto_download: Stored on the record as 1/0 under ``auto_download``.
        download_embeds: Stored on the record as 1/0 under ``download_embeds``.

    Returns:
        ``{'success': False, 'error': ...}`` when the adapter finds no creator,
        otherwise ``{'success': True, 'creator': {...}}`` where the creator
        dict is the stored record plus its database ``id``.
    """
    adapter = InstagramAdapter(unified_db=self.unified_db, log_callback=self.log_callback)
    profile_url = InstagramAdapter.normalize_creator_url(username)

    # Profile info supplies the richer fields (avatar, bio, display name)
    profile = await adapter.get_profile_info(username)

    found = await adapter.get_creator(profile_url)
    if not found:
        return {'success': False, 'error': 'Instagram creator not found'}

    record = found.to_dict()
    record['auto_download'] = int(bool(auto_download))
    record['download_embeds'] = int(bool(download_embeds))

    if profile:
        # Keep a local copy of the avatar so CDN URL expiry doesn't break it
        avatar_url = profile.get('avatar_url')
        if avatar_url:
            try:
                local_avatar = await self._cache_profile_image(
                    avatar_url, 'instagram', username, 'avatar'
                )
            except Exception as exc:
                self.log(f"Failed to cache avatar for @{username}: {exc}", 'warning')
            else:
                if local_avatar:
                    record['profile_image_url'] = local_avatar

        # Copy over only the profile fields that carry a truthy value
        for field in ('bio', 'display_name', 'post_count', 'external_links'):
            value = profile.get(field)
            if value:
                record[field] = value

    new_id = self.db.add_creator(record)
    return {'success': True, 'creator': {'id': new_id, **record}}
|
|
|
|
# =========================================================================
|
|
# Soundgasm + Liltsome sync/download/add
|
|
# =========================================================================
|
|
|
|
async def _sync_soundgasm_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Soundgasm creator: fetch new audio posts, store them, optionally download.

    Posts already stored for the creator are passed to the Soundgasm client as
    ``known_post_ids`` so that only new posts come back. New posts and their
    attachments are upserted, bracket auto-tags and auto-tag rules are applied,
    and pending audio is downloaded when requested.

    Args:
        creator: Creator row; ``id``, ``username`` and ``creator_id`` are read,
            plus the optional ``auto_download`` flag (treated as on if absent).
        download: When False, the download step is skipped.
        scheduled: Forwarded to ``_send_creator_notification``.

    Returns:
        ``SyncResult`` with ``success=True`` and the new/downloaded/failed
        counts, or ``success=False`` with ``error`` set if anything raised.
    """
    creator_id = creator['id']
    self.log(f"Syncing Soundgasm creator: {creator['username']}", 'info')

    sync_data = {
        'username': creator['username'],
        'platform': 'soundgasm',
        'service': 'soundgasm',
        'status': 'Fetching audio posts...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    try:
        client = self._get_soundgasm_client()

        # Get known post IDs for incremental sync (no dates on Soundgasm)
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,)
            )
            known_post_ids = {row[0] for row in cursor.fetchall()}

        def progress_callback(count):
            status = f'Fetched {count} audio posts...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': count,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status,
                'phase': 'fetching',
                'posts_fetched': count,
            })

        posts = await client.get_posts(
            creator['creator_id'],
            known_post_ids=known_post_ids,
            progress_callback=progress_callback,
        )

        if not posts:
            # Nothing new: just record that the creator was checked.
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id, 'username': creator['username'],
                'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self.log(f"Found {len(posts)} new audio posts for {creator['username']}", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing', 'total_posts': len(posts)
        })

        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue

            if is_new_post:
                new_posts += 1
            for idx, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = idx
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            # Apply bracket auto-tags (new posts only)
            auto_tags = getattr(post, 'auto_tags', None)
            if is_new_post and auto_tags:
                for tag_name in auto_tags:
                    try:
                        slug = tag_name.lower().replace(' ', '-').replace('/', '-')
                        tag = self.db.get_tag_by_slug(slug)
                        if not tag:
                            display = format_tag_display(tag_name)
                            tag_id = self.db.create_tag(
                                display, color='#8b5cf6',
                                description='Auto-extracted from audio title'
                            )
                        else:
                            tag_id = tag['id']
                        if tag_id:
                            self.db.add_tag_to_post(post_db_id, tag_id)
                    except Exception as e:
                        # Don't let tagging errors break sync, but don't hide them either.
                        self.log(
                            f"Failed to apply auto-tag '{tag_name}' to post {post_db_id}: {e}",
                            'warning'
                        )

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Update post count / last-checked in one write, then notify frontend
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'post_count': current_count,
        })
        self._emit_event('paid_content_creator_updated', {
            'creator_id': creator_id,
            'post_count': current_count,
        })

        # Download audio files
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_soundgasm_audio(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing Soundgasm creator {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _download_soundgasm_audio(self, creator_id: int) -> Dict:
    """Download pending Soundgasm/Liltsome audio files via direct HTTP.

    Every pending attachment of the creator is handed to the Soundgasm
    client's ``download_audio`` with a target path of
    ``<base_download_path>/soundgasm/<sanitized username>/<filename>``.
    Attachment rows are moved to ``downloading`` and then to ``completed``
    or ``failed``; progress is mirrored to the active-sync state and to
    ``paid_content_sync_progress`` events.

    Args:
        creator_id: Database id of the creator whose attachments to fetch.

    Returns:
        Dict with the ``downloaded`` and ``failed`` counters and
        ``downloaded_file_info`` (one ``file_path``/``filename``/``source``/
        ``content_type`` entry per saved file).  All three keys are present
        on every return path, including the early exits.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    client = self._get_soundgasm_client()
    total_files = len(pending)

    self.log(f"Downloading {total_files} audio files for {creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {total_files} audio files...',
        'total_files': total_files, 'downloaded': 0
    })
    self._emit_event('paid_content_sync_progress', {
        'creator_id': creator_id, 'username': creator['username'],
        'status': f'Downloading {total_files} audio files...',
        'phase': 'downloading', 'total_files': total_files, 'downloaded': 0,
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    # The target directory depends only on the creator, so build it once.
    output_dir = base_path / 'soundgasm' / self._sanitize_filename(creator['username'])

    def _mark_failed(att, message):
        """Record a failed attempt; a missing/NULL attempt counter counts as 0."""
        self.db.update_attachment_status(
            att['id'], 'failed',
            error_message=message,
            download_attempts=(att.get('download_attempts') or 0) + 1,
            last_attempt=datetime.now().isoformat()
        )

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    for i, att in enumerate(pending):
        try:
            # A missing post row is counted as a failure, but the attachment's
            # status row is left untouched.
            if not self.db.get_post(att['post_id']):
                failed += 1
                continue

            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                failed += 1
                continue

            self._update_active_sync(creator_id, {
                'status': f'Downloading audio {i + 1}/{total_files}...',
                'phase': 'downloading',
                'downloaded': downloaded,
                'progress': i + 1,
                'total_files': total_files,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id, 'username': creator['username'],
                'status': f'Downloading audio {i + 1}/{total_files}...',
                'phase': 'downloading', 'downloaded': downloaded,
                'progress': i + 1, 'total_files': total_files,
            })

            filename = att.get('name') or att.get('local_filename') or f"audio_{i}.m4a"
            output_path = output_dir / filename

            self.db.update_attachment_status(att['id'], 'downloading')
            result = await client.download_audio(download_url, output_path)

            if not result or not result.get('success'):
                error = result.get('error', 'Download failed') if result else 'No result'
                _mark_failed(att, error)
                failed += 1
                continue

            file_path = result.get('file_path')
            file_size = result.get('file_size', 0)

            # Only the duration is kept for audio; width/height stay None.
            duration = None
            if file_path:
                _, _, duration = self._extract_dimensions(Path(file_path), 'audio')

            self.db.update_attachment_status(
                att['id'], 'completed',
                local_path=file_path,
                local_filename=Path(file_path).name if file_path else None,
                file_size=file_size,
                width=None, height=None, duration=duration,
                downloaded_at=datetime.now().isoformat()
            )

            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
            downloaded += 1

            if file_path:
                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': Path(file_path).name,
                    'source': creator['username'],
                    'content_type': 'audio'
                })

        except Exception as e:
            # One broken attachment must not stop the remaining downloads.
            self.log(f"Error downloading Soundgasm audio: {e}", 'error')
            _mark_failed(att, str(e))
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
async def _add_soundgasm_creator(self, username: str,
                                 auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register a Soundgasm creator (looked up on Soundgasm and Liltsome).

    The profile is resolved through the Soundgasm client first; only when
    that lookup returns something is a creator row stored.

    Args:
        username: Name used as creator id, username and display name.
        auto_download: Stored as 1/0 in the ``auto_download`` column.
        download_embeds: Stored as 1/0 in the ``download_embeds`` column.

    Returns:
        ``{'success': True, 'creator': {...}}`` with the new database id
        merged into the stored fields, or ``{'success': False, 'error': ...}``
        when no profile was found.
    """
    soundgasm = self._get_soundgasm_client()
    profile = await soundgasm.get_profile_info(username)
    if not profile:
        return {'success': False, 'error': 'Creator not found on Soundgasm or Liltsome archive'}

    fields = {
        'service_id': 'soundgasm',
        'platform': 'soundgasm',
        'creator_id': username,
        'username': username,
        'display_name': username,
        'post_count': profile.get('post_count', 0),
        'auto_download': int(bool(auto_download)),
        'download_embeds': int(bool(download_embeds)),
    }

    # Store first, then echo the stored fields back with the new row id.
    created = {'id': self.db.add_creator(fields)}
    created.update(fields)
    return {'success': True, 'creator': created}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Bellazon (forum thread scraping)
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _add_bellazon_creator(self, topic_id: str,
                                auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register a Bellazon forum thread as a creator.

    The thread is looked up through the Bellazon client; its ``topic_url``
    (when provided) is stored in the ``bio`` column.

    Args:
        topic_id: Bellazon topic id, stored as the creator id.
        auto_download: Stored as 1/0 in the ``auto_download`` column.
        download_embeds: Stored as 1/0 in the ``download_embeds`` column.

    Returns:
        ``{'success': True, 'creator': {...}}`` including the new database
        id, or ``{'success': False, 'error': ...}`` when the thread lookup
        returned nothing.
    """
    bellazon = self._get_bellazon_client()
    profile = await bellazon.get_profile_info(topic_id)
    if not profile:
        return {'success': False, 'error': 'Thread not found on Bellazon'}

    fields = {
        'service_id': 'bellazon',
        'platform': 'bellazon',
        'creator_id': topic_id,
        'username': profile['username'],
        'display_name': profile['display_name'],
        'post_count': profile.get('post_count', 0),
        'bio': profile.get('topic_url', ''),
        'auto_download': int(bool(auto_download)),
        'download_embeds': int(bool(download_embeds)),
    }

    # Store first, then echo the stored fields back with the new row id.
    created = {'id': self.db.add_creator(fields)}
    created.update(fields)
    return {'success': True, 'creator': created}
|
|
|
|
async def _sync_bellazon_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Bellazon forum thread — scrape pages for image/video posts.

    Steps: register the sync, collect the post ids already stored for the
    creator, ask the Bellazon client for posts (passing those ids for
    incremental sync), upsert posts and attachments, refresh the creator's
    post count, optionally download pending media, run the failed-attachment
    cleanup and finally emit the completion event and notification.

    Args:
        creator: Creator row; ``id``, ``username``, ``display_name`` and
            ``creator_id`` are read, plus the optional ``bio`` (topic URL
            stored at add time) and ``auto_download``.
        download: When false, the media download step is skipped.
        scheduled: Forwarded to ``_send_creator_notification``.

    Returns:
        ``SyncResult`` with ``success=True`` and the counters, or
        ``success=False`` with ``error`` set when any step raised.
    """
    creator_id = creator['id']
    self.log(f"Syncing Bellazon thread: {creator['display_name'] or creator['username']}", 'info')

    sync_data = {
        'username': creator['username'],
        'platform': 'bellazon',
        'service': 'bellazon',
        'status': 'Fetching forum posts...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    try:
        client = self._get_bellazon_client()

        # Get known post IDs for incremental sync
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,)
            )
            known_post_ids = {row[0] for row in cursor.fetchall()}

        # Reconstruct topic URL from bio (stored at add time) or from
        # creator_id + username.  ``bio`` may be present but None, so
        # normalise it before stripping.
        topic_url = (creator.get('bio') or '').strip()
        if not topic_url or 'bellazon.com' not in topic_url:
            topic_url = f"{client.BASE_URL}/topic/{creator['creator_id']}-{creator['username']}"

        def progress_callback(count):
            self._update_active_sync(creator_id, {
                'status': f'Fetched {count} media posts...',
                'posts_fetched': count,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': f'Fetched {count} media posts...',
                'phase': 'fetching',
                'posts_fetched': count,
            })

        posts = await client.get_posts(
            creator['creator_id'],
            topic_url,
            known_post_ids=known_post_ids,
            progress_callback=progress_callback,
        )

        if posts:
            self.log(f"Found {len(posts)} new media posts for {creator['username']}", 'info')
            self._update_active_sync(creator_id, {
                'status': f'Processing {len(posts)} posts...',
                'phase': 'processing', 'total_posts': len(posts)
            })

        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1
                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1

                self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Refresh the stored post count and tell the frontend about it.
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
        self._emit_event('paid_content_creator_updated', {
            'creator_id': creator_id,
            'post_count': current_count,
        })

        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        # Download media files
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self._download_bellazon_media(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

        # Clean up permanently failed attachments and empty posts
        self._cleanup_bellazon_failed(creator_id)

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        # Any failure ends the sync: drop the active-sync entry, report the
        # error to listeners and hand a failed result to the caller.
        self.log(f"Error syncing Bellazon thread {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
def _cleanup_bellazon_failed(self, creator_id: int):
    """Remove permanently failed attachments and delete posts left with zero attachments.

    Runs inside one write connection: failed attachments of the creator are
    deleted; if any were removed, posts of the creator without attachments
    are deleted too and ``attachment_count`` is recomputed on the rest.
    Afterwards the creator's stored post count is refreshed.

    Errors are logged as warnings and never propagate to the caller.

    Args:
        creator_id: Database id of the creator to clean up.
    """
    try:
        # Bound up front so the summary check below is valid even when no
        # failed attachments existed and the post cleanup was skipped.
        removed_posts = 0

        with self.unified_db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            # Delete failed attachments for this creator
            cursor.execute("""
                DELETE FROM paid_content_attachments
                WHERE status = 'failed'
                AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
            """, (creator_id,))
            removed_atts = cursor.rowcount

            if removed_atts > 0:
                # Delete posts that now have zero attachments
                cursor.execute("""
                    DELETE FROM paid_content_posts
                    WHERE creator_id = ?
                    AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
                """, (creator_id,))
                removed_posts = cursor.rowcount

                # Update attachment_count on remaining posts
                cursor.execute("""
                    UPDATE paid_content_posts
                    SET attachment_count = (
                        SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
                    )
                    WHERE creator_id = ?
                """, (creator_id,))

            conn.commit()

        if removed_atts or removed_posts:
            self.log(f"Cleanup: removed {removed_atts} failed attachments, {removed_posts} empty posts", 'info')

        # Update creator post count
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
    except Exception as e:
        # Cleanup is best-effort; a failure here must not fail the sync.
        self.log(f"Error during bellazon cleanup: {e}", 'warning')
|
|
|
|
async def _download_bellazon_media(self, creator_id: int) -> Dict:
    """Download pending Bellazon images/videos via direct HTTP.

    Files are streamed in 64 KiB chunks into
    ``<base_download_path>/bellazon/<sanitized username>/<YYYY-MM-DD>/``,
    where the date is the first ten characters of the post's
    ``published_at`` (``unknown-date`` when absent).  Attachment rows move
    from ``downloading`` to ``completed`` or ``failed`` and progress is
    mirrored to the active-sync state and progress events.

    Args:
        creator_id: Database id of the creator whose attachments to fetch.

    Returns:
        Dict with the ``downloaded`` and ``failed`` counters and
        ``downloaded_file_info`` (one ``file_path``/``filename``/``source``/
        ``content_type`` entry per saved file).  All three keys are present
        on every return path, including the early exits.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    total_files = len(pending)
    self.log(f"Downloading {total_files} media files for {creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {total_files} files...',
        'total_files': total_files, 'downloaded': 0
    })
    self._emit_event('paid_content_sync_progress', {
        'creator_id': creator_id, 'username': creator['username'],
        'status': f'Downloading {total_files} files...',
        'phase': 'downloading', 'total_files': total_files, 'downloaded': 0,
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    # Only the date sub-directory varies per attachment.
    creator_dir = base_path / 'bellazon' / self._sanitize_filename(creator['username'])
    headers = BellazonClient.HEADERS

    def _mark_failed(att, message):
        """Record a failed attempt; a missing/NULL attempt counter counts as 0."""
        self.db.update_attachment_status(
            att['id'], 'failed',
            error_message=message,
            download_attempts=(att.get('download_attempts') or 0) + 1,
            last_attempt=datetime.now().isoformat()
        )

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    timeout = aiohttp.ClientTimeout(total=120)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for i, att in enumerate(pending):
            try:
                post = self.db.get_post(att['post_id'])
                if not post:
                    failed += 1
                    continue

                # Organize by date: paid_content/bellazon/{slug}/{YYYY-MM-DD}/
                date_str = (post.get('published_at') or '')[:10] or 'unknown-date'
                output_dir = creator_dir / date_str

                download_url = att.get('download_url')
                if not download_url:
                    self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
                    failed += 1
                    continue

                self._update_active_sync(creator_id, {
                    'status': f'Downloading {i + 1}/{total_files}...',
                    'phase': 'downloading',
                    'downloaded': downloaded,
                    'progress': i + 1,
                    'total_files': total_files,
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id, 'username': creator['username'],
                    'status': f'Downloading {i + 1}/{total_files}...',
                    'phase': 'downloading', 'downloaded': downloaded,
                    'progress': i + 1, 'total_files': total_files,
                })

                filename = att.get('name') or att.get('local_filename') or f"media_{i}.jpg"
                output_dir.mkdir(parents=True, exist_ok=True)
                output_path = output_dir / filename

                self.db.update_attachment_status(att['id'], 'downloading')

                # Direct HTTP download
                file_opened = False
                try:
                    async with session.get(download_url, headers=headers) as resp:
                        if resp.status != 200:
                            _mark_failed(att, f'HTTP {resp.status}')
                            failed += 1
                            continue

                        total = 0
                        async with aiofiles.open(str(output_path), 'wb') as f:
                            file_opened = True
                            async for chunk in resp.content.iter_chunked(65536):
                                await f.write(chunk)
                                total += len(chunk)
                except Exception as dl_err:
                    if file_opened:
                        # The target was already truncated by this attempt,
                        # so whatever is on disk is an incomplete file.
                        try:
                            output_path.unlink()
                        except OSError:
                            pass  # best effort; the status row records the failure
                    _mark_failed(att, str(dl_err))
                    failed += 1
                    continue

                file_path = str(output_path)
                file_size = total

                # Extract dimensions for images/videos
                file_type = att.get('file_type', 'image')
                width, height, duration = self._extract_dimensions(output_path, file_type)

                self.db.update_attachment_status(
                    att['id'], 'completed',
                    local_path=file_path,
                    local_filename=output_path.name,
                    file_size=file_size,
                    width=width, height=height, duration=duration,
                    downloaded_at=datetime.now().isoformat()
                )

                self.db.mark_post_downloaded(att['post_id'])
                self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                downloaded += 1

                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': output_path.name,
                    'source': creator['username'],
                    'content_type': file_type
                })

            except Exception as e:
                # One broken attachment must not stop the remaining downloads.
                self.log(f"Error downloading Bellazon media: {e}", 'error')
                _mark_failed(att, str(e))
                failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
# -------------------------------------------------------------------------
|
|
# BestEyeCandy
|
|
# -------------------------------------------------------------------------
|
|
|
|
async def _add_besteyecandy_creator(self, creator_id_str: str,
                                    auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Add a BestEyeCandy celeb as a creator.

    creator_id_str is in 'cid/slug' format (e.g. '800/Myleene-Klass')
    or just 'cid' if slug wasn't available; in the latter case the cid is
    also used as the slug for the profile lookup.

    Besides storing the creator, this makes sure a ``besteyecandy`` row
    exists in the ``scrapers`` table (used for cookie storage).  A failure
    of that step is logged as a warning and does not abort the add.

    Args:
        creator_id_str: ``'cid/slug'`` or bare ``'cid'``.
        auto_download: Stored as 1/0 in the ``auto_download`` column.
        download_embeds: Stored as 1/0 in the ``download_embeds`` column.

    Returns:
        ``{'success': True, 'creator': {...}}`` including the new database
        id, or ``{'success': False, 'error': ...}`` when the celeb lookup
        returned nothing.
    """
    besteyecandy = self._get_besteyecandy_client()

    # Split on the first slash only; without a slash the cid doubles as slug.
    cid, slash, celeb_slug = creator_id_str.partition('/')
    if not slash:
        celeb_slug = cid

    profile = await besteyecandy.get_profile_info(cid, celeb_slug)
    if not profile:
        return {'success': False, 'error': 'Celeb not found on BestEyeCandy'}

    # Ensure scraper row exists for cookie storage
    try:
        with self.unified_db.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute("SELECT id FROM scrapers WHERE id = ?", ('besteyecandy',))
            row_missing = not cur.fetchone()
            if row_missing:
                cur.execute(
                    "INSERT INTO scrapers (id, name, enabled) VALUES (?, ?, ?)",
                    ('besteyecandy', 'BestEyeCandy', 1)
                )
                conn.commit()
                self.log("Created besteyecandy scraper row", 'info')
    except Exception as e:
        self.log(f"Error ensuring scraper row: {e}", 'warning')

    fields = {
        'service_id': 'besteyecandy',
        'platform': 'besteyecandy',
        'creator_id': cid,
        'username': profile['username'],
        'display_name': profile['display_name'],
        'post_count': profile.get('post_count', 0),
        'bio': profile.get('celeb_url', ''),
        'auto_download': int(bool(auto_download)),
        'download_embeds': int(bool(download_embeds)),
    }

    # Store first, then echo the stored fields back with the new row id.
    created = {'id': self.db.add_creator(fields)}
    created.update(fields)
    return {'success': True, 'creator': created}
|
|
|
|
async def _sync_besteyecandy_creator(self, creator: Dict, download: bool = True,
                                     scheduled: bool = False) -> SyncResult:
    """Sync a BestEyeCandy celeb -- scrape listing pages for photo posts.

    The post ids already stored for the creator are handed to the client
    for incremental sync; returned posts and their attachments are
    upserted, the creator's post count is refreshed, pending images are
    optionally downloaded and the failed-attachment cleanup runs before
    the completion event and notification are sent.

    Args:
        creator: Creator row; ``id``, ``username``, ``display_name`` and
            ``creator_id`` are read, plus the optional ``auto_download``.
        download: When false, the media download step is skipped.
        scheduled: Forwarded to ``_send_creator_notification``.

    Returns:
        ``SyncResult`` with ``success=True`` and the counters, or
        ``success=False`` with ``error`` set when any step raised.
    """
    creator_id = creator['id']
    username = creator['username']
    self.log(f"Syncing BestEyeCandy celeb: {creator['display_name'] or username}",
             'info')

    sync_data = dict(
        username=username,
        platform='besteyecandy',
        service='besteyecandy',
        status='Fetching photo listings...',
        phase='fetching',
    )
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    try:
        client = self._get_besteyecandy_client()

        # Post ids already in the database, used for incremental sync.
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,)
            )
            known_post_ids = set()
            for row in cursor.fetchall():
                known_post_ids.add(row[0])

        def report_progress(status_msg):
            """Relay a client status line to the sync state and listeners."""
            self._update_active_sync(creator_id, {'status': status_msg})
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status_msg,
                'phase': 'fetching',
            })

        # The stored username doubles as the celeb slug.
        posts = await client.get_posts(
            creator['creator_id'], username,
            known_post_ids=known_post_ids,
            progress_callback=report_progress,
        )

        if posts:
            post_total = len(posts)
            self.log(f"Found {post_total} new photos for {username}", 'info')
            self._update_active_sync(creator_id, {
                'status': f'Processing {post_total} photos...',
                'phase': 'processing', 'total_posts': post_total
            })

        new_posts = 0
        new_attachments = 0
        for post in posts:
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue
            if is_new_post:
                new_posts += 1
            for position, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = position
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Refresh the stored post count and tell the frontend about it.
        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
        self._emit_event('paid_content_creator_updated', {
            'creator_id': creator_id,
            'post_count': current_count,
        })

        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        # Download media files
        downloaded, failed, downloaded_file_info = 0, 0, []
        if download and creator.get('auto_download', True):
            outcome = await self._download_besteyecandy_media(creator_id)
            downloaded = outcome.get('downloaded', 0)
            failed = outcome.get('failed', 0)
            downloaded_file_info = outcome.get('downloaded_file_info', [])

        # Clean up permanently failed attachments and empty posts
        self._cleanup_besteyecandy_failed(creator_id)

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': username,
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed
        })
        self._send_creator_notification(creator, new_posts, downloaded,
                                        downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        # Any failure ends the sync: drop the active-sync entry, report the
        # error to listeners and hand a failed result to the caller.
        message = str(e)
        self.log(f"Error syncing BestEyeCandy celeb {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': message
        })
        return SyncResult(success=False, error=message)
|
|
|
|
def _cleanup_besteyecandy_failed(self, creator_id: int):
    """Remove permanently failed attachments and delete posts left with zero attachments.

    Runs inside one write connection: failed attachments of the creator are
    deleted; if any were removed, posts of the creator without attachments
    are deleted too and ``attachment_count`` is recomputed on the rest.
    Afterwards the creator's stored post count is refreshed.

    Errors are logged as warnings and never propagate to the caller.

    Args:
        creator_id: Database id of the creator to clean up.
    """
    try:
        # Bound up front so the summary check below is valid even when no
        # failed attachments existed and the post cleanup was skipped.
        removed_posts = 0

        with self.unified_db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute("""
                DELETE FROM paid_content_attachments
                WHERE status = 'failed'
                AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
            """, (creator_id,))
            removed_atts = cursor.rowcount

            if removed_atts > 0:
                # Posts whose last attachment was just removed go as well.
                cursor.execute("""
                    DELETE FROM paid_content_posts
                    WHERE creator_id = ?
                    AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
                """, (creator_id,))
                removed_posts = cursor.rowcount

                # Recompute attachment_count on the posts that remain.
                cursor.execute("""
                    UPDATE paid_content_posts
                    SET attachment_count = (
                        SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
                    )
                    WHERE creator_id = ?
                """, (creator_id,))

            conn.commit()

        if removed_atts or removed_posts:
            self.log(f"Cleanup: removed {removed_atts} failed attachments, "
                     f"{removed_posts} empty posts", 'info')

        current_count = self.db.get_creator_post_count(creator_id)
        self.db.update_creator(creator_id, {'post_count': current_count})
    except Exception as e:
        # Cleanup is best-effort; a failure here must not fail the sync.
        self.log(f"Error during besteyecandy cleanup: {e}", 'warning')
|
|
|
|
async def _download_besteyecandy_media(self, creator_id: int) -> Dict:
    """Download pending BestEyeCandy images via direct HTTP with cookies.

    Requests go through a session created by the BestEyeCandy client's
    ``_create_session``.  Files are streamed in 64 KiB chunks into
    ``<base_download_path>/besteyecandy/<sanitized username>/`` and each
    successful download is followed by a two second pause.  Attachment rows
    move from ``downloading`` to ``completed`` or ``failed`` and progress is
    mirrored to the active-sync state and progress events.

    Args:
        creator_id: Database id of the creator whose attachments to fetch.

    Returns:
        Dict with the ``downloaded`` and ``failed`` counters and
        ``downloaded_file_info`` (one ``file_path``/``filename``/``source``/
        ``content_type`` entry per saved file).  All three keys are present
        on every return path, including the early exits.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    total_files = len(pending)
    self.log(f"Downloading {total_files} images for {creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {total_files} files...',
        'total_files': total_files, 'downloaded': 0
    })
    self._emit_event('paid_content_sync_progress', {
        'creator_id': creator_id, 'username': creator['username'],
        'status': f'Downloading {total_files} files...',
        'phase': 'downloading', 'total_files': total_files, 'downloaded': 0,
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    client = self._get_besteyecandy_client()
    headers = BestEyeCandyClient.HEADERS
    # Organize: paid_content/besteyecandy/{slug}/{filename}; the directory
    # is the same for every attachment, so build it once.
    output_dir = base_path / 'besteyecandy' / self._sanitize_filename(creator['username'])

    def _mark_failed(att, message):
        """Record a failed attempt; a missing/NULL attempt counter counts as 0."""
        self.db.update_attachment_status(
            att['id'], 'failed',
            error_message=message,
            download_attempts=(att.get('download_attempts') or 0) + 1,
            last_attempt=datetime.now().isoformat()
        )

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    timeout = aiohttp.ClientTimeout(total=120)
    async with client._create_session(timeout=timeout) as session:
        for i, att in enumerate(pending):
            try:
                # A missing post row is counted as a failure, but the
                # attachment's status row is left untouched.
                if not self.db.get_post(att['post_id']):
                    failed += 1
                    continue

                download_url = att.get('download_url')
                if not download_url:
                    self.db.update_attachment_status(att['id'], 'failed',
                                                     error_message='No download URL')
                    failed += 1
                    continue

                self._update_active_sync(creator_id, {
                    'status': f'Downloading {i + 1}/{total_files}...',
                    'phase': 'downloading',
                    'downloaded': downloaded,
                    'progress': i + 1,
                    'total_files': total_files,
                })
                self._emit_event('paid_content_sync_progress', {
                    'creator_id': creator_id, 'username': creator['username'],
                    'status': f'Downloading {i + 1}/{total_files}...',
                    'phase': 'downloading', 'downloaded': downloaded,
                    'progress': i + 1, 'total_files': total_files,
                })

                filename = att.get('name') or att.get('local_filename') or f"photo_{i}.jpg"
                output_dir.mkdir(parents=True, exist_ok=True)
                output_path = output_dir / filename

                self.db.update_attachment_status(att['id'], 'downloading')

                # Direct HTTP download with cookies
                file_opened = False
                try:
                    async with session.get(download_url, headers=headers) as resp:
                        if resp.status != 200:
                            _mark_failed(att, f'HTTP {resp.status}')
                            failed += 1
                            continue

                        total = 0
                        async with aiofiles.open(str(output_path), 'wb') as f:
                            file_opened = True
                            async for chunk in resp.content.iter_chunked(65536):
                                await f.write(chunk)
                                total += len(chunk)
                except Exception as dl_err:
                    if file_opened:
                        # The target was already truncated by this attempt,
                        # so whatever is on disk is an incomplete file.
                        try:
                            output_path.unlink()
                        except OSError:
                            pass  # best effort; the status row records the failure
                    _mark_failed(att, str(dl_err))
                    failed += 1
                    continue

                file_path = str(output_path)
                file_size = total

                # Extract dimensions for images
                file_type = att.get('file_type', 'image')
                width, height, duration = self._extract_dimensions(
                    output_path, file_type)

                self.db.update_attachment_status(
                    att['id'], 'completed',
                    local_path=file_path,
                    local_filename=output_path.name,
                    file_size=file_size,
                    width=width, height=height, duration=duration,
                    downloaded_at=datetime.now().isoformat()
                )

                self.db.mark_post_downloaded(att['post_id'])
                self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
                downloaded += 1

                downloaded_file_info.append({
                    'file_path': file_path,
                    'filename': output_path.name,
                    'source': creator['username'],
                    'content_type': file_type
                })

                # Rate limit: 2s between downloads
                await asyncio.sleep(2)

            except Exception as e:
                # One broken attachment must not stop the remaining downloads.
                self.log(f"Error downloading BestEyeCandy media: {e}", 'error')
                _mark_failed(att, str(e))
                failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Coppermine Gallery
|
|
# -------------------------------------------------------------------------
|
|
|
|
async def _add_coppermine_creator(self, creator_id_str: str,
                                  auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Add a Coppermine gallery as a creator.

    creator_id_str is domain/path (e.g. 'kylie-jenner.org/gallery'); the
    gallery URL is formed by prefixing it with ``https://`` and is stored
    in the ``bio`` column.

    Args:
        creator_id_str: Gallery location without scheme, stored as the
            creator id.
        auto_download: Stored as 1/0 in the ``auto_download`` column.
        download_embeds: Stored as 1/0 in the ``download_embeds`` column.

    Returns:
        ``{'success': True, 'creator': {...}}`` including the new database
        id, or ``{'success': False, 'error': ...}`` when the gallery lookup
        returned nothing.
    """
    coppermine = self._get_coppermine_client()

    # Reconstruct gallery URL from creator_id
    gallery_url = "https://" + creator_id_str

    profile = await coppermine.get_profile_info(gallery_url)
    if not profile:
        return {'success': False, 'error': 'Gallery not found or not a Coppermine gallery'}

    fields = {
        'service_id': 'coppermine',
        'platform': 'coppermine',
        'creator_id': creator_id_str,
        'username': profile['username'],
        'display_name': profile['display_name'],
        'post_count': profile.get('post_count', 0),
        'bio': gallery_url,
        'auto_download': int(bool(auto_download)),
        'download_embeds': int(bool(download_embeds)),
    }

    # Store first, then echo the stored fields back with the new row id.
    created = {'id': self.db.add_creator(fields)}
    created.update(fields)
    return {'success': True, 'creator': created}
|
|
|
|
async def _sync_coppermine_creator(self, creator: Dict, download: bool = True,
|
|
scheduled: bool = False) -> SyncResult:
|
|
"""Sync a Coppermine gallery — crawl categories/albums, upsert + download per album."""
|
|
creator_id = creator['id']
|
|
do_download = download and creator.get('auto_download', True)
|
|
self.log(f"Syncing Coppermine gallery: {creator['display_name'] or creator['username']}",
|
|
'info')
|
|
|
|
sync_data = {
|
|
'username': creator['username'],
|
|
'platform': 'coppermine',
|
|
'service': 'coppermine',
|
|
'status': 'Crawling gallery categories...',
|
|
'phase': 'fetching',
|
|
}
|
|
self._register_active_sync(creator_id, sync_data)
|
|
self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})
|
|
|
|
try:
|
|
client = self._get_coppermine_client()
|
|
|
|
# Get known post IDs for incremental sync
|
|
known_post_ids = set()
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
|
|
(creator_id,)
|
|
)
|
|
known_post_ids = {row[0] for row in cursor.fetchall()}
|
|
|
|
# Reconstruct gallery URL from bio field
|
|
gallery_url = (creator.get('bio') or '').strip()
|
|
if not gallery_url or not gallery_url.startswith('http'):
|
|
gallery_url = f"https://{creator['creator_id']}"
|
|
|
|
def progress_callback(status_msg):
|
|
self._update_active_sync(creator_id, {
|
|
'status': status_msg,
|
|
})
|
|
self._emit_event('paid_content_sync_progress', {
|
|
'creator_id': creator_id,
|
|
'username': creator['username'],
|
|
'status': status_msg,
|
|
'phase': 'fetching',
|
|
})
|
|
|
|
# Streaming state — upsert each album immediately, download via worker pool
|
|
new_posts = 0
|
|
new_attachments = 0
|
|
downloaded = 0
|
|
failed = 0
|
|
downloaded_file_info = []
|
|
base_path = Path(self.config.get('base_download_path', '/paid-content'))
|
|
headers = CoppermineClient.HEADERS
|
|
dl_timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=120)
|
|
dl_session = aiohttp.ClientSession(timeout=dl_timeout) if do_download else None
|
|
dl_queue: asyncio.Queue = asyncio.Queue()
|
|
dl_done = asyncio.Event() # signals workers to stop
|
|
|
|
async def _dl_worker():
|
|
"""Worker that pulls items from the download queue."""
|
|
nonlocal downloaded, failed
|
|
while True:
|
|
try:
|
|
att, output_dir = await asyncio.wait_for(dl_queue.get(), timeout=2.0)
|
|
except asyncio.TimeoutError:
|
|
if dl_done.is_set() and dl_queue.empty():
|
|
return
|
|
continue
|
|
|
|
try:
|
|
download_url = att.get('download_url')
|
|
if not download_url:
|
|
self.db.update_attachment_status(
|
|
att['id'], 'failed', error_message='No download URL')
|
|
failed += 1
|
|
continue
|
|
|
|
filename = att.get('name') or att.get('local_filename') or 'photo.jpg'
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
output_path = output_dir / filename
|
|
|
|
self.db.update_attachment_status(att['id'], 'downloading')
|
|
try:
|
|
async with dl_session.get(download_url, headers=headers) as resp:
|
|
if resp.status != 200:
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=f'HTTP {resp.status}',
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat())
|
|
failed += 1
|
|
continue
|
|
|
|
total = 0
|
|
async with aiofiles.open(str(output_path), 'wb') as f:
|
|
async for chunk in resp.content.iter_chunked(65536):
|
|
await f.write(chunk)
|
|
total += len(chunk)
|
|
except Exception as dl_err:
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=str(dl_err),
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat())
|
|
failed += 1
|
|
continue
|
|
|
|
file_path = str(output_path)
|
|
file_type = att.get('file_type', 'image')
|
|
width, height, duration = self._extract_dimensions(
|
|
Path(file_path), file_type)
|
|
|
|
self.db.update_attachment_status(att['id'], 'completed',
|
|
local_path=file_path,
|
|
local_filename=Path(file_path).name,
|
|
file_size=total,
|
|
width=width, height=height, duration=duration,
|
|
downloaded_at=datetime.now().isoformat())
|
|
|
|
self.db.mark_post_downloaded(att['post_id'])
|
|
self.db.increment_creator_download_stats(creator_id, 1, total or 0)
|
|
downloaded += 1
|
|
|
|
downloaded_file_info.append({
|
|
'file_path': file_path,
|
|
'filename': Path(file_path).name,
|
|
'source': creator['username'],
|
|
'content_type': file_type,
|
|
})
|
|
except Exception as e:
|
|
self.log(f"Download worker error: {e}", 'error')
|
|
finally:
|
|
dl_queue.task_done()
|
|
|
|
async def on_album(post):
|
|
nonlocal new_posts, new_attachments
|
|
|
|
# Upsert post
|
|
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
|
|
if not post_db_id:
|
|
return
|
|
|
|
if is_new_post:
|
|
new_posts += 1
|
|
|
|
# Upsert attachments
|
|
for idx, attachment in enumerate(post.attachments):
|
|
att_data = attachment.to_dict()
|
|
att_data['attachment_index'] = idx
|
|
if self.db.upsert_attachment(post_db_id, att_data):
|
|
new_attachments += 1
|
|
|
|
self._apply_auto_tag_rules(post_db_id, is_new_post)
|
|
|
|
current_count = self.db.get_creator_post_count(creator_id)
|
|
self.db.update_creator(creator_id, {'post_count': current_count})
|
|
self._emit_event('paid_content_creator_updated', {
|
|
'creator_id': creator_id,
|
|
'post_count': current_count,
|
|
})
|
|
|
|
# Queue downloads (don't block crawling)
|
|
if do_download and dl_session:
|
|
pending = self.db.get_pending_attachments_for_post(post_db_id)
|
|
if pending:
|
|
album_title = self._sanitize_filename(
|
|
post.content or post.title or f"album_{post.post_id}")
|
|
domain = self._sanitize_filename(creator['username'])
|
|
output_dir = base_path / 'coppermine' / domain / album_title
|
|
|
|
for att in pending:
|
|
await dl_queue.put((att, output_dir))
|
|
|
|
# Start download workers
|
|
workers = []
|
|
if do_download and dl_session:
|
|
workers = [asyncio.create_task(_dl_worker()) for _ in range(5)]
|
|
|
|
try:
|
|
await client.get_posts(
|
|
gallery_url,
|
|
known_post_ids=known_post_ids,
|
|
progress_callback=progress_callback,
|
|
post_callback=on_album,
|
|
)
|
|
|
|
# Signal workers that no more items will be added, then wait for queue drain
|
|
if workers:
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Finishing downloads ({dl_queue.qsize()} remaining)...',
|
|
'phase': 'downloading',
|
|
})
|
|
await dl_queue.join()
|
|
dl_done.set()
|
|
await asyncio.gather(*workers, return_exceptions=True)
|
|
finally:
|
|
dl_done.set()
|
|
if dl_session:
|
|
await dl_session.close()
|
|
|
|
self.db.update_creator(creator_id, {
|
|
'last_checked': datetime.now().isoformat(),
|
|
'post_count': self.db.get_creator_post_count(creator_id)
|
|
})
|
|
|
|
# Clean up permanently failed attachments and empty posts
|
|
self._cleanup_coppermine_failed(creator_id)
|
|
|
|
self._unregister_active_sync(creator_id)
|
|
self._emit_event('paid_content_sync_completed', {
|
|
'creator_id': creator_id, 'username': creator['username'],
|
|
'new_posts': new_posts, 'new_attachments': new_attachments,
|
|
'downloaded': downloaded, 'failed': failed
|
|
})
|
|
self._send_creator_notification(creator, new_posts, downloaded,
|
|
downloaded_file_info, scheduled=scheduled)
|
|
|
|
return SyncResult(
|
|
success=True, new_posts=new_posts, new_attachments=new_attachments,
|
|
downloaded_files=downloaded, failed_files=failed,
|
|
downloaded_file_info=downloaded_file_info
|
|
)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error syncing Coppermine gallery {creator['username']}: {e}", 'error')
|
|
self._unregister_active_sync(creator_id)
|
|
self._emit_event('paid_content_sync_error', {
|
|
'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
|
|
})
|
|
return SyncResult(success=False, error=str(e))
|
|
|
|
def _cleanup_coppermine_failed(self, creator_id: int):
|
|
"""Remove permanently failed attachments and delete posts left with zero attachments."""
|
|
try:
|
|
with self.unified_db.get_connection(for_write=True) as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute("""
|
|
DELETE FROM paid_content_attachments
|
|
WHERE status = 'failed'
|
|
AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
|
|
""", (creator_id,))
|
|
removed_atts = cursor.rowcount
|
|
|
|
if removed_atts > 0:
|
|
cursor.execute("""
|
|
DELETE FROM paid_content_posts
|
|
WHERE creator_id = ?
|
|
AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
|
|
""", (creator_id,))
|
|
removed_posts = cursor.rowcount
|
|
|
|
cursor.execute("""
|
|
UPDATE paid_content_posts
|
|
SET attachment_count = (
|
|
SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
|
|
)
|
|
WHERE creator_id = ?
|
|
""", (creator_id,))
|
|
|
|
conn.commit()
|
|
if removed_atts or removed_posts:
|
|
self.log(f"Cleanup: removed {removed_atts} failed attachments, {removed_posts} empty posts", 'info')
|
|
|
|
current_count = self.db.get_creator_post_count(creator_id)
|
|
self.db.update_creator(creator_id, {'post_count': current_count})
|
|
except Exception as e:
|
|
self.log(f"Error during coppermine cleanup: {e}", 'warning')
|
|
|
|
async def _download_coppermine_media(self, creator_id: int) -> Dict:
|
|
"""Download pending Coppermine images via direct HTTP (no auth needed)."""
|
|
creator = self.db.get_creator(creator_id)
|
|
if not creator:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
pending = self.db.get_pending_attachments(creator_id=creator_id)
|
|
if not pending:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
self.log(f"Downloading {len(pending)} images for {creator['username']}", 'info')
|
|
self._update_active_sync(creator_id, {
|
|
'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
|
|
'total_files': len(pending), 'downloaded': 0
|
|
})
|
|
self._emit_event('paid_content_sync_progress', {
|
|
'creator_id': creator_id, 'username': creator['username'],
|
|
'status': f'Downloading {len(pending)} files...',
|
|
'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
|
|
})
|
|
|
|
base_path = Path(self.config.get('base_download_path', '/paid-content'))
|
|
headers = CoppermineClient.HEADERS
|
|
|
|
downloaded = 0
|
|
failed = 0
|
|
downloaded_file_info = []
|
|
|
|
timeout = aiohttp.ClientTimeout(total=120)
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
for i, att in enumerate(pending):
|
|
try:
|
|
post = self.db.get_post(att['post_id'])
|
|
if not post:
|
|
failed += 1
|
|
continue
|
|
|
|
# Organize: /paid-content/coppermine/{domain}/{album-title}/
|
|
album_title = self._sanitize_filename(post.get('content') or post.get('title') or f"album_{post.get('post_id', 'unknown')}")
|
|
domain = self._sanitize_filename(creator['username'])
|
|
output_dir = base_path / 'coppermine' / domain / album_title
|
|
|
|
download_url = att.get('download_url')
|
|
if not download_url:
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message='No download URL')
|
|
failed += 1
|
|
continue
|
|
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Downloading {i + 1}/{len(pending)}...',
|
|
'phase': 'downloading',
|
|
'downloaded': downloaded,
|
|
'progress': i + 1,
|
|
'total_files': len(pending),
|
|
})
|
|
self._emit_event('paid_content_sync_progress', {
|
|
'creator_id': creator_id, 'username': creator['username'],
|
|
'status': f'Downloading {i + 1}/{len(pending)}...',
|
|
'phase': 'downloading', 'downloaded': downloaded,
|
|
'progress': i + 1, 'total_files': len(pending),
|
|
})
|
|
|
|
filename = att.get('name') or att.get('local_filename') or f"photo_{i}.jpg"
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
output_path = output_dir / filename
|
|
|
|
self.db.update_attachment_status(att['id'], 'downloading')
|
|
|
|
# Direct HTTP download (no auth needed)
|
|
try:
|
|
async with session.get(download_url, headers=headers) as resp:
|
|
if resp.status != 200:
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=f'HTTP {resp.status}',
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
continue
|
|
|
|
total = 0
|
|
async with aiofiles.open(str(output_path), 'wb') as f:
|
|
async for chunk in resp.content.iter_chunked(65536):
|
|
await f.write(chunk)
|
|
total += len(chunk)
|
|
except Exception as dl_err:
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=str(dl_err),
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
continue
|
|
|
|
file_path = str(output_path)
|
|
file_size = total
|
|
|
|
# Extract dimensions for images
|
|
width, height, duration = None, None, None
|
|
file_type = att.get('file_type', 'image')
|
|
if file_path:
|
|
width, height, duration = self._extract_dimensions(
|
|
Path(file_path), file_type)
|
|
|
|
self.db.update_attachment_status(att['id'], 'completed',
|
|
local_path=file_path,
|
|
local_filename=Path(file_path).name,
|
|
file_size=file_size,
|
|
width=width, height=height, duration=duration,
|
|
downloaded_at=datetime.now().isoformat()
|
|
)
|
|
|
|
self.db.mark_post_downloaded(att['post_id'])
|
|
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
|
|
downloaded += 1
|
|
|
|
if file_path:
|
|
downloaded_file_info.append({
|
|
'file_path': file_path,
|
|
'filename': Path(file_path).name,
|
|
'source': creator['username'],
|
|
'content_type': file_type
|
|
})
|
|
|
|
# Rate limit: 0.5s between downloads (no auth = lighter)
|
|
await asyncio.sleep(0.5)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading Coppermine media: {e}", 'error')
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=str(e),
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
|
|
return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
# -------------------------------------------------------------------------
|
|
# XenForo Forums (HQCelebCorner, PicturePub, etc.)
|
|
# -------------------------------------------------------------------------
|
|
|
|
async def _add_xenforo_creator(self, service_id: str, celebrity_name: str,
|
|
auto_download: bool = True, download_embeds: bool = True) -> Dict:
|
|
"""Add a XenForo forum celebrity as a creator (creator_id = celebrity name)."""
|
|
client = self._get_xenforo_client(service_id)
|
|
|
|
# Search for threads to validate the celebrity name returns results
|
|
threads = await client.search_threads(celebrity_name)
|
|
if not threads:
|
|
return {'success': False, 'error': f'No threads found for "{celebrity_name}" on {service_id}'}
|
|
|
|
# Sanitize name for username
|
|
username = re.sub(r'[^a-zA-Z0-9_-]', '-', celebrity_name.lower()).strip('-')
|
|
username = re.sub(r'-+', '-', username)
|
|
|
|
# Build search URL for re-use during sync
|
|
from urllib.parse import quote_plus
|
|
search_url = f"{client.BASE_URL}/index.php?search/&q={quote_plus(celebrity_name)}&c[title_only]=1&o=date"
|
|
|
|
creator_data = {
|
|
'service_id': service_id,
|
|
'platform': service_id,
|
|
'creator_id': celebrity_name,
|
|
'username': username,
|
|
'display_name': celebrity_name,
|
|
'post_count': len(threads),
|
|
'bio': search_url,
|
|
'auto_download': 1 if auto_download else 0,
|
|
'download_embeds': 1 if download_embeds else 0,
|
|
}
|
|
|
|
db_id = self.db.add_creator(creator_data)
|
|
self.log(f"Added {service_id} creator: {celebrity_name} ({len(threads)} threads found)", 'info')
|
|
return {'success': True, 'creator': {'id': db_id, **creator_data}}
|
|
|
|
async def _sync_xenforo_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync one XenForo forum celebrity: find new threads, resolve images, store, download.

    Steps:
      1. Load the creator's stored ``post_id`` values and search the forum
         using ``creator['creator_id']`` as the query.
      2. Scrape every thread whose ``thread_<id>`` post id is not stored yet;
         threads that are already stored are skipped (no update detection).
      3. Resolve each image link to a direct URL, drop URLs that have been
         seen three or more times so far in this run (treated as signature
         images), and upsert the post and its attachments.
      4. When downloading is enabled, a background task downloads pending
         attachments while scraping continues, followed by one final sweep.
      5. Remove failed attachments / empty posts, emit
         ``paid_content_sync_completed`` and send the creator notification.

    Args:
        creator: Creator row; reads ``id``, ``service_id``, ``creator_id``,
            ``username`` and ``auto_download``.
        download: Download attachments when true and the creator's
            ``auto_download`` flag is truthy (missing flag counts as true).
        scheduled: Passed through to ``_send_creator_notification``.

    Returns:
        A ``SyncResult`` with counts on success, or ``success=False`` carrying
        the error text when any step raises.
    """
    creator_id = creator['id']
    service_id = creator['service_id']
    celebrity_name = creator['creator_id']  # Doubles as the forum search query
    do_download = download and creator.get('auto_download', True)
    self.log(f"Syncing {service_id}: {celebrity_name}", 'info')

    sync_data = {
        'username': creator['username'],
        'platform': service_id,
        'service': service_id,
        'status': 'Searching threads...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    _dl_done = asyncio.Event()  # Set once scraping is over (or has failed)
    _dl_task = None
    try:
        client = self._get_xenforo_client(service_id)

        # Post ids already stored for this creator; those threads are skipped.
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,)
            )
            known_post_ids = {row[0] for row in cursor.fetchall()}

        # Search for threads
        threads = await client.search_threads(celebrity_name)
        if not threads:
            self.log(f"No threads found for {celebrity_name}", 'info')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(creator_id)
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self._update_active_sync(creator_id, {
            'status': f'Found {len(threads)} threads, checking for updates...',
            'posts_fetched': len(threads),
        })

        new_posts = 0
        new_attachments = 0
        threads_to_scrape = []

        for thread in threads:
            post_id = f"thread_{thread['thread_id']}"
            if post_id in known_post_ids:
                continue
            # New thread — page_count=None is handed to get_thread_images
            # unchanged; reply_count is stored as 0 for a first scrape.
            threads_to_scrape.append({
                **thread,
                'post_id': post_id,
                'is_update': False,
                'start_page': 1,
                'page_count': None,
                'reply_count': 0,
            })

        self.log(f"{len(threads_to_scrape)} threads to scrape ({len(threads)} total found)", 'info')
        self._update_active_sync(creator_id, {
            'status': f'Scraping {len(threads_to_scrape)} threads...',
            'phase': 'processing',
        })

        # Concurrent download loop — downloads while scraping continues
        _dl_results = {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

        async def _bg_download_loop():
            # Give the scraper up to 15s to queue some posts, but wake up at
            # once when scraping has already finished so a sync with nothing
            # new does not stall for the whole delay.
            try:
                await asyncio.wait_for(_dl_done.wait(), timeout=15)
            except asyncio.TimeoutError:
                pass
            while not _dl_done.is_set():
                r = await self._download_xenforo_media(creator_id, quiet=True)
                _dl_results['downloaded'] += r.get('downloaded', 0)
                _dl_results['failed'] += r.get('failed', 0)
                _dl_results['downloaded_file_info'].extend(r.get('downloaded_file_info', []))
                if r.get('downloaded', 0) == 0:
                    # Nothing was saved this round: idle up to 10s, or until
                    # scraping ends, before polling again.
                    try:
                        await asyncio.wait_for(_dl_done.wait(), timeout=10)
                    except asyncio.TimeoutError:
                        pass

        if do_download:
            _dl_task = asyncio.create_task(_bg_download_loop())

        # Count direct image URLs across the whole run to detect signatures
        from collections import Counter
        _cross_thread_urls = Counter()

        # Per-host semaphores for URL resolution, shared by all threads
        _resolve_sem: Dict[str, asyncio.Semaphore] = {}
        # Limits concurrent thread scraping (avoid hammering the forum)
        _scrape_sem = asyncio.Semaphore(3)
        _threads_done = 0
        # Serializes the upsert section between coroutines
        _db_lock = asyncio.Lock()

        def _mark_thread_processed() -> None:
            """Advance the processed-thread counter and publish it."""
            nonlocal _threads_done
            _threads_done += 1
            self._update_active_sync(creator_id, {
                'status': f'Processed {_threads_done}/{len(threads_to_scrape)} threads...',
                'phase': 'processing',
            })

        async def _resolve_one(img_info: Dict, session: aiohttp.ClientSession) -> Optional[Attachment]:
            """Turn one scraped image link into an Attachment, or None if unusable."""
            img_url = img_info['url']
            # Skip dead hosts
            if any(dh in img_url.lower() for dh in XenForoForumClient.DEAD_HOSTS):
                return None
            if img_info.get('host') == 'direct':
                direct_url = img_url
            else:
                try:
                    host = urlparse(img_url).netloc.lower()
                except ValueError:
                    host = '_default'
                if host not in _resolve_sem:
                    _resolve_sem[host] = asyncio.Semaphore(3)
                async with _resolve_sem[host]:
                    direct_url = await client.resolve_image_url(img_url, session=session)
                    # Short pause while still holding the host slot.
                    await asyncio.sleep(0.3)
            if not direct_url:
                return None
            filename = client._filename_from_url(direct_url)
            ext = client._get_extension(filename)
            _cross_thread_urls[direct_url] += 1
            return Attachment(
                name=filename,
                file_type='image' if ext in client.IMAGE_EXTS else 'unknown',
                extension=ext or None,
                server_path=img_url,
                download_url=direct_url,
            )

        async def _scrape_resolve_and_store(thread: Dict) -> None:
            """Scrape a thread, resolve image URLs, and write to DB immediately."""
            nonlocal new_posts, new_attachments
            # NOTE(review): the slot is held for the whole thread (fetch,
            # resolve, store), which bounds work to three threads at a time.
            # The original nesting was lost in a whitespace-stripped source —
            # confirm this scope is the intended one.
            async with _scrape_sem:
                image_links = await client.get_thread_images(
                    thread['url'],
                    page_count=thread['page_count'],
                    start_page=thread['start_page'],
                )

                if not image_links:
                    _mark_thread_processed()
                    return

                resolve_timeout = aiohttp.ClientTimeout(total=30)
                async with aiohttp.ClientSession(timeout=resolve_timeout) as session:
                    # Resolve in batches to avoid spawning hundreds of coroutines
                    _IMG_BATCH = 50
                    attachments = []
                    for ri in range(0, len(image_links), _IMG_BATCH):
                        img_batch = image_links[ri:ri + _IMG_BATCH]
                        results = await asyncio.gather(*[_resolve_one(img, session) for img in img_batch])
                        attachments.extend(a for a in results if a is not None)

                _mark_thread_processed()

                if not attachments:
                    return

                # Drop signature images: URLs counted three or more times so far
                attachments = [a for a in attachments if _cross_thread_urls.get(a.download_url, 0) < 3]
                if not attachments:
                    return

                post = Post(
                    post_id=thread['post_id'],
                    service_id=service_id,
                    platform=service_id,
                    creator_id=celebrity_name,
                    title=thread['title'],
                    content='',
                    published_at=thread.get('published_at'),
                )
                post.attachments = attachments

                post_data = post.to_dict()
                post_data['metadata'] = json.dumps({
                    'reply_count': thread.get('reply_count', 0),
                    'thread_id': thread['thread_id'],
                    'thread_url': thread['url'],
                })

                # Upsert post and attachments as one uninterrupted section
                async with _db_lock:
                    post_db_id, is_new_post = self.db.upsert_post(creator_id, post_data)
                    if post_db_id:
                        if is_new_post:
                            new_posts += 1
                        for att_idx, attachment in enumerate(attachments):
                            att_data = attachment.to_dict()
                            att_data['attachment_index'] = att_idx
                            if self.db.upsert_attachment(post_db_id, att_data):
                                new_attachments += 1
                        self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Scrape + resolve + store all threads concurrently
        self._update_active_sync(creator_id, {
            'status': f'Scraping {len(threads_to_scrape)} threads (3 concurrent)...',
            'phase': 'processing',
        })

        # Process threads in batches to avoid spawning thousands of coroutines
        _THREAD_BATCH = 50
        for i in range(0, len(threads_to_scrape), _THREAD_BATCH):
            batch = threads_to_scrape[i:i + _THREAD_BATCH]
            await asyncio.gather(
                *[_scrape_resolve_and_store(t) for t in batch]
            )

        # Stop concurrent download loop and collect its results
        _dl_done.set()
        if _dl_task:
            await _dl_task

        downloaded = _dl_results['downloaded']
        failed = _dl_results['failed']
        downloaded_file_info = list(_dl_results['downloaded_file_info'])

        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'post_count': self.db.get_creator_post_count(creator_id),
        })

        # Final download sweep for remaining attachments
        if do_download:
            result = await self._download_xenforo_media(creator_id)
            downloaded += result.get('downloaded', 0)
            failed += result.get('failed', 0)
            downloaded_file_info.extend(result.get('downloaded_file_info', []))

        # Clean up failed attachments and empty posts
        self._cleanup_xenforo_failed(creator_id)

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed,
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info,
        )

    except Exception as e:
        # Let the background downloader wind down before reporting failure;
        # its own errors are irrelevant at this point.
        _dl_done.set()
        if _dl_task:
            try:
                await _dl_task
            except Exception:
                pass
        self.log(f"Error syncing {service_id} {celebrity_name}: {e}", 'error')
        import traceback
        self.log(traceback.format_exc(), 'debug')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e),
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
def _cleanup_xenforo_failed(self, creator_id: int):
|
|
"""Remove permanently failed attachments and delete posts left with zero attachments."""
|
|
try:
|
|
with self.unified_db.get_connection(for_write=True) as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute("""
|
|
DELETE FROM paid_content_attachments
|
|
WHERE status = 'failed'
|
|
AND post_id IN (SELECT id FROM paid_content_posts WHERE creator_id = ?)
|
|
""", (creator_id,))
|
|
removed_atts = cursor.rowcount
|
|
|
|
if removed_atts > 0:
|
|
cursor.execute("""
|
|
DELETE FROM paid_content_posts
|
|
WHERE creator_id = ?
|
|
AND id NOT IN (SELECT DISTINCT post_id FROM paid_content_attachments)
|
|
""", (creator_id,))
|
|
removed_posts = cursor.rowcount
|
|
|
|
cursor.execute("""
|
|
UPDATE paid_content_posts
|
|
SET attachment_count = (
|
|
SELECT COUNT(*) FROM paid_content_attachments WHERE post_id = paid_content_posts.id
|
|
)
|
|
WHERE creator_id = ?
|
|
""", (creator_id,))
|
|
|
|
conn.commit()
|
|
if removed_atts or removed_posts:
|
|
self.log(f"XenForo cleanup: removed {removed_atts} failed attachments, {removed_posts} empty posts", 'info')
|
|
|
|
current_count = self.db.get_creator_post_count(creator_id)
|
|
self.db.update_creator(creator_id, {'post_count': current_count})
|
|
except Exception as e:
|
|
self.log(f"Error during XenForo cleanup: {e}", 'warning')
|
|
|
|
async def _download_xenforo_media(self, creator_id: int, quiet: bool = False) -> Dict:
|
|
"""Download pending XenForo forum images via concurrent HTTP."""
|
|
creator = self.db.get_creator(creator_id)
|
|
if not creator:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
pending = self.db.get_pending_attachments(creator_id=creator_id)
|
|
if not pending:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
service_id = creator['service_id']
|
|
self.log(f"Downloading {len(pending)} media files for {creator['display_name']}", 'info')
|
|
if not quiet:
|
|
self._update_active_sync(creator_id, {
|
|
'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
|
|
'total_files': len(pending), 'downloaded': 0,
|
|
})
|
|
self._emit_event('paid_content_sync_progress', {
|
|
'creator_id': creator_id, 'username': creator['username'],
|
|
'status': f'Downloading {len(pending)} files...',
|
|
'phase': 'downloading', 'total_files': len(pending), 'downloaded': 0,
|
|
})
|
|
|
|
base_path = Path(self.config.get('base_download_path', '/paid-content'))
|
|
celebrity_dir = self._sanitize_filename(creator['display_name'] or creator['username'])
|
|
|
|
downloaded = 0
|
|
failed = 0
|
|
completed_count = 0
|
|
downloaded_file_info = []
|
|
# Per-host semaphores to avoid overwhelming individual image hosts
|
|
_host_semaphores: Dict[str, asyncio.Semaphore] = {}
|
|
_MAX_PER_HOST = 4
|
|
# Hosts that need lower concurrency (known to reject/throttle)
|
|
_SENSITIVE_HOSTS = {'pixhost.to': 2}
|
|
|
|
def _get_host_sem(url: str) -> asyncio.Semaphore:
|
|
try:
|
|
host = urlparse(url).netloc.lower()
|
|
except Exception:
|
|
host = '_default'
|
|
# Group subdomains under base domain (e.g. img1.pixhost.to → pixhost.to)
|
|
parts = host.rsplit('.', 2)
|
|
base_domain = '.'.join(parts[-2:]) if len(parts) >= 2 else host
|
|
# Use base domain as key so all pixhost subdomains share one semaphore
|
|
sem_key = base_domain
|
|
if sem_key not in _host_semaphores:
|
|
limit = _SENSITIVE_HOSTS.get(base_domain, _MAX_PER_HOST)
|
|
_host_semaphores[sem_key] = asyncio.Semaphore(limit)
|
|
return _host_semaphores[sem_key]
|
|
|
|
async def _download_one(att: Dict, session: aiohttp.ClientSession) -> None:
|
|
nonlocal downloaded, failed, completed_count
|
|
try:
|
|
post = self.db.get_post(att['post_id'])
|
|
if not post:
|
|
failed += 1
|
|
completed_count += 1
|
|
return
|
|
|
|
thread_title = self._sanitize_filename(post.get('title') or 'unknown-thread')
|
|
output_dir = base_path / service_id / celebrity_dir / thread_title
|
|
|
|
download_url = att.get('download_url')
|
|
if not download_url:
|
|
self.db.update_attachment_status(att['id'], 'failed', error_message='No download URL')
|
|
failed += 1
|
|
completed_count += 1
|
|
return
|
|
|
|
# Skip dead hosts entirely
|
|
dl_lower = download_url.lower()
|
|
if any(dh in dl_lower for dh in XenForoForumClient.DEAD_HOSTS):
|
|
self.db.update_attachment_status(att['id'], 'failed', error_message='Dead host')
|
|
failed += 1
|
|
completed_count += 1
|
|
return
|
|
|
|
filename = att.get('name') or att.get('local_filename') or f"media_{att['id']}.jpg"
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
output_path = output_dir / filename
|
|
|
|
# Skip if already exists
|
|
if output_path.exists() and output_path.stat().st_size > 0:
|
|
self.db.update_attachment_status(att['id'], 'completed',
|
|
local_path=str(output_path),
|
|
local_filename=output_path.name,
|
|
file_size=output_path.stat().st_size,
|
|
downloaded_at=datetime.now().isoformat()
|
|
)
|
|
self.db.mark_post_downloaded(att['post_id'])
|
|
downloaded += 1
|
|
completed_count += 1
|
|
return
|
|
|
|
headers = dict(XenForoForumClient.HEADERS)
|
|
try:
|
|
host_domain = urlparse(att.get('server_path', download_url)).netloc
|
|
headers['Referer'] = f'https://{host_domain}/'
|
|
except Exception:
|
|
pass
|
|
|
|
host_sem = _get_host_sem(download_url)
|
|
|
|
# Retry up to 3 times for transient errors (connection resets, incomplete responses)
|
|
last_err = None
|
|
for attempt in range(3):
|
|
async with host_sem:
|
|
# Mark as downloading only when we actually hold the semaphore
|
|
if attempt == 0:
|
|
self.db.update_attachment_status(att['id'], 'downloading')
|
|
try:
|
|
async with session.get(download_url, headers=headers) as resp:
|
|
if resp.status != 200:
|
|
last_err = f'HTTP {resp.status}'
|
|
break # Non-transient, don't retry
|
|
|
|
total = 0
|
|
async with aiofiles.open(str(output_path), 'wb') as f:
|
|
async for chunk in resp.content.iter_chunked(65536):
|
|
await f.write(chunk)
|
|
total += len(chunk)
|
|
last_err = None
|
|
break # Success
|
|
except Exception as dl_err:
|
|
last_err = str(dl_err)
|
|
# Clean up partial file
|
|
try:
|
|
if output_path.exists():
|
|
output_path.unlink()
|
|
except Exception:
|
|
pass
|
|
# Don't retry DNS failures or permanent connection errors
|
|
err_lower = last_err.lower()
|
|
if 'name or service not known' in err_lower or 'no address associated' in err_lower:
|
|
break
|
|
if attempt < 2:
|
|
await asyncio.sleep(2 * (attempt + 1))
|
|
|
|
if last_err:
|
|
# 404/410 = permanently gone, don't show in failed queue
|
|
if last_err in ('HTTP 404', 'HTTP 410'):
|
|
status = 'gone'
|
|
else:
|
|
status = 'failed'
|
|
self.db.update_attachment_status(att['id'], status,
|
|
error_message=last_err,
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
completed_count += 1
|
|
return
|
|
|
|
file_path = str(output_path)
|
|
file_size = total
|
|
|
|
width, height, duration = None, None, None
|
|
file_type = att.get('file_type', 'image')
|
|
if file_path:
|
|
width, height, duration = self._extract_dimensions(Path(file_path), file_type)
|
|
|
|
# Detect image-host placeholder/thumbnail images and discard them.
|
|
# Real forum photos are 800px+ on the long side; anything under
|
|
# 500px is a thumbnail, placeholder, or "image removed" stub.
|
|
if file_type == 'image' and width and height:
|
|
max_dim = max(width, height)
|
|
if max_dim < 500:
|
|
try:
|
|
output_path.unlink(missing_ok=True)
|
|
except Exception:
|
|
pass
|
|
self.db.update_attachment_status(att['id'], 'gone',
|
|
error_message=f'Thumbnail/placeholder ({width}x{height})')
|
|
failed += 1
|
|
completed_count += 1
|
|
return
|
|
|
|
self.db.update_attachment_status(att['id'], 'completed',
|
|
local_path=file_path,
|
|
local_filename=Path(file_path).name,
|
|
file_size=file_size,
|
|
width=width, height=height, duration=duration,
|
|
downloaded_at=datetime.now().isoformat()
|
|
)
|
|
|
|
self.db.mark_post_downloaded(att['post_id'])
|
|
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
|
|
downloaded += 1
|
|
completed_count += 1
|
|
|
|
if file_path:
|
|
downloaded_file_info.append({
|
|
'file_path': file_path,
|
|
'filename': Path(file_path).name,
|
|
'source': creator['display_name'] or creator['username'],
|
|
'content_type': file_type,
|
|
})
|
|
|
|
if not quiet:
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Downloaded {downloaded}/{len(pending)}...',
|
|
'phase': 'downloading',
|
|
'downloaded': downloaded,
|
|
'progress': completed_count,
|
|
'total_files': len(pending),
|
|
})
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading XenForo media: {e}", 'error')
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=str(e),
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
completed_count += 1
|
|
|
|
# Process in batches to avoid spawning tens of thousands of coroutines
|
|
_BATCH_SIZE = 50
|
|
timeout = aiohttp.ClientTimeout(total=120)
|
|
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
for i in range(0, len(pending), _BATCH_SIZE):
|
|
batch = pending[i:i + _BATCH_SIZE]
|
|
tasks = [_download_one(att, session) for att in batch]
|
|
await asyncio.gather(*tasks)
|
|
|
|
return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Snapchat
|
|
# -------------------------------------------------------------------------
|
|
|
|
async def _add_snapchat_creator(self, username: str,
                                auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register a Snapchat account as a tracked creator.

    The profile is looked up once through the Snapchat client (in a worker
    thread). A creator row is stored and, when the profile exposes an avatar
    URL that can be cached, the stored URL is replaced by the cached one.

    Args:
        username: Snapchat handle; stored as the creator's ``creator_id``.
        auto_download: Persisted as 1/0 in ``auto_download``.
        download_embeds: Persisted as 1/0 in ``download_embeds``.

    Returns:
        ``{'success': True, 'creator': {...}}`` on success, or
        ``{'success': False, 'error': ...}`` when the lookup returned nothing.
    """
    snap = self._get_snapchat_client()

    # One lookup feeds both the DB row and the avatar caching below.
    profile = await asyncio.to_thread(snap.get_creator_info, username)
    if not profile:
        return {'success': False, 'error': 'Snapchat creator not found'}

    avatar_url = profile.get('profile_image_url')
    record = {
        'service_id': 'snapchat',
        'platform': 'snapchat',
        'creator_id': username,
        'username': profile.get('creator_name', username),
        'display_name': profile.get('creator_name'),
        'profile_image_url': avatar_url,
        'auto_download': int(bool(auto_download)),
        'download_embeds': int(bool(download_embeds)),
    }

    row_id = self.db.add_creator(record)

    # Prefer the locally cached avatar over the remote URL when caching works.
    if avatar_url:
        local_avatar = await self._cache_profile_image(avatar_url, 'snapchat', username, 'avatar')
        if local_avatar:
            self.db.update_creator(row_id, {'profile_image_url': local_avatar})
            record['profile_image_url'] = local_avatar

    return {'success': True, 'creator': {'id': row_id, **record}}
|
|
|
|
async def _sync_snapchat_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync one Snapchat creator: fetch collections, store them, optionally download.

    Steps: refresh the stored profile (best effort), fetch posts newer than
    the cutoff, upsert posts / attachments / tags, update the creator row and
    finally download pending files when ``download`` is true and the creator
    has ``auto_download`` enabled.

    Args:
        creator: Creator row; ``id``, ``username`` and ``creator_id`` are read.
        download: When False only metadata is synced.
        scheduled: When True the cutoff is three days before now instead of
            the creator's ``last_post_date``.

    Returns:
        ``SyncResult`` with the counters, or ``success=False`` plus the error
        text when anything outside the profile refresh raised.
    """
    cid = creator['id']
    self.log(f"Syncing Snapchat creator: {creator['username']}", 'info')

    initial_state = {
        'username': creator['username'],
        'platform': 'snapchat',
        'service': 'snapchat',
        'status': 'Fetching snaps...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(cid, initial_state)
    self._emit_event('paid_content_sync_started', {'creator_id': cid, **initial_state})

    try:
        snap = self._get_snapchat_client()
        handle = creator['creator_id']

        # Profile refresh is best effort: a failure is logged and the sync goes on.
        try:
            profile = await asyncio.to_thread(snap.get_creator_info, handle)
            if profile:
                changes = {}
                shown_name = profile.get('creator_name')
                if shown_name and shown_name != handle:
                    changes['display_name'] = shown_name
                avatar_url = profile.get('profile_image_url')
                if avatar_url:
                    local_avatar = await self._cache_profile_image(avatar_url, 'snapchat', handle, 'avatar')
                    changes['profile_image_url'] = local_avatar or avatar_url
                if changes:
                    self.db.update_creator(cid, changes)
                    self.log(f"Updated Snapchat profile for @{handle}: {list(changes.keys())}", 'debug')
        except Exception as exc:
            self.log(f"Failed to update Snapchat profile: {exc}", 'debug')

        # Cutoff: fixed three-day window for scheduled runs, otherwise resume
        # from the newest post already recorded for this creator.
        from datetime import timedelta
        cutoff = ((datetime.now() - timedelta(days=3)).isoformat()
                  if scheduled else creator.get('last_post_date'))

        self._update_active_sync(cid, {
            'status': 'Fetching spotlights and highlights...',
            'phase': 'fetching'
        })

        posts = await asyncio.to_thread(snap.get_posts, handle, cutoff)

        if not posts:
            # Nothing new: stamp the check time and report an empty, successful sync.
            self.db.update_creator(cid, {'last_checked': datetime.now().isoformat()})
            self._unregister_active_sync(cid)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': cid, 'username': creator['username'],
                'new_posts': 0, 'new_attachments': 0, 'downloaded': 0, 'failed': 0
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0)

        self.log(f"Found {len(posts)} collections for @{handle}", 'info')
        self._update_active_sync(cid, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing', 'total_posts': len(posts)
        })

        post_total = 0
        attachment_total = 0
        for post in posts:
            row_id, created = self.db.upsert_post(cid, post.to_dict())
            if not row_id:
                continue
            if created:
                post_total += 1

            for position, attachment in enumerate(post.attachments):
                payload = attachment.to_dict()
                payload['attachment_index'] = position
                if self.db.upsert_attachment(row_id, payload):
                    attachment_total += 1

            # Auto tags (e.g. "Spotlight", "Highlight") are attached to new posts only.
            if created and post.auto_tags:
                for tag_name in post.auto_tags:
                    known_tag = self.db.get_tag_by_slug(tag_name.lower().replace(' ', '-'))
                    if known_tag:
                        tag_id = known_tag['id']
                    else:
                        tag_id = self.db.create_tag(
                            tag_name,
                            color='#eab308' if tag_name == 'Spotlight' else '#8b5cf6',
                            description=f'Snapchat {tag_name}')
                    if tag_id:
                        self.db.add_tag_to_post(row_id, tag_id)

            self._apply_auto_tag_rules(row_id, created)

        dated = [p.published_at for p in posts if p.published_at]
        newest = max(dated, default=None)
        self.db.update_creator(cid, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': newest or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(cid)
        })

        fetch_stats = {}
        if download and creator.get('auto_download', True):
            fetch_stats = await self._download_snapchat_posts(cid)
        files_ok = fetch_stats.get('downloaded', 0)
        files_bad = fetch_stats.get('failed', 0)
        file_records = fetch_stats.get('downloaded_file_info', [])

        self._unregister_active_sync(cid)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': cid, 'username': creator['username'],
            'new_posts': post_total, 'new_attachments': attachment_total,
            'downloaded': files_ok, 'failed': files_bad
        })
        self._send_creator_notification(creator, post_total, files_ok, file_records, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=post_total, new_attachments=attachment_total,
            downloaded_files=files_ok, failed_files=files_bad, downloaded_file_info=file_records
        )

    except Exception as exc:
        self.log(f"Error syncing Snapchat creator @{creator['username']}: {exc}", 'error')
        self._unregister_active_sync(cid)
        self._emit_event('paid_content_sync_error', {
            'creator_id': cid, 'username': creator['username'], 'error': str(exc)
        })
        return SyncResult(success=False, error=str(exc))
|
|
|
|
async def _download_snapchat_posts(self, creator_id: int) -> Dict:
    """Download every pending Snapchat attachment of one creator.

    Files are fetched one at a time through the Snapchat client's
    ``download_snap`` (run in a worker thread) into
    ``<base_download_path>/snapchat/<username>/<YYYY-MM-DD>/``. After a
    successful download the attachment row is completed with size,
    dimensions, duration and (for videos) a thumbnail.

    Args:
        creator_id: Database id of the creator.

    Returns:
        Dict with ``downloaded`` and ``failed`` counts plus
        ``downloaded_file_info`` (one entry per stored file). The key is
        always present, including when there is nothing to do.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    pending = self.db.get_pending_attachments(creator_id=creator_id)
    if not pending:
        return {'downloaded': 0, 'failed': 0, 'downloaded_file_info': []}

    client = self._get_snapchat_client()

    self.log(f"Downloading {len(pending)} Snapchat files for @{creator['username']}", 'info')
    self._update_active_sync(creator_id, {
        'phase': 'downloading', 'status': f'Downloading {len(pending)} files...',
        'total_files': len(pending), 'downloaded': 0
    })

    base_path = Path(self.config.get('base_download_path', '/paid-content'))

    downloaded = 0
    failed = 0
    downloaded_file_info = []

    for i, att in enumerate(pending):
        try:
            post = self.db.get_post(att['post_id'])
            if not post:
                # Record the reason on the attachment row; previously this case
                # was only counted, leaving the row untouched with no trace.
                self.db.update_attachment_status(att['id'], 'failed', error_message='Post not found')
                failed += 1
                continue

            published_at = post.get('published_at') or ''
            post_date = published_at[:10] if published_at else 'unknown-date'
            output_dir = base_path / 'snapchat' / self._sanitize_filename(creator['username']) / post_date

            download_url = att.get('download_url')
            if not download_url:
                self.db.update_attachment_status(att['id'], 'failed', error_message='No media URL')
                failed += 1
                continue

            self._update_active_sync(creator_id, {
                'status': f'Downloading file {i + 1}/{len(pending)}...',
                'downloaded': downloaded
            })

            # The fallback name is keyed on the attachment id, not the loop
            # index: the index restarts on every run, so unnamed files from
            # different runs could overwrite each other in the same date folder.
            filename = att.get('name') or f"snap_{att['id']}.mp4"
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = str(output_dir / filename)

            self.db.update_attachment_status(att['id'], 'downloading')

            success = await asyncio.to_thread(client.download_snap, download_url, output_path)

            if not success:
                self.db.update_attachment_status(
                    att['id'], 'failed',
                    error_message='Download failed',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue

            fp = Path(output_path)
            if not fp.exists() or fp.stat().st_size == 0:
                self.db.update_attachment_status(
                    att['id'], 'failed',
                    error_message='Empty file',
                    download_attempts=att.get('download_attempts', 0) + 1,
                    last_attempt=datetime.now().isoformat()
                )
                failed += 1
                continue

            f_size = fp.stat().st_size
            ext = fp.suffix.lower()
            file_type = att.get('file_type', 'video')

            # Anything that is not a video is probed as an image; only videos
            # carry a duration and get a thumbnail.
            dur = None
            thumb_data = None
            if file_type == 'video':
                w, h, dur = self._extract_dimensions(fp, 'video')
                thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
            else:
                w, h, _ = self._extract_dimensions(fp, 'image')

            self.db.update_attachment_status(
                att['id'], 'completed',
                local_path=str(fp),
                local_filename=fp.name,
                name=fp.name,
                extension=ext,
                file_size=f_size,
                file_type=file_type,
                width=w, height=h, duration=dur,
                thumbnail_data=thumb_data,
                downloaded_at=datetime.now().isoformat()
            )

            self.db.mark_post_downloaded(att['post_id'])
            self.db.increment_creator_download_stats(creator_id, 1, f_size)
            downloaded += 1

            downloaded_file_info.append({
                'file_path': str(fp),
                'filename': fp.name,
                'source': creator['username'],
                'content_type': file_type
            })

        except Exception as e:
            # One bad file must not abort the remaining downloads.
            self.log(f"Error downloading Snapchat file: {e}", 'error')
            self.db.update_attachment_status(
                att['id'], 'failed',
                error_message=str(e),
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            failed += 1

    return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
# =========================================================================
|
|
# REDDIT METHODS
|
|
# =========================================================================
|
|
|
|
async def _add_reddit_creator(self, subreddit: str,
                              auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Register a subreddit as a tracked creator.

    Subreddit info is fetched through the Reddit client (in a worker
    thread), a creator row is stored, and the avatar and banner URLs are
    replaced by locally cached copies whenever caching succeeds.

    Args:
        subreddit: Subreddit name; used for fallbacks when the info lacks
            ``creator_id`` / ``creator_name``.
        auto_download: Persisted as 1/0 in ``auto_download``.
        download_embeds: Persisted as 1/0 in ``download_embeds``.

    Returns:
        ``{'success': True, 'creator': {...}}`` on success, or
        ``{'success': False, 'error': ...}`` when no info was returned.
    """
    reddit = self._get_reddit_client()

    info = await asyncio.to_thread(reddit.get_subreddit_info, subreddit)
    if not info:
        return {'success': False, 'error': 'Could not access subreddit'}

    record = {
        'service_id': 'reddit',
        'platform': 'reddit',
        'creator_id': info.get('creator_id', subreddit.lower()),
        'username': info.get('creator_name', f'r/{subreddit}'),
        'display_name': info.get('display_name') or info.get('creator_name', f'r/{subreddit}'),
        'profile_image_url': info.get('profile_image_url'),
        'banner_image_url': info.get('banner_image_url'),
        'bio': info.get('bio'),
        'joined_date': info.get('joined_date'),
        'auto_download': int(bool(auto_download)),
        'download_embeds': int(bool(download_embeds)),
    }

    row_id = self.db.add_creator(record)

    # Avatar first, then banner: cache each remote image and point both the
    # DB row and the returned record at the cached copy when one is produced.
    for url_field, image_kind in (('profile_image_url', 'avatar'),
                                  ('banner_image_url', 'banner')):
        remote_url = info.get(url_field)
        if not remote_url:
            continue
        local_copy = await self._cache_profile_image(remote_url, 'reddit', record['creator_id'], image_kind)
        if local_copy:
            self.db.update_creator(row_id, {url_field: local_copy})
            record[url_field] = local_copy

    return {'success': True, 'creator': {'id': row_id, **record}}
|
|
|
|
async def _sync_reddit_creator(self, creator: Dict, download: bool = True, scheduled: bool = False,
                               force_backfill: bool = False) -> SyncResult:
    """Sync a Reddit subreddit - fetch posts and download media via gallery-dl.

    Processes files incrementally as gallery-dl downloads them — posts and
    attachments appear in the UI progressively instead of waiting for the
    entire download to finish.

    Args:
        creator: Creator row; ``id``, ``creator_id`` (the subreddit) and
            ``username`` are read.
        download: Not consulted by this method; media is always fetched as
            part of the gallery-dl run.
        scheduled: When True the cutoff is three days before now and at most
            500 posts are requested; otherwise the creator's
            ``last_post_date`` is the cutoff and no post limit is passed.
        force_backfill: Run the Pullpush backfill even when this is not the
            first sync (never on scheduled runs).

    Returns:
        ``SyncResult`` with the counters, or ``success=False`` plus the error
        text when the sync raised.
    """
    creator_id = creator['id']
    subreddit = creator['creator_id']
    self.log(f"Syncing Reddit subreddit: r/{subreddit}", 'info')

    sync_data = {
        'username': creator['username'],
        'platform': 'reddit',
        'service': 'reddit',
        'status': 'Starting...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {'creator_id': creator_id, **sync_data})

    # Created inside the try block: if mkdtemp fails, the handler below still
    # unregisters the active sync and emits the error event.
    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
        client = self._get_reddit_client()

        # Determine date cutoff
        from datetime import timedelta
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')

        # Update profile info (icon, banner, bio, joined_date, display_name).
        # Best effort: a failure is logged and the sync continues.
        try:
            profile_info = await asyncio.to_thread(client.get_subreddit_info, subreddit)
            if profile_info:
                profile_updates = {}
                if profile_info.get('profile_image_url'):
                    cached = await self._cache_profile_image(profile_info['profile_image_url'], 'reddit', subreddit, 'avatar')
                    profile_updates['profile_image_url'] = cached or profile_info['profile_image_url']
                if profile_info.get('banner_image_url'):
                    cached = await self._cache_profile_image(profile_info['banner_image_url'], 'reddit', subreddit, 'banner')
                    profile_updates['banner_image_url'] = cached or profile_info['banner_image_url']
                if profile_info.get('display_name'):
                    profile_updates['display_name'] = profile_info['display_name']
                if profile_info.get('bio'):
                    profile_updates['bio'] = profile_info['bio']
                if profile_info.get('joined_date'):
                    profile_updates['joined_date'] = profile_info['joined_date']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
        except Exception as e:
            self.log(f"Failed to update Reddit profile: {e}", 'debug')

        self._update_active_sync(creator_id, {
            'status': 'Connecting to Reddit...',
            'phase': 'fetching'
        })

        # Shared counters (updated from gallery-dl thread)
        new_posts = 0
        new_attachments = 0
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        base_path = Path(self.config.get('base_download_path', '/paid-content'))
        latest_post_date = None

        def _on_progress(dl_count, skip_count, total_seen):
            """Publish gallery-dl's running totals to the active-sync status."""
            status = f'Fetching — {dl_count} downloaded'
            if skip_count:
                status += f', {skip_count} skipped'
            status += f' ({total_seen} total)'
            if downloaded > 0:
                status += f' | {downloaded} processed'
            self._update_active_sync(creator_id, {
                'status': status,
                'phase': 'fetching',
                'total_files': total_seen,
                'downloaded': downloaded,
            })

        def _on_batch(files):
            """Process a batch of downloaded files — runs in gallery-dl thread."""
            nonlocal new_posts, new_attachments, downloaded, failed, latest_post_date

            self.log(f"Processing batch of {len(files)} files from r/{subreddit}", 'debug')

            # Group files by post using JSON sidecars
            grouped = client._group_files_by_post(files, temp_dir, subreddit)
            self.log(f"Grouped into {len(grouped)} posts", 'debug')

            for post_id, post_data in grouped.items():
                post_dict = {
                    'post_id': post_id,
                    'title': post_data.get('title'),
                    'content': post_data.get('title'),
                    'published_at': post_data.get('date'),
                }

                post_db_id, is_new_post = self.db.upsert_post(creator_id, post_dict)
                if not post_db_id:
                    continue
                if is_new_post:
                    new_posts += 1
                self._apply_auto_tag_rules(post_db_id, is_new_post)

                # Track latest date
                if post_data.get('date'):
                    if not latest_post_date or post_data['date'] > latest_post_date:
                        latest_post_date = post_data['date']

                post_date = (post_data.get('date') or '')[:10] or 'unknown-date'
                output_dir = base_path / 'reddit' / self._sanitize_filename(subreddit) / post_date

                for idx, file_path in enumerate(post_data['files']):
                    try:
                        output_dir.mkdir(parents=True, exist_ok=True)
                        final_path = output_dir / file_path.name
                        already_existed = False

                        if final_path.exists() and final_path.stat().st_size > 0:
                            # File already at final path — skip the move but still create attachment record
                            file_path.unlink(missing_ok=True)
                            already_existed = True
                        elif final_path.exists():
                            # Zero-byte file at final path — give unique name
                            stem = final_path.stem
                            suffix = final_path.suffix
                            counter = 1
                            while final_path.exists():
                                final_path = output_dir / f"{stem}_{counter}{suffix}"
                                counter += 1

                        if not already_existed:
                            shutil.move(str(file_path), str(final_path))

                        fp = final_path
                        f_size = fp.stat().st_size if fp.exists() else 0
                        ext = fp.suffix.lower()
                        file_type = RedditClient._detect_file_type(ext)

                        w, h, dur = None, None, None
                        if file_type == 'video':
                            w, h, dur = self._extract_dimensions(fp, 'video')
                        elif file_type == 'image':
                            w, h, _ = self._extract_dimensions(fp, 'image')

                        thumb_data = None
                        if file_type == 'video':
                            thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')

                        att_data = {
                            'name': fp.name,
                            'server_path': str(fp),
                            'file_type': file_type,
                            'extension': ext,
                            'file_size': f_size,
                            'attachment_index': idx,
                        }
                        att_id = self.db.upsert_attachment(post_db_id, att_data)
                        if att_id:
                            self.db.update_attachment_status(
                                att_id, 'completed',
                                local_path=str(fp),
                                local_filename=fp.name,
                                name=fp.name,
                                extension=ext,
                                file_size=f_size,
                                file_type=file_type,
                                width=w, height=h, duration=dur,
                                thumbnail_data=thumb_data,
                                downloaded_at=datetime.now().isoformat()
                            )

                        # Stats and notification entries only for files that
                        # were actually moved into place during this run.
                        if not already_existed:
                            self.db.increment_creator_download_stats(creator_id, 1, f_size)
                            downloaded_file_info.append({
                                'file_path': str(fp),
                                'filename': fp.name,
                                'source': creator['username'],
                                'content_type': file_type
                            })

                        self.db.mark_post_downloaded(post_db_id)
                        downloaded += 1
                        new_attachments += 1

                    except Exception as e:
                        self.log(f"Error processing Reddit file {file_path.name}: {e}", 'error')
                        failed += 1

        # First sync (no last_post_date) = unlimited; scheduled = 500 recent
        is_first_sync = not creator.get('last_post_date')
        max_posts = 500 if scheduled else 0

        # Run gallery-dl with incremental batch processing.
        # NOTE(review): the trailing 50 is presumably the batch size — confirm
        # against RedditClient.run_gallery_dl.
        result = await asyncio.to_thread(
            client.run_gallery_dl, subreddit, temp_dir, since_date, max_posts,
            _on_progress, _on_batch, 50)

        skip_total = result.get('skip_count', 0)

        # Pullpush historical backfill on first sync or when forced
        if (is_first_sync or force_backfill) and not scheduled:
            try:
                backfill_counters = {
                    'new_posts': 0, 'new_attachments': 0,
                    'downloaded': 0, 'failed': 0, 'latest_post_date': None
                }
                await self._backfill_reddit_pullpush(
                    creator_id, subreddit, base_path, creator,
                    temp_dir, client, downloaded_file_info, backfill_counters)
                new_posts += backfill_counters['new_posts']
                new_attachments += backfill_counters['new_attachments']
                downloaded += backfill_counters['downloaded']
                failed += backfill_counters['failed']
                if backfill_counters['latest_post_date']:
                    if not latest_post_date or backfill_counters['latest_post_date'] > latest_post_date:
                        latest_post_date = backfill_counters['latest_post_date']
            except Exception as e:
                # A failed backfill does not fail the sync as a whole.
                self.log(f"Pullpush backfill failed for r/{subreddit}: {e}", 'error')
                import traceback
                self.log(traceback.format_exc(), 'debug')

        # Post-sync sweep: find files on disk without attachment records
        sweep_count = self._sweep_reddit_missing_attachments(
            creator_id, subreddit, base_path, downloaded_file_info, creator)
        if sweep_count > 0:
            self.log(f"Sweep found {sweep_count} orphaned files for r/{subreddit}", 'info')
            new_attachments += sweep_count
            downloaded += sweep_count

        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id, 'username': creator['username'],
            'new_posts': new_posts, 'new_attachments': new_attachments,
            'downloaded': downloaded, 'failed': failed,
            'skipped': skip_total
        })
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True, new_posts=new_posts, new_attachments=new_attachments,
            downloaded_files=downloaded, failed_files=failed, downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing Reddit r/{subreddit}: {e}", 'error')
        import traceback
        self.log(traceback.format_exc(), 'debug')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id, 'username': creator['username'], 'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
    finally:
        # temp_dir is still None when mkdtemp itself failed.
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
async def _backfill_reddit_pullpush(self, creator_id: int, subreddit: str,
                                    base_path: Path, creator: Dict,
                                    temp_dir: str, client,
                                    downloaded_file_info: list,
                                    counters: dict):
    """Backfill historical posts from Pullpush (Pushshift) archive.

    Called after the initial gallery-dl sync on first-time syncs to fetch
    posts older than what Reddit's listing API returns (~1000 posts).

    Phases: fetch post ids from Pullpush, drop ids already stored for this
    creator, write the remaining post URLs to a file in ``temp_dir``, then
    run gallery-dl on that file and store the files batch by batch.

    Args:
        creator_id: Database id of the creator.
        subreddit: Subreddit name used for URLs and the output folder.
        base_path: Root download directory.
        creator: Creator row; ``username`` is used as the notification source.
        temp_dir: Scratch directory for the URL list and gallery-dl output.
        client: Reddit client providing ``get_pullpush_post_ids``,
            ``run_gallery_dl_urls`` and ``_group_files_by_post``.
        downloaded_file_info: List extended in place with stored files.
        counters: Mutable dict with keys new_posts, new_attachments, downloaded,
            failed, latest_post_date — updated in place.
    """
    self.log(f"Starting Pullpush backfill for r/{subreddit}", 'info')

    # Phase 1: Fetch all post IDs from Pullpush
    self._update_active_sync(creator_id, {
        'status': 'Backfill — fetching post list from archive...',
        'phase': 'backfill'
    })

    def _pp_progress(count, msg):
        """Relay the client's progress message to the active-sync status."""
        self._update_active_sync(creator_id, {
            'status': f'Backfill — {msg}',
            'phase': 'backfill'
        })

    pp_posts = await asyncio.to_thread(
        client.get_pullpush_post_ids, subreddit,
        progress_callback=_pp_progress)

    if not pp_posts:
        self.log(f"Pullpush returned no posts for r/{subreddit}", 'info')
        return

    self.log(f"Pullpush returned {len(pp_posts)} post IDs for r/{subreddit}", 'info')

    # Phase 2: Filter out post IDs already in our DB
    self._update_active_sync(creator_id, {
        'status': f'Backfill — filtering {len(pp_posts)} posts against DB...',
        'phase': 'backfill'
    })

    try:
        with self.db.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT post_id FROM paid_content_posts WHERE creator_id = ?",
                (creator_id,))
            existing_post_ids = {row[0] for row in cursor.fetchall()}
    except Exception as e:
        # Without the existing ids every post would look new, so give up.
        self.log(f"Error fetching existing post IDs for backfill: {e}", 'error')
        return

    # Build metadata lookup and filter new posts
    pp_metadata = {pp['id']: pp for pp in pp_posts}
    new_pp_posts = [pp for pp in pp_posts if pp['id'] not in existing_post_ids]

    self.log(f"Backfill: {len(new_pp_posts)} new posts out of {len(pp_posts)} total "
             f"({len(existing_post_ids)} already in DB)", 'info')

    if not new_pp_posts:
        self._update_active_sync(creator_id, {
            'status': 'Backfill — no new historical posts to fetch',
            'phase': 'backfill'
        })
        return

    # Phase 3: Write URLs to temp file
    urls_file = os.path.join(temp_dir, 'backfill_urls.txt')
    with open(urls_file, 'w', encoding='utf-8') as f:
        f.writelines(
            f"https://www.reddit.com/r/{subreddit}/comments/{pp['id']}/\n"
            for pp in new_pp_posts)

    self._update_active_sync(creator_id, {
        'status': f'Backfill — downloading {len(new_pp_posts)} historical posts...',
        'phase': 'backfill'
    })

    # Phase 4: Run gallery-dl with --input-file using same batch processing
    def _on_backfill_progress(dl_count, skip_count, total_seen):
        """Publish gallery-dl's running totals to the active-sync status."""
        status = f'Backfill — {dl_count} downloaded'
        if skip_count:
            status += f', {skip_count} skipped'
        status += f' ({total_seen}/{len(new_pp_posts)} posts)'
        if counters['downloaded'] > 0:
            status += f' | {counters["downloaded"]} processed'
        self._update_active_sync(creator_id, {
            'status': status,
            'phase': 'backfill',
        })

    def _on_backfill_batch(files):
        """Process a batch of backfilled files — same logic as normal sync."""
        self.log(f"Backfill batch: {len(files)} files from r/{subreddit}", 'debug')

        grouped = client._group_files_by_post(files, temp_dir, subreddit)
        self.log(f"Backfill grouped into {len(grouped)} posts", 'debug')

        for post_id, post_data in grouped.items():
            # Use Pullpush metadata as fallback if gallery-dl sidecar is missing
            pp_meta = pp_metadata.get(post_id, {})
            title = post_data.get('title') or pp_meta.get('title', '')
            pub_date = post_data.get('date')
            if not pub_date and pp_meta.get('created_utc'):
                try:
                    # float() also accepts numeric strings. A malformed or
                    # out-of-range value leaves the post undated instead of
                    # raising out of the batch callback.
                    # NOTE(review): this yields naive local time, as before;
                    # confirm whether the sidecar dates are UTC.
                    pub_date = datetime.fromtimestamp(float(pp_meta['created_utc'])).isoformat()
                except (TypeError, ValueError, OverflowError, OSError):
                    pass

            post_dict = {
                'post_id': post_id,
                'title': title,
                'content': title,
                'published_at': pub_date,
            }

            post_db_id, is_new_post = self.db.upsert_post(creator_id, post_dict)
            if not post_db_id:
                continue
            if is_new_post:
                counters['new_posts'] += 1
            self._apply_auto_tag_rules(post_db_id, is_new_post)

            if pub_date:
                if not counters['latest_post_date'] or pub_date > counters['latest_post_date']:
                    counters['latest_post_date'] = pub_date

            post_date = (pub_date or '')[:10] or 'unknown-date'
            output_dir = base_path / 'reddit' / self._sanitize_filename(subreddit) / post_date

            for idx, file_path in enumerate(post_data['files']):
                try:
                    output_dir.mkdir(parents=True, exist_ok=True)
                    final_path = output_dir / file_path.name
                    already_existed = False

                    if final_path.exists() and final_path.stat().st_size > 0:
                        # Non-empty file already in place: drop the temp copy
                        # but still (re)create the attachment record below.
                        file_path.unlink(missing_ok=True)
                        already_existed = True
                    elif final_path.exists():
                        # Zero-byte file occupies the name: pick a free one.
                        stem = final_path.stem
                        suffix = final_path.suffix
                        counter = 1
                        while final_path.exists():
                            final_path = output_dir / f"{stem}_{counter}{suffix}"
                            counter += 1

                    if not already_existed:
                        shutil.move(str(file_path), str(final_path))

                    fp = final_path
                    f_size = fp.stat().st_size if fp.exists() else 0
                    ext = fp.suffix.lower()
                    file_type = RedditClient._detect_file_type(ext)

                    w, h, dur = None, None, None
                    if file_type == 'video':
                        w, h, dur = self._extract_dimensions(fp, 'video')
                    elif file_type == 'image':
                        w, h, _ = self._extract_dimensions(fp, 'image')

                    thumb_data = None
                    if file_type == 'video':
                        thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')

                    att_data = {
                        'name': fp.name,
                        'server_path': str(fp),
                        'file_type': file_type,
                        'extension': ext,
                        'file_size': f_size,
                        'attachment_index': idx,
                    }
                    att_id = self.db.upsert_attachment(post_db_id, att_data)
                    if att_id:
                        self.db.update_attachment_status(
                            att_id, 'completed',
                            local_path=str(fp),
                            local_filename=fp.name,
                            name=fp.name,
                            extension=ext,
                            file_size=f_size,
                            file_type=file_type,
                            width=w, height=h, duration=dur,
                            thumbnail_data=thumb_data,
                            downloaded_at=datetime.now().isoformat()
                        )

                    # Stats and notification entries only for files that
                    # were actually moved into place during this run.
                    if not already_existed:
                        self.db.increment_creator_download_stats(creator_id, 1, f_size)
                        downloaded_file_info.append({
                            'file_path': str(fp),
                            'filename': fp.name,
                            'source': creator['username'],
                            'content_type': file_type
                        })

                    self.db.mark_post_downloaded(post_db_id)
                    counters['downloaded'] += 1
                    counters['new_attachments'] += 1

                except Exception as e:
                    self.log(f"Error processing backfill file {file_path.name}: {e}", 'error')
                    counters['failed'] += 1

    # NOTE(review): the trailing 50 is presumably the batch size — confirm
    # against RedditClient.run_gallery_dl_urls.
    result = await asyncio.to_thread(
        client.run_gallery_dl_urls, urls_file, temp_dir,
        _on_backfill_progress, _on_backfill_batch, 50)

    backfill_dl = result.get('dl_count', 0)
    backfill_skip = result.get('skip_count', 0)
    self.log(f"Backfill complete for r/{subreddit}: {backfill_dl} downloaded, "
             f"{backfill_skip} skipped", 'info')
|
|
|
|
async def _download_reddit_posts(self, creator_id: int) -> Dict:
    """Report pending Reddit attachments; nothing is downloaded here.

    Reddit media is fetched by gallery-dl during sync, so this retry hook
    only logs how many attachments are still pending for the creator.

    Returns:
        Always ``{'downloaded': 0, 'failed': 0}``.
    """
    creator = self.db.get_creator(creator_id)
    # The pending list is only queried for a creator that exists.
    waiting = self.db.get_pending_attachments(creator_id=creator_id) if creator else None

    if creator and waiting:
        # For Reddit, pending attachments without local files need a re-sync;
        # gallery-dl will pick them up on the next sync.
        self.log(f"Reddit has {len(waiting)} pending attachments for r/{creator['creator_id']} — will be fetched on next sync", 'info')

    return {'downloaded': 0, 'failed': 0}
|
|
|
|
def _sweep_reddit_missing_attachments(self, creator_id: int, subreddit: str,
|
|
base_path: Path, downloaded_file_info: list,
|
|
creator: dict) -> int:
|
|
"""Scan files on disk for this Reddit creator and create any missing attachment records.
|
|
|
|
This catches files that were moved to the final path but whose attachment records
|
|
weren't created (e.g., due to batch boundary splits or the already-exists skip).
|
|
"""
|
|
media_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.mp4', '.webm', '.mov', '.avi', '.mkv'}
|
|
creator_dir = base_path / 'reddit' / self._sanitize_filename(subreddit)
|
|
if not creator_dir.exists():
|
|
return 0
|
|
|
|
# Get all existing attachment local_paths for this creator
|
|
existing_paths = set()
|
|
try:
|
|
with self.db.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute("""
|
|
SELECT a.server_path FROM paid_content_attachments a
|
|
JOIN paid_content_posts p ON a.post_id = p.id
|
|
WHERE p.creator_id = ?
|
|
""", (creator_id,))
|
|
for row in cursor.fetchall():
|
|
if row[0]:
|
|
existing_paths.add(row[0])
|
|
except Exception as e:
|
|
self.log(f"Error fetching existing paths for sweep: {e}", 'debug')
|
|
return 0
|
|
|
|
# Scan disk for media files not in DB
|
|
orphaned = []
|
|
for date_dir in creator_dir.iterdir():
|
|
if not date_dir.is_dir():
|
|
continue
|
|
for fp in date_dir.iterdir():
|
|
if fp.suffix.lower() in media_exts and str(fp) not in existing_paths:
|
|
orphaned.append(fp)
|
|
|
|
if not orphaned:
|
|
return 0
|
|
|
|
self.log(f"Sweep: found {len(orphaned)} files on disk without attachment records for r/{subreddit}", 'info')
|
|
|
|
# Group orphaned files by post ID (from filename: "{post_id} {num} {title}.{ext}")
|
|
post_files = {}
|
|
for fp in orphaned:
|
|
parts = fp.stem.split(' ', 2)
|
|
if len(parts) >= 2:
|
|
post_id = parts[0]
|
|
else:
|
|
post_id = fp.stem
|
|
post_files.setdefault(post_id, []).append(fp)
|
|
|
|
created = 0
|
|
for post_id, files in post_files.items():
|
|
# Find the DB post for this post_id
|
|
try:
|
|
with self.db.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute("""
|
|
SELECT id FROM paid_content_posts
|
|
WHERE creator_id = ? AND post_id = ?
|
|
""", (creator_id, post_id))
|
|
row = cursor.fetchone()
|
|
|
|
if not row:
|
|
continue
|
|
post_db_id = row[0]
|
|
|
|
for idx, fp in enumerate(files):
|
|
try:
|
|
f_size = fp.stat().st_size if fp.exists() else 0
|
|
ext = fp.suffix.lower()
|
|
file_type = RedditClient._detect_file_type(ext)
|
|
|
|
att_data = {
|
|
'name': fp.name,
|
|
'server_path': str(fp),
|
|
'file_type': file_type,
|
|
'extension': ext,
|
|
'file_size': f_size,
|
|
'attachment_index': idx,
|
|
}
|
|
att_id = self.db.upsert_attachment(post_db_id, att_data)
|
|
if att_id:
|
|
self.db.update_attachment_status(att_id, 'completed',
|
|
local_path=str(fp),
|
|
local_filename=fp.name,
|
|
name=fp.name,
|
|
extension=ext,
|
|
file_size=f_size,
|
|
file_type=file_type,
|
|
downloaded_at=datetime.now().isoformat()
|
|
)
|
|
self.db.mark_post_downloaded(post_db_id)
|
|
created += 1
|
|
except Exception as e:
|
|
self.log(f"Sweep error for {fp.name}: {e}", 'debug')
|
|
|
|
except Exception as e:
|
|
self.log(f"Sweep error for post {post_id}: {e}", 'debug')
|
|
|
|
return created
|
|
|
|
async def _sync_twitch_creator(self, creator: Dict, download: bool = True, scheduled: bool = False) -> SyncResult:
    """Sync a Twitch channel: fetch new clips and optionally download them.

    The sync is registered as active and a ``paid_content_sync_started``
    event is emitted up front. Every exit path then unregisters the sync and
    emits a terminal event (``paid_content_sync_completed`` or
    ``paid_content_sync_error``).

    Args:
        creator: Creator row. Reads ``id``, ``username``, ``creator_id`` (used
            as the Twitch channel name), ``last_post_date`` and
            ``auto_download``.
        download: When False, clips are recorded but never downloaded.
        scheduled: Scheduled runs only look back 3 days and fetch at most
            50 clips.

    Returns:
        SyncResult with the post/attachment/download counts, or
        ``success=False`` with ``error`` set when the sync failed.
    """
    from datetime import timedelta

    creator_id = creator['id']
    username = creator['username']
    self.log(f"Syncing Twitch channel: {username}", 'info')

    # Register active sync for polling-based updates
    sync_data = {
        'username': username,
        'platform': 'twitch',
        'service': 'twitch',
        'status': 'Fetching clips...',
        'phase': 'fetching',
        'failed': 0
    }
    self._register_active_sync(creator_id, sync_data)

    # Emit WebSocket event
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data
    })

    async def download_clips():
        """Run the clip downloader and unpack (downloaded, failed, file info)."""
        result = await self._download_twitch_clips(creator_id)
        return (result.get('downloaded', 0),
                result.get('failed', 0),
                result.get('downloaded_file_info', []))

    try:
        twitch = self._get_twitch_client()
        if not twitch.is_available():
            error = "yt-dlp not available"
            # A start event was already emitted, so listeners need a terminal
            # event on this path too, just like the generic failure handler.
            self.log(f"Cannot sync Twitch channel {username}: {error}", 'error')
            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_error', {
                'creator_id': creator_id,
                'username': username,
                'error': error
            })
            return SyncResult(success=False, error=error)

        # Build channel URL from creator_id (which stores the channel name)
        channel_name = creator['creator_id']
        channel_url = f"https://www.twitch.tv/{channel_name}/clips"

        # Refresh the creator profile (display name, avatar, banner, bio, ...).
        # Best effort: a profile failure must not abort the clip sync.
        try:
            profile_info = await twitch.get_channel_profile(channel_name)
            if profile_info:
                profile_updates = {}
                if profile_info.get('display_name'):
                    profile_updates['display_name'] = profile_info['display_name']
                # Images are cached locally; fall back to the remote URL when
                # caching returns nothing.
                for source_key, column in (('avatar', 'profile_image_url'),
                                           ('banner', 'banner_image_url')):
                    if profile_info.get(source_key):
                        cached = await self._cache_profile_image(
                            profile_info[source_key], 'twitch', channel_name, source_key)
                        profile_updates[column] = cached or profile_info[source_key]
                for key in ('bio', 'joined_date', 'external_links'):
                    if profile_info.get(key):
                        profile_updates[key] = profile_info[key]
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated Twitch creator profile for {username}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update Twitch creator profile: {e}", 'warning')

        # Scheduled syncs only check the last 3 days for efficiency; manual
        # syncs resume from the newest clip already known.
        if scheduled:
            since_date = (datetime.now() - timedelta(days=3)).isoformat()
        else:
            since_date = creator.get('last_post_date')

        def progress_callback(count: int):
            status = f'Fetched {count} clips...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': count
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': username,
                'status': status,
                'phase': 'fetching',
                'posts_fetched': count
            })

        # Get clips as Post objects.
        # For scheduled syncs, limit to 50 clips max (recent content only).
        max_clips = 50 if scheduled else None
        posts = await twitch.get_posts(
            channel_url,
            since_date=since_date,
            max_clips=max_clips,
            progress_callback=progress_callback
        )

        wants_download = download and creator.get('auto_download', True)

        if not posts:
            self.log(f"No new clips for {username}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})

            # Still download any pending clips even if no new posts
            downloaded, failed, downloaded_file_info = 0, 0, []
            if wants_download:
                pending_count = self.db.get_pending_attachment_count(creator_id)
                if pending_count > 0:
                    self.log(f"Downloading {pending_count} pending clips for {username}", 'info')
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {pending_count} pending clips...',
                        'phase': 'downloading',
                        'total_files': pending_count
                    })
                    downloaded, failed, downloaded_file_info = await download_clips()

            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': username,
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': downloaded,
                'failed': failed
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0,
                              downloaded_files=downloaded, failed_files=failed,
                              downloaded_file_info=downloaded_file_info)

        self.log(f"Found {len(posts)} new clips for {username}", 'info')

        # Update status
        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} clips...',
            'phase': 'processing',
            'total_posts': len(posts)
        })

        new_posts = 0
        new_attachments = 0

        for post in posts:
            # Insert/update post in database
            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if not post_db_id:
                continue
            if is_new_post:
                new_posts += 1

            # Insert clip attachment(s).
            # NOTE(review): every truthy upsert result is counted, so this
            # presumably includes attachments that already existed — confirm
            # against upsert_attachment.
            for idx, attachment in enumerate(post.attachments):
                att_data = attachment.to_dict()
                att_data['attachment_index'] = idx
                if self.db.upsert_attachment(post_db_id, att_data):
                    new_attachments += 1

            self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Update creator stats - find the actual newest post date
        # (posts may not be sorted by date)
        latest_post_date = max((p.published_at for p in posts if p.published_at), default=None)
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id)
        })

        # Download clips if enabled
        downloaded, failed, downloaded_file_info = 0, 0, []
        if wants_download:
            downloaded, failed, downloaded_file_info = await download_clips()

        # Unregister from active syncs
        self._unregister_active_sync(creator_id)

        # Emit completed event
        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': username,
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed
        })

        # Send push notification for new downloads
        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled)

        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info
        )

    except Exception as e:
        self.log(f"Error syncing Twitch channel {username}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': username,
            'error': str(e)
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _download_twitch_clips(self, creator_id: int) -> Dict:
|
|
"""Download pending Twitch clips using yt-dlp"""
|
|
creator = self.db.get_creator(creator_id)
|
|
if not creator:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
pending = self.db.get_pending_attachments(creator_id=creator_id)
|
|
if not pending:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
twitch = self._get_twitch_client()
|
|
if not twitch.is_available():
|
|
return {'downloaded': 0, 'failed': 0, 'error': 'yt-dlp not available'}
|
|
|
|
self.log(f"Downloading {len(pending)} Twitch clips for {creator['username']}", 'info')
|
|
|
|
# Update status
|
|
self._update_active_sync(creator_id, {
|
|
'phase': 'downloading',
|
|
'status': f'Downloading {len(pending)} clips...',
|
|
'total_files': len(pending),
|
|
'downloaded': 0
|
|
})
|
|
|
|
base_path = Path(self.config.get('base_download_path', '/paid-content'))
|
|
quality = self.config.get('embed_quality', 'best')
|
|
|
|
downloaded = 0
|
|
failed = 0
|
|
downloaded_file_info = []
|
|
|
|
for i, att in enumerate(pending):
|
|
try:
|
|
post = self.db.get_post(att['post_id'])
|
|
if not post:
|
|
self.log(f"Post not found for attachment {att.get('id')}", 'warning')
|
|
failed += 1
|
|
continue
|
|
|
|
# Build output directory
|
|
published_at = post.get('published_at') or ''
|
|
post_date = published_at[:10] if published_at else 'unknown-date'
|
|
output_dir = base_path / 'twitch' / self._sanitize_filename(creator['username']) / post_date
|
|
|
|
# Clip URL is stored in download_url
|
|
clip_url = att.get('download_url')
|
|
if not clip_url:
|
|
self.log(f"No download URL for attachment {att.get('id')}, {att.get('name')}", 'warning')
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message='No clip URL'
|
|
)
|
|
failed += 1
|
|
continue
|
|
|
|
# Update status
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Downloading clip {i + 1}/{len(pending)}: {att.get("name", "")[:40]}...',
|
|
'downloaded': downloaded
|
|
})
|
|
|
|
self.db.update_attachment_status(att['id'], 'downloading')
|
|
|
|
# Download using yt-dlp
|
|
result = await twitch.download_clip(clip_url, output_dir, quality=quality)
|
|
|
|
if not result:
|
|
self.log(f"No result from yt-dlp for {att.get('name')}", 'warning')
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message='yt-dlp returned no result',
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
continue
|
|
|
|
if result.get('success'):
|
|
file_path = result.get('file_path')
|
|
file_size = result.get('file_size', 0)
|
|
|
|
# Get cached thumbnail from Twitch (already cached during clip fetch)
|
|
thumbnail_data = None
|
|
# The thumbnail URL should be in post metadata or we can use the original thumbnail
|
|
# For now, we'll skip thumbnail generation for clips
|
|
|
|
# Extract video dimensions
|
|
width, height, duration = None, None, None
|
|
if file_path:
|
|
width, height, duration = self._extract_dimensions(Path(file_path), 'video')
|
|
if width and height:
|
|
self.log(f"Extracted dimensions for {att.get('name', 'clip')}: {width}x{height}, {duration}s", 'debug')
|
|
|
|
self.db.update_attachment_status(att['id'], 'completed',
|
|
local_path=file_path,
|
|
local_filename=Path(file_path).name if file_path else None,
|
|
file_size=file_size,
|
|
width=width,
|
|
height=height,
|
|
duration=duration,
|
|
downloaded_at=datetime.now().isoformat()
|
|
)
|
|
|
|
# Update post as downloaded if all attachments are done
|
|
self.db.mark_post_downloaded(att['post_id'])
|
|
|
|
# Update creator stats
|
|
self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)
|
|
|
|
downloaded += 1
|
|
self.log(f"Downloaded: {att.get('name', 'clip')}", 'debug')
|
|
|
|
# Collect file info for notifications
|
|
if file_path:
|
|
downloaded_file_info.append({
|
|
'file_path': file_path,
|
|
'filename': Path(file_path).name if file_path else None,
|
|
'source': creator['username'],
|
|
'content_type': att.get('file_type', 'video')
|
|
})
|
|
else:
|
|
error = result.get('error', 'Unknown error')
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=error,
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
self.log(f"Failed to download {att.get('name', 'clip')}: {error}", 'warning')
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading Twitch clip: {e}", 'error')
|
|
self.db.update_attachment_status(att['id'], 'failed',
|
|
error_message=str(e),
|
|
download_attempts=att.get('download_attempts', 0) + 1,
|
|
last_attempt=datetime.now().isoformat()
|
|
)
|
|
failed += 1
|
|
|
|
return {'downloaded': downloaded, 'failed': failed, 'downloaded_file_info': downloaded_file_info}
|
|
|
|
async def download_all_pending(self) -> Dict:
    """Work through the download queue of every enabled creator.

    Called by the scheduler once syncing has finished. Creators with
    ``auto_download`` switched off, or with nothing pending, are skipped.

    Returns:
        ``{'downloaded': int, 'failed': int}`` summed over all creators.
    """
    totals = {'downloaded': 0, 'failed': 0}

    for entry in self.db.get_creators(enabled_only=True):
        if not entry.get('auto_download', True):
            continue

        queued = self.db.get_pending_attachment_count(entry['id'])
        if queued == 0:
            continue

        self.log(f"Processing {queued} pending downloads for {entry['username']}", 'info')

        # A background task must be registered while downloading, otherwise
        # download_pending_for_creator could take the sync for cancelled.
        task_name = f"paid_content_sync_{entry['id']}"
        self.activity_manager.start_background_task(
            task_name, 'paid_content_sync',
            display_name=f"Downloading {entry['username']}"
        )
        try:
            outcome = await self.download_pending_for_creator(entry['id'])
            for key in ('downloaded', 'failed'):
                totals[key] += outcome.get(key, 0)
        finally:
            # Always release the task, even when the download raised.
            self.activity_manager.stop_background_task(task_name)

    return totals
|
|
|
|
async def download_pending_for_creator(self, creator_id: int) -> Dict:
    """Download all pending attachments for a creator.

    Attachments are downloaded in rounds. If ``auto_retry_failed`` is
    enabled, failed downloads that were re-queued as pending are retried in
    further rounds until nothing is pending, the sync is cancelled, or the
    safety limit of 100 rounds is hit. Pending embeds (when enabled) and
    pending message attachments are downloaded afterwards.

    Args:
        creator_id: Database id of the creator.

    Returns:
        Dict with ``downloaded``, ``failed`` and ``skipped`` counts plus
        ``downloaded_file_info`` (one entry per saved file, for
        notifications). When the creator does not exist the counts are zero
        and ``error`` is set instead of ``downloaded_file_info``.
    """
    creator = self.db.get_creator(creator_id)
    if not creator:
        return {'downloaded': 0, 'failed': 0, 'skipped': 0, 'error': 'Creator not found'}

    total_downloaded = 0
    total_failed = 0
    total_skipped = 0
    downloaded_file_info = []  # Collect file info for notifications
    max_rounds = 100  # Safety limit to prevent infinite loops

    base_path = Path(self.config.get('base_download_path', '/paid-content'))
    auto_retry = self.config.get('auto_retry_failed', 1)
    task_id = f"paid_content_sync_{creator_id}"

    for round_num in range(1, max_rounds + 1):
        # Check if sync was cancelled.
        # activity_manager: an entry that is inactive on the first round is
        # a stale leftover from a previous sync and must NOT cancel. From the
        # second round on, an inactive entry means the task was stopped
        # mid-download.
        task_status = self.activity_manager.get_background_task(task_id)
        if task_status and not task_status.get('active') and round_num > 1:
            self.log(f"Sync cancelled for {creator['username']} (activity_manager), stopping downloads", 'info')
            break
        # app_state: only check if creator is registered in active syncs
        if self.app_state and hasattr(self.app_state, 'active_paid_content_syncs'):
            sync_entry = self.app_state.active_paid_content_syncs.get(creator_id)
            if sync_entry is not None and not sync_entry.get('active', True):
                self.log(f"Sync cancelled for {creator['username']} (app_state), stopping downloads", 'info')
                break

        pending = self.db.get_pending_attachments(creator_id=creator_id)

        if not pending:
            if round_num > 1:
                self.log(f"All downloads complete for {creator['username']} after {round_num - 1} rounds", 'info')
            break

        total_files = len(pending)

        if round_num == 1:
            self.log(f"Downloading {total_files} files for {creator['username']}", 'info')
            status_text = f'Downloading {total_files} files...'
        else:
            self.log(f"Round {round_num}: Retrying {total_files} re-queued files for {creator['username']}", 'info')
            status_text = f'Retrying {total_files} files (round {round_num})...'

        # Emit download started event
        self._emit_event('paid_content_download_started', {
            'creator_id': creator_id,
            'username': creator['username'],
            'total_files': total_files,
            'status': status_text,
            'phase': 'downloading'
        })

        # Update polling-based active sync status
        self._update_active_sync(creator_id, {
            'phase': 'downloading',
            'status': status_text,
            'total_files': total_files,
            'downloaded': total_downloaded
        })

        # Download this batch
        results = await self._download_attachments(pending, base_path, creator)

        downloaded = sum(1 for r in results if r.success)
        failed = sum(1 for r in results if not r.success and not r.is_duplicate)
        skipped = sum(1 for r in results if r.is_duplicate)

        # Collect file info for notifications (include metadata for lightbox)
        for r in results:
            if not (r.success and r.file_path):
                continue
            file_info = {
                'file_path': r.file_path,
                'filename': Path(r.file_path).name,
                'source': creator['username'],
                'content_type': 'post',
                'file_size': r.file_size,
                'platform': creator.get('platform')
            }
            # Try to get additional metadata from the attachment record
            att = self.db.get_attachment_by_path(r.file_path)
            if att:
                file_info['attachment_id'] = att.get('id')
                for key in ('width', 'height', 'file_type', 'duration',
                            'post_content', 'post_date'):
                    file_info[key] = att.get(key)
            downloaded_file_info.append(file_info)

        total_downloaded += downloaded
        total_failed += failed
        total_skipped += skipped

        # Update creator stats after each round
        stats = self.db.get_creator_stats(creator_id)
        self.db.update_creator(creator_id, {
            'downloaded_count': stats['downloaded_attachments'],
            'total_size_bytes': stats['total_size_bytes']
        })

        # Decide whether another round is needed BEFORE pausing, so that a
        # round which left nothing to retry does not cost a 5 second sleep.
        if not self.db.get_pending_attachments(creator_id=creator_id):
            if downloaded == 0 and failed > 0:
                # Nothing succeeded and nothing was re-queued.
                self.log("All remaining failures are permanent, stopping", 'info')
            else:
                self.log(f"All downloads complete for {creator['username']} after {round_num} rounds", 'info')
            break

        # If auto-retry is disabled, stop after first round
        if not auto_retry:
            break

        # Small delay before next round to avoid hammering the server
        await asyncio.sleep(5)

    # Download embeds if enabled
    if self.config.get('download_embeds', True) and creator.get('download_embeds', True):
        embed_results = await self._download_pending_embeds(creator_id, base_path, creator)
        total_downloaded += embed_results.get('downloaded', 0)
        total_failed += embed_results.get('failed', 0)
        # Add embed file info if available
        if embed_results.get('downloaded_file_info'):
            downloaded_file_info.extend(embed_results['downloaded_file_info'])

    # Download pending message attachments
    msg_attachments = self.db.get_pending_message_attachments(creator_id)
    if msg_attachments:
        self.log(f"Downloading {len(msg_attachments)} message attachments for {creator['username']}", 'info')
        msg_results = await self._download_message_attachments(msg_attachments, base_path, creator)
        total_downloaded += sum(1 for r in msg_results if r.success)
        total_failed += sum(1 for r in msg_results if not r.success and not r.is_duplicate)

    return {
        'downloaded': total_downloaded,
        'failed': total_failed,
        'skipped': total_skipped,
        'downloaded_file_info': downloaded_file_info
    }
|
|
|
|
async def _download_message_attachments(self, attachments: List[Dict], base_path: Path,
|
|
creator: Dict) -> List[DownloadResult]:
|
|
"""Download message attachments using the same worker pattern as post attachments"""
|
|
results = []
|
|
queue = asyncio.Queue()
|
|
|
|
for att in attachments:
|
|
# Get message info for path building
|
|
if not att.get('message_id'):
|
|
results.append(DownloadResult(success=False, error="No message_id"))
|
|
continue
|
|
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute("SELECT * FROM paid_content_messages WHERE id = ?", (att['message_id'],))
|
|
msg_row = cursor.fetchone()
|
|
if not msg_row:
|
|
results.append(DownloadResult(success=False, error="Message not found"))
|
|
continue
|
|
message = dict(msg_row)
|
|
|
|
dest_path = self._build_message_file_path(base_path, creator, message, att)
|
|
download_url = att.get('download_url')
|
|
if not download_url:
|
|
results.append(DownloadResult(success=False, error="No download URL"))
|
|
continue
|
|
|
|
await queue.put((att, download_url, dest_path))
|
|
|
|
if queue.empty():
|
|
return results
|
|
|
|
total = queue.qsize()
|
|
self._download_progress = {'completed': 0, 'success': 0, 'failed': 0}
|
|
self._active_downloads = {}
|
|
self._download_results = []
|
|
self._results_lock = asyncio.Lock()
|
|
self._active_workers = 0
|
|
|
|
num_workers = min(self.max_concurrent_downloads, total)
|
|
workers = []
|
|
for i in range(num_workers):
|
|
worker = asyncio.create_task(
|
|
self._download_worker(queue, creator['id'], total, worker_id=i)
|
|
)
|
|
workers.append(worker)
|
|
|
|
await queue.join()
|
|
for worker in workers:
|
|
worker.cancel()
|
|
await asyncio.gather(*workers, return_exceptions=True)
|
|
|
|
results.extend(self._download_results)
|
|
return results
|
|
|
|
async def _download_attachments(self, attachments: List[Dict], base_path: Path,
|
|
creator: Dict) -> List[DownloadResult]:
|
|
"""Download attachments using a queue with worker pattern for true concurrent downloads"""
|
|
total = len(attachments)
|
|
|
|
# Shared progress counter for real-time updates
|
|
self._download_progress = {'completed': 0, 'success': 0, 'failed': 0}
|
|
# Track currently active downloads for status display
|
|
self._active_downloads = {}
|
|
# Results storage
|
|
self._download_results = []
|
|
# Lock for thread-safe updates to shared state
|
|
self._results_lock = asyncio.Lock()
|
|
# Track active worker count for debugging
|
|
self._active_workers = 0
|
|
|
|
num_workers = min(self.max_concurrent_downloads, total) # Don't create more workers than files
|
|
self.log(f"Starting download of {total} files with {num_workers} concurrent workers", 'info')
|
|
|
|
# Create a queue of download jobs
|
|
queue = asyncio.Queue()
|
|
|
|
# Add all attachments to the queue
|
|
queued = 0
|
|
for att in attachments:
|
|
post = self.db.get_post(att['post_id'])
|
|
if not post:
|
|
async with self._results_lock:
|
|
self._download_results.append(DownloadResult(success=False, error="Post not found"))
|
|
self._download_progress['completed'] += 1
|
|
self._download_progress['failed'] += 1
|
|
continue
|
|
|
|
dest_path = self._build_file_path(base_path, creator, post, att)
|
|
# Use direct download_url if available (Fansly Direct), otherwise build from server_path
|
|
if att.get('download_url'):
|
|
download_url = att['download_url']
|
|
else:
|
|
client = self._get_client(creator['service_id'])
|
|
download_url = client.get_attachment_url(att['server_path'])
|
|
|
|
await queue.put((att, download_url, dest_path))
|
|
queued += 1
|
|
|
|
self.log(f"Queued {queued} downloads, starting {num_workers} workers", 'info')
|
|
|
|
# Create worker tasks
|
|
workers = []
|
|
for i in range(num_workers):
|
|
worker = asyncio.create_task(
|
|
self._download_worker(queue, creator['id'], total, worker_id=i)
|
|
)
|
|
workers.append(worker)
|
|
|
|
# Wait for all items in queue to be processed
|
|
await queue.join()
|
|
|
|
self.log(f"All downloads complete, cancelling workers", 'debug')
|
|
|
|
# Cancel workers (they're waiting on empty queue)
|
|
for worker in workers:
|
|
worker.cancel()
|
|
|
|
# Wait for workers to finish cancelling
|
|
await asyncio.gather(*workers, return_exceptions=True)
|
|
|
|
return self._download_results
|
|
|
|
async def _download_worker(self, queue: asyncio.Queue, creator_id: int, total_files: int, worker_id: int = 0):
|
|
"""Worker that pulls from queue and downloads files"""
|
|
self._active_workers += 1
|
|
self.log(f"Worker {worker_id} started (active workers: {self._active_workers})", 'debug')
|
|
|
|
while True:
|
|
try:
|
|
# Get next item from queue - non-blocking check first to log state
|
|
try:
|
|
att, url, dest_path = queue.get_nowait()
|
|
except asyncio.QueueEmpty:
|
|
# Queue empty, wait for more items or cancellation
|
|
att, url, dest_path = await queue.get()
|
|
|
|
att_name = att.get('name', 'unknown')
|
|
self.log(f"Worker {worker_id} picked up: {att_name} (queue size: {queue.qsize()}, active: {len(self._active_downloads)})", 'info')
|
|
|
|
try:
|
|
result = await self._download_single_attachment_no_semaphore(
|
|
att, url, dest_path, creator_id, total_files
|
|
)
|
|
async with self._results_lock:
|
|
self._download_results.append(result)
|
|
except Exception as e:
|
|
self.log(f"Worker {worker_id} exception: {e}", 'error')
|
|
async with self._results_lock:
|
|
self._download_results.append(DownloadResult(success=False, error=str(e)))
|
|
finally:
|
|
# Mark task as done so queue.join() knows
|
|
queue.task_done()
|
|
self.log(f"Worker {worker_id} finished task, marking done", 'debug')
|
|
|
|
except asyncio.CancelledError:
|
|
# Worker was cancelled, exit gracefully
|
|
self._active_workers -= 1
|
|
self.log(f"Worker {worker_id} cancelled (remaining: {self._active_workers})", 'debug')
|
|
break
|
|
|
|
async def _download_via_ytdlp(self, att: Dict, url: str, dest_path: Path,
                              creator_id: int = None, total_files: int = None) -> DownloadResult:
    """Download a YouTube video using yt-dlp instead of a direct HTTP download.

    The video is downloaded into a local temporary directory first (to avoid
    mergerfs rename issues with yt-dlp's .part files) and the final merged
    file is then moved into ``dest_path.parent``. The file keeps the name
    yt-dlp chose; ``dest_path.name`` is not used.

    Args:
        att: Attachment row; reads ``id``, ``name``, ``post_id`` and
            ``download_attempts``.
        url: Video URL handed to yt-dlp.
        dest_path: Planned destination; only its parent directory is used.
        creator_id: When given, the creator's download stats are incremented.
        total_files: Together with ``creator_id`` enables the progress update.

    Returns:
        DownloadResult with ``file_path`` and ``file_size`` on success, or
        ``success=False`` with ``error`` otherwise.
    """
    att_id = att['id']
    att_name = att.get('name', 'unknown')

    youtube = self._get_youtube_client()
    if not youtube.is_available():
        self.log(f"yt-dlp not available for {att_name}", 'error')
        return DownloadResult(success=False, error='yt-dlp not available')

    quality = self.config.get('embed_quality', 'best')
    final_dir = dest_path.parent

    # Download to local temp dir to avoid mergerfs .part rename issues
    with tempfile.TemporaryDirectory(prefix='ytdlp_') as tmp_dir:
        self.log(f"Downloading via yt-dlp: {att_name}", 'info')
        result = await youtube.download_video(url, Path(tmp_dir), quality=quality)

        if not result or not result.get('success'):
            error = (result or {}).get('error', 'yt-dlp returned no result')
            self.log(f"yt-dlp failed for {att_name}: {error}", 'warning')
            # Status goes back to 'pending' (not 'failed') so the attachment
            # is picked up again by the next download round.
            self.db.update_attachment_status(
                att_id, 'pending',
                error_message=error,
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            return DownloadResult(success=False, error=error)

        # A success result without a path is treated like a missing file
        # instead of crashing in Path(None).
        reported_path = result.get('file_path')
        tmp_file = Path(reported_path) if reported_path else None
        if tmp_file is None or not tmp_file.exists():
            error = f"yt-dlp reported success but file not found: {tmp_file}"
            self.log(error, 'error')
            # NOTE(review): the attachment status is left untouched on this
            # path — confirm the caller records the failure.
            return DownloadResult(success=False, error=error)

        # Move to final destination before the temp dir is cleaned up
        final_dir.mkdir(parents=True, exist_ok=True)
        final_path = final_dir / tmp_file.name
        shutil.move(str(tmp_file), str(final_path))

    file_path = str(final_path)
    file_size = final_path.stat().st_size

    # Download YouTube thumbnail; the post's post_id is used as the video id.
    thumbnail_data = None
    post = self.db.get_post(att['post_id'])
    video_id = post.get('post_id') if post else None
    if video_id:
        thumbnail_data = await self._download_youtube_thumbnail(video_id)

    # Extract video dimensions
    width, height, duration = self._extract_dimensions(final_path, 'video')

    self.db.update_attachment_status(
        att_id, 'completed',
        local_path=file_path,
        local_filename=final_path.name,
        file_size=file_size,
        width=width,
        height=height,
        duration=duration,
        thumbnail_data=thumbnail_data,
        downloaded_at=datetime.now().isoformat()
    )

    self.db.mark_post_downloaded(att['post_id'])

    if creator_id:
        self.db.increment_creator_download_stats(creator_id, 1, file_size or 0)

    if creator_id and total_files:
        async with self._results_lock:
            self._download_progress['completed'] += 1
            self._download_progress['success'] += 1
        self._update_download_status(creator_id, total_files)

    self.log(f"Downloaded via yt-dlp: {att_name} ({file_size} bytes)", 'info')
    return DownloadResult(success=True, file_path=file_path, file_size=file_size)
|
|
|
|
async def _download_via_tiktok(self, att: Dict, url: str, dest_path: Path,
                               creator_id: int = None, total_files: int = None) -> DownloadResult:
    """Download a TikTok post using the TikTok client (gallery-dl with cookies).

    A post can yield a single video or several files (photo carousel). The
    first file that actually exists on disk completes the existing attachment
    row ``att``; every further file is stored as an extra attachment of the
    same post with a ``#slide_<index>`` suffix on ``server_path`` so the
    upserts do not collide.

    Args:
        att: Attachment row; reads ``id``, ``name``, ``post_id``,
            ``download_url`` and ``download_attempts``.
        url: TikTok URL handed to the client.
        dest_path: Planned destination; only its parent directory is used,
            the client chooses the file names.
        creator_id: Creator used for the username lookup and stats (optional).
        total_files: Batch size used for progress reporting (optional).

    Returns:
        DownloadResult with ``success=True`` and the summed size of all files
        that were recorded, or ``success=False`` with an error message.
    """
    att_id = att['id']
    att_name = att.get('name', 'unknown')

    async def _fail(error) -> DownloadResult:
        """Mark the attachment failed, count the failure and build the result."""
        self.db.update_attachment_status(att_id, 'failed',
            error_message=error,
            download_attempts=att.get('download_attempts', 0) + 1,
            last_attempt=datetime.now().isoformat()
        )
        async with self._results_lock:
            self._download_progress['completed'] += 1
            self._download_progress['failed'] += 1
        if creator_id and total_files:
            self._update_download_status(creator_id, total_files)
        return DownloadResult(success=False, error=error)

    tiktok = self._get_tiktok_client()
    if not tiktok.is_available():
        self.log(f"TikTok client not available for {att_name}", 'error')
        return DownloadResult(success=False, error='yt-dlp/gallery-dl not available')

    # Get creator username for the download
    creator = self.db.get_creator(creator_id) if creator_id else None
    username = creator['username'] if creator else ''

    output_dir = dest_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    self.log(f"Downloading via TikTok client: {att_name}", 'info')
    result = await tiktok.download_video(url, output_dir, username=username)

    if not result or not result.get('success'):
        error = (result or {}).get('error', 'TikTok download failed')
        self.log(f"TikTok download failed for {att_name}: {error}", 'warning')
        return await _fail(error)

    file_path = result.get('file_path')
    all_files = list(result.get('all_files') or [])
    if not all_files and file_path:
        # The client reported only a primary file; still record it.
        all_files = [file_path]

    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
    dl_url = att.get('download_url', '')
    total_size = 0
    file_count = 0
    first_recorded = None

    # Process all downloaded files (carousel photos or single video)
    for file_idx, file_str in enumerate(all_files):
        fp = Path(file_str)
        if not fp.exists():
            continue

        f_size = fp.stat().st_size
        total_size += f_size
        ext = fp.suffix.lower()
        c_type = 'image' if ext in image_exts else 'video'

        w, h, dur = self._extract_dimensions(fp, c_type)
        thumb_data = None
        if c_type == 'video':
            thumb_data = self._generate_video_thumbnail(fp, seek_time='00:00:01')
        else:
            dur = None  # images carry no duration

        if file_count == 0:
            # The first file that really exists completes the existing
            # attachment row, even if an earlier reported path was missing.
            self.db.update_attachment_status(att_id, 'completed',
                local_path=str(fp),
                local_filename=fp.name,
                name=fp.name,
                extension=ext,
                file_size=f_size,
                file_type=c_type,
                width=w, height=h, duration=dur,
                thumbnail_data=thumb_data,
                downloaded_at=datetime.now().isoformat()
            )
            first_recorded = str(fp)
        else:
            # Create additional attachments for carousel photos
            # Use unique server_path per file to avoid upsert collisions
            self.db.upsert_attachment(att['post_id'], {
                'name': fp.name,
                'file_type': c_type,
                'extension': ext,
                'server_path': f"{dl_url}#slide_{file_idx}",
                'download_url': dl_url,
                'status': 'completed',
                'local_path': str(fp),
                'local_filename': fp.name,
                'file_size': f_size,
                'width': w, 'height': h, 'duration': dur,
                'thumbnail_data': thumb_data,
                'downloaded_at': datetime.now().isoformat(),
            })
        file_count += 1

    if file_count == 0:
        # Nothing usable on disk: do not report success or mark the post as
        # downloaded, otherwise the attachment would stay in 'downloading'.
        error = f"TikTok client reported success but no files were found for {att_name}"
        self.log(error, 'error')
        return await _fail(error)

    self.db.mark_post_downloaded(att['post_id'])

    if creator_id:
        self.db.increment_creator_download_stats(creator_id, 1, total_size)

    if creator_id and total_files:
        async with self._results_lock:
            self._download_progress['completed'] += 1
            self._download_progress['success'] += 1
        self._update_download_status(creator_id, total_files)

    self.log(f"Downloaded via TikTok client: {att_name} ({file_count} file{'s' if file_count > 1 else ''}, {total_size} bytes)", 'info')
    return DownloadResult(success=True, file_path=file_path or first_recorded, file_size=total_size)
|
|
|
|
async def _download_single_attachment_no_semaphore(self, att: Dict, url: str, dest_path: Path,
                                                   creator_id: int = None, total_files: int = None) -> DownloadResult:
    """Download a single attachment with automatic retry/resume on stall or failure.

    Flow:
      1. Mark the attachment 'downloading' and create the target directory.
      2. Hand YouTube, TikTok and HLS/DASH URLs to their dedicated downloaders.
      3. Otherwise stream the file over HTTP, up to ``max_retries`` times,
         resuming a partial file with a ``Range`` request when one exists.
      4. On success: hash the file, compare it with the hash embedded in
         ``server_path`` (when the stem is 64 characters long), build
         thumbnails, read dimensions, and store everything in the database.

    Args:
        att: Attachment row; reads ``id``, ``name``, ``post_id``,
            ``server_path``, ``file_type`` and ``download_attempts``.
        url: Source URL.
        dest_path: Final location of the file on disk.
        creator_id: Creator whose stats/progress are updated (optional).
        total_files: Batch size for progress reporting (optional).

    Returns:
        DownloadResult describing success (path, hash, size) or the failure.
    """
    att_id = att['id']
    att_name = att.get('name', 'unknown')
    max_retries = 5  # Max retry attempts for stalls/failures
    chunk_timeout = 60  # Timeout for receiving each chunk (detect stalls)

    # Helper to clean up active download tracking
    def cleanup_active_download():
        self._active_downloads.pop(att_id, None)

    async def record_progress(outcome: str) -> None:
        """Count one finished file under ``outcome`` ('success' or 'failed')."""
        if creator_id and total_files:
            async with self._results_lock:
                self._download_progress['completed'] += 1
                self._download_progress[outcome] += 1
            self._update_download_status(creator_id, total_files)

    # Update status to downloading
    self.db.update_attachment_status(att_id, 'downloading')

    # Create directory
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    # Route YouTube URLs through yt-dlp (direct aiohttp download gets HTML, not video)
    if 'youtube.com/watch' in url or 'youtu.be/' in url:
        return await self._download_via_ytdlp(att, url, dest_path, creator_id, total_files)

    # Route TikTok URLs through TikTok client (gallery-dl with cookies)
    if 'tiktok.com/' in url:
        return await self._download_via_tiktok(att, url, dest_path, creator_id, total_files)

    # Handle streaming formats (m3u8/HLS and mpd/DASH) with ffmpeg
    if '.m3u8' in url or '.mpd' in url:
        return await self._download_stream_with_ffmpeg(att, url, dest_path, creator_id, total_files)

    file_size = 0
    expected_size = None
    last_error = None

    # for/else: the else branch runs only when every attempt ended in
    # `continue` (no `break`), i.e. all retries were exhausted.
    for attempt in range(max_retries):
        try:
            # Check for existing partial file to enable resume
            existing_size = 0
            if dest_path.exists():
                existing_size = dest_path.stat().st_size
                if existing_size > 0:
                    if attempt == 0:
                        self.log(f"Resuming download: {att_name} from {self._format_bytes(existing_size)}", 'info')
                    else:
                        self.log(f"Retry {attempt + 1}/{max_retries}: Resuming {att_name} from {self._format_bytes(existing_size)}", 'info')
                else:
                    self.log(f"Downloading: {att_name}", 'debug')
            else:
                if attempt > 0:
                    self.log(f"Retry {attempt + 1}/{max_retries}: Starting fresh download of {att_name}", 'info')
                else:
                    self.log(f"Downloading: {att_name}", 'debug')

            # Download file with proper headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate, br',
                'Referer': url.split('/data')[0] if '/data' in url else url
            }

            # Add Range header for resume
            if existing_size > 0:
                headers['Range'] = f'bytes={existing_size}-'

            # No total timeout (files can be large); sock_read detects stalls
            timeout = aiohttp.ClientTimeout(total=None, connect=30, sock_read=chunk_timeout)

            async with aiohttp.ClientSession(headers=headers) as session:
                async with session.get(url, timeout=timeout) as resp:
                    # Handle response codes
                    if resp.status == 416:
                        # Range not satisfiable - file is likely complete
                        self.log(f"Resume not needed for {att_name} - file may be complete", 'info')
                        file_size = existing_size
                        break  # Exit retry loop, proceed to verification

                    elif resp.status == 200 and existing_size > 0:
                        # Server doesn't support resume, need to start over
                        self.log(f"Server doesn't support resume for {att_name}, restarting download", 'warning')
                        existing_size = 0
                        # Delete partial file and start fresh
                        dest_path.unlink(missing_ok=True)

                    elif resp.status not in (200, 206):
                        last_error = f"HTTP {resp.status}"
                        self.log(f"HTTP error {resp.status} for {att_name}, will retry", 'warning')
                        await asyncio.sleep(2 ** attempt)  # Exponential backoff
                        continue  # Retry

                    # Calculate expected size
                    content_length = resp.headers.get('Content-Length')
                    if resp.status == 206:
                        # Content-Length of a partial response covers only the remainder
                        expected_size = existing_size + (int(content_length) if content_length else 0)
                    else:
                        expected_size = int(content_length) if content_length else None

                    file_size = existing_size
                    last_progress_update = existing_size

                    # Register this download as active
                    if creator_id and total_files:
                        self._active_downloads[att_id] = {
                            'name': att_name,
                            'size': expected_size,
                            'progress': existing_size
                        }
                        self._update_download_status(creator_id, total_files)

                    # Open in append mode for resume, write mode for fresh start
                    file_mode = 'ab' if existing_size > 0 else 'wb'

                    async with aiofiles.open(dest_path, file_mode) as f:
                        async for chunk in resp.content.iter_chunked(8192):
                            await f.write(chunk)
                            file_size += len(chunk)

                            # Refresh the status display every 512 KiB
                            if creator_id and total_files and (file_size - last_progress_update) >= 512 * 1024:
                                last_progress_update = file_size
                                if att_id in self._active_downloads:
                                    self._active_downloads[att_id]['progress'] = file_size
                                self._update_download_status(creator_id, total_files)

            # Download completed successfully
            cleanup_active_download()

            # Verify file size if we know the expected size
            if expected_size and file_size < expected_size:
                self.log(f"Incomplete download for {att_name}: got {file_size}, expected {expected_size}", 'warning')
                last_error = f"Incomplete: {file_size}/{expected_size} bytes"
                continue  # Retry

            break  # Success - exit retry loop

        except asyncio.TimeoutError:
            cleanup_active_download()
            # Check how much we got
            current_size = dest_path.stat().st_size if dest_path.exists() else 0
            last_error = f"Stalled at {self._format_bytes(current_size)}"
            self.log(f"Download stalled for {att_name} at {self._format_bytes(current_size)}, will retry", 'warning')
            await asyncio.sleep(2)  # Brief pause before retry
            continue

        except (aiohttp.ClientError, ConnectionError, OSError) as e:
            cleanup_active_download()
            current_size = dest_path.stat().st_size if dest_path.exists() else 0
            last_error = str(e)
            self.log(f"Connection error for {att_name} at {self._format_bytes(current_size)}: {e}, will retry", 'warning')
            await asyncio.sleep(2 ** attempt)  # Exponential backoff
            continue

    else:
        # Exhausted all retries for this sync
        cleanup_active_download()
        error = f"Failed after {max_retries} attempts: {last_error}"
        self.log(f"Download failed for {att_name}: {error}", 'error')

        # Check if this is a permanent error or a retriable one
        if self._is_permanent_error(last_error):
            # Permanent error (500, 404, etc.) - mark as failed
            self.db.update_attachment_status(att_id, 'failed',
                error_message=error,
                download_attempts=att.get('download_attempts', 0) + 1,
                last_attempt=datetime.now().isoformat()
            )
            self.log(f"Permanent error for {att_name}, marked as failed", 'warning')
        else:
            # Retriable error (timeout, partial, connection) - put back in queue
            # Don't increment download_attempts so it keeps trying
            self.db.update_attachment_status(att_id, 'pending',
                error_message=f"Will retry: {error}",
                last_attempt=datetime.now().isoformat()
            )
            self.log(f"Retriable error for {att_name}, re-queued for next sync", 'info')

        self.db.record_download_attempt(
            attachment_id=att_id, url=url, status='failed',
            error_message=error
        )
        await record_progress('failed')
        return DownloadResult(success=False, error=error)

    # Download succeeded - proceed with verification and processing
    try:
        # Calculate SHA256 hash
        file_hash = await self._compute_file_hash_async(dest_path)
        if not file_hash:
            error = f"Failed to compute hash - file may be missing: {dest_path.name}"
            self.log(f"Hash computation failed for {att_name}: {error}", 'error')
            self.db.update_attachment_status(att_id, 'pending',
                error_message=f"Will retry: {error}",
                last_attempt=datetime.now().isoformat()
            )
            await record_progress('failed')
            return DownloadResult(success=False, error=error)

        # Verify hash matches expected hash from Coomer/Kemono server path
        # Server path format: /xx/yy/HASH.ext (e.g., /90/b0/90b023c9...714.mp4)
        server_path = att.get('server_path', '')
        if server_path and '/' in server_path:
            expected_hash = Path(server_path).stem  # Get filename without extension
            if len(expected_hash) == 64 and expected_hash != file_hash:
                # Hash mismatch - file is corrupt, delete and re-queue
                dest_path.unlink(missing_ok=True)
                error = f"Hash mismatch: expected {expected_hash[:16]}..., got {file_hash[:16]}... - file corrupt"
                self.log(f"Hash verification failed for {att_name}: {error}", 'warning')
                self.db.update_attachment_status(att_id, 'pending',
                    error_message=f"Will retry: {error}",
                    last_attempt=datetime.now().isoformat()
                )
                await record_progress('failed')
                return DownloadResult(success=False, error=error)

        # Compute perceptual hash for images
        perceptual_hash = self._compute_perceptual_hash(dest_path)

        # Generate thumbnail and extract dimensions
        file_type = att.get('file_type', '')
        thumbnail_data = None
        width, height, duration = None, None, None
        if file_type in ('image', 'video'):
            thumbnail_data = self._generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                self.log(f"Generated thumbnail for {att_name} ({len(thumbnail_data)} bytes)", 'debug')
            # Also generate large thumbnail for feed view (file cached)
            large_thumb_data = self._generate_thumbnail(dest_path, file_type, max_size=(800, 800))
            if large_thumb_data:
                large_cache_dir = Path('/opt/media-downloader/cache/thumbnails/large')
                large_cache_dir.mkdir(parents=True, exist_ok=True)
                large_cache_file = large_cache_dir / f"{att_id}.jpg"
                large_cache_file.write_bytes(large_thumb_data)
            # Extract dimensions
            width, height, duration = self._extract_dimensions(dest_path, file_type)
            if width and height:
                self.log(f"Extracted dimensions for {att_name}: {width}x{height}" + (f", {duration}s" if duration else ""), 'debug')

        # Skip placeholder/missing-photo images (tiny thumbnails under 15KB and 200px)
        if file_type == 'image' and file_size and file_size < 15000 and width and height and max(width, height) <= 200:
            self.log(f"Skipping placeholder image {att_name} ({width}x{height}, {file_size} bytes)", 'info')
            dest_path.unlink(missing_ok=True)
            self.db.update_attachment_status(att_id, 'skipped',
                error_message=f'Placeholder image ({width}x{height}, {file_size} bytes)')
            # Count the file as handled so the batch progress can reach
            # total_files; every other outcome of this method is counted too.
            await record_progress('success')
            return DownloadResult(success=True, file_path=None, file_hash=None, file_size=0)

        # Update database
        self.db.update_attachment_status(att_id, 'completed',
            local_path=str(dest_path),
            local_filename=dest_path.name,
            file_hash=file_hash,
            perceptual_hash=perceptual_hash,
            file_size=file_size,
            width=width,
            height=height,
            duration=duration,
            thumbnail_data=thumbnail_data,
            downloaded_at=datetime.now().isoformat()
        )

        self.db.record_download_attempt(
            attachment_id=att_id, url=url, status='success'
        )

        # Check if all attachments for post are complete
        post = self.db.get_post(att['post_id'])
        if post:
            all_complete = all(a['status'] == 'completed' for a in post['attachments'])
            if all_complete:
                self.db.mark_post_downloaded(att['post_id'])

        self.log(f"Downloaded: {att_name} ({file_size} bytes)", 'debug')

        # Update creator stats
        if creator_id:
            self.db.increment_creator_download_stats(creator_id, 1, file_size)

        # Update progress
        await record_progress('success')

        return DownloadResult(
            success=True,
            file_path=str(dest_path),
            file_hash=file_hash,
            file_size=file_size
        )

    except Exception as e:
        # Error during post-download processing (hashing, duplicate check, etc.)
        import traceback
        error = f"Post-processing error: {str(e)}"
        self.log(f"Error processing {att_name}: {error}\n{traceback.format_exc()}", 'error')
        self.db.update_attachment_status(att_id, 'failed',
            error_message=error,
            download_attempts=att.get('download_attempts', 0) + 1,
            last_attempt=datetime.now().isoformat()
        )
        await record_progress('failed')
        return DownloadResult(success=False, error=error)
|
|
|
|
async def _download_stream_with_ffmpeg(self, att: Dict, url: str, dest_path: Path,
                                       creator_id: int = None, total_files: int = None) -> DownloadResult:
    """Download streaming formats (m3u8/HLS, mpd/DASH) using ffmpeg.

    Two strategies:
      * HLS URLs carrying signed query parameters (``Key-Pair-Id=`` or
        ``Policy=``) whose path matches ``https://<host>/new/<id>/<id>``:
        the variant playlist is fetched, each ``segment-N.ts`` entry is
        rewritten to an absolute signed URL, the segments are downloaded
        directly (5 at a time) and ffmpeg only remuxes the local files.
      * Everything else: ffmpeg reads the URL itself. ``CloudFront-*`` query
        parameters are moved into ffmpeg's ``-cookies`` option.

    Args:
        att: Attachment row; reads ``id``, ``name``, ``file_size`` and
            ``download_attempts``.
        url: Playlist/manifest URL.
        dest_path: Output file written by ffmpeg.
        creator_id: Creator used for progress reporting (optional).
        total_files: Batch size used for progress reporting (optional).

    Returns:
        DownloadResult with path, hash and size on success. Any exception is
        caught, the attachment is marked 'failed' and a failed result is
        returned.
    """
    import os as _os
    import re
    import shutil
    import subprocess
    import tempfile
    import aiohttp

    att_id = att['id']
    att_name = att.get('name', 'unknown')

    async def _stop_process(process, stderr_task) -> None:
        """Make sure ffmpeg and its stderr reader do not outlive an error."""
        if not stderr_task.done():
            stderr_task.cancel()
        if process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass
            await process.wait()

    stream_type = 'HLS' if '.m3u8' in url else 'DASH' if '.mpd' in url else 'stream'
    self.log(f"Downloading {stream_type} stream: {att_name}", 'info')

    # Track as active download
    if creator_id and total_files:
        self._active_downloads[att_id] = {
            'name': att_name,
            'size': att.get('file_size'),
            'progress': 0
        }
        self._update_download_status(creator_id, total_files)

    # NOTE(review): nothing in this function assigns temp_m3u8_path or sets
    # use_protocol_whitelist to True, so the branches guarded by them are
    # currently never taken; kept in case other code paths are restored.
    temp_m3u8_path = None
    segment_urls = []
    playlist_duration = 0.0
    try:
        # For HLS with CloudFront signed URLs, we need to modify the playlist
        # to include signed params on each segment URL (ffmpeg doesn't carry them)
        input_source = url
        use_protocol_whitelist = False

        if '.m3u8' in url and ('Key-Pair-Id=' in url or 'Policy=' in url):
            self.log("Processing CloudFront signed HLS playlist", 'debug')

            # Extract signed params from URL
            params_match = re.search(r'\?(.+)$', url)
            if params_match:
                signed_params = '?' + params_match.group(1)

                # Get base URL for constructing absolute segment URLs
                # Fansly: https://cdn3.fansly.com/new/{account}/{media}/{media}.m3u8?...
                base_url_match = re.match(r'(https://[^/]+/new/\d+/\d+)', url)
                if base_url_match:
                    base_url = base_url_match.group(1)

                    # Determine variant path (e.g., media-1) from URL or default
                    # The URL might be master.m3u8 or already a variant like media-1/stream.m3u8
                    if '/media-' in url:
                        # Already a variant URL
                        variant_match = re.search(r'(/media-\d+/)', url)
                        variant_path = variant_match.group(1) if variant_match else '/media-1/'
                    else:
                        # Master playlist - fetch it to find the highest quality variant
                        variant_path = None

                    # Fetch the playlist content
                    async with aiohttp.ClientSession() as session:
                        # If this is a master playlist, fetch it and pick best variant
                        if 'stream.m3u8' not in url and variant_path is None:
                            master_url = url
                            self.log("Fetching master playlist to find best variant", 'debug')
                            async with session.get(master_url) as master_resp:
                                if master_resp.status == 200:
                                    master_content = await master_resp.text()
                                    # Parse master playlist for variant streams
                                    # Look for lines like: media-1/stream.m3u8, media-5/stream.m3u8
                                    # and #EXT-X-STREAM-INF with RESOLUTION=WxH
                                    best_variant = None
                                    best_resolution = 0
                                    # Score of the most recent STREAM-INF line:
                                    # pixel count if RESOLUTION is present,
                                    # otherwise the BANDWIDTH value.
                                    current_bandwidth = 0
                                    for line in master_content.splitlines():
                                        line = line.strip()
                                        if line.startswith('#EXT-X-STREAM-INF:'):
                                            # Extract resolution if available
                                            res_match = re.search(r'RESOLUTION=(\d+)x(\d+)', line)
                                            bw_match = re.search(r'BANDWIDTH=(\d+)', line)
                                            if res_match:
                                                w, h = int(res_match.group(1)), int(res_match.group(2))
                                                current_bandwidth = w * h
                                            elif bw_match:
                                                current_bandwidth = int(bw_match.group(1))
                                            else:
                                                current_bandwidth = 0
                                        elif line and not line.startswith('#'):
                                            # This is a variant URI line
                                            vm = re.search(r'(media-\d+)/stream\.m3u8', line)
                                            if vm:
                                                if current_bandwidth > best_resolution:
                                                    best_resolution = current_bandwidth
                                                    best_variant = vm.group(1)
                                            current_bandwidth = 0
                                    if best_variant:
                                        variant_path = f'/{best_variant}/'
                                        self.log(f"Selected best variant: {best_variant} (resolution score: {best_resolution})", 'info')
                                    else:
                                        variant_path = '/media-1/'
                                        self.log("No variants found in master playlist, defaulting to media-1", 'warning')
                                else:
                                    variant_path = '/media-1/'
                                    self.log(f"Failed to fetch master playlist: HTTP {master_resp.status}, defaulting to media-1", 'warning')

                            variant_url = f"{base_url}{variant_path}stream.m3u8{signed_params}"
                            self.log(f"Fetching variant playlist: {variant_path}stream.m3u8", 'debug')
                        elif 'stream.m3u8' not in url:
                            variant_url = f"{base_url}{variant_path}stream.m3u8{signed_params}"
                            self.log(f"Fetching variant playlist: {variant_path}stream.m3u8", 'debug')
                        else:
                            variant_url = url

                        async with session.get(variant_url) as resp:
                            if resp.status == 200:
                                playlist_content = await resp.text()
                                # Normalize CRLF to LF (Windows -> Unix line endings)
                                playlist_content = playlist_content.replace('\r\n', '\n').replace('\r', '\n')

                                # Replace relative segment URLs with absolute signed URLs
                                # Matches: segment-0.ts, segment-123.ts, etc.
                                # Capture base_url and variant_path in closure
                                local_base = base_url
                                local_variant = variant_path
                                local_params = signed_params

                                def replace_segment(match):
                                    segment = match.group(1)
                                    return f"{local_base}{local_variant}{segment}{local_params}"

                                modified_content = re.sub(
                                    r'^(segment-\d+\.ts)$',
                                    replace_segment,
                                    playlist_content,
                                    flags=re.MULTILINE
                                )

                                # Extract signed segment URLs and total duration for direct download
                                for seg_line in modified_content.splitlines():
                                    seg_line = seg_line.strip()
                                    if seg_line.startswith('#EXTINF:'):
                                        try:
                                            playlist_duration += float(seg_line.split(':')[1].split(',')[0])
                                        except (ValueError, IndexError):
                                            pass
                                    elif seg_line and not seg_line.startswith('#') and seg_line.startswith('https://'):
                                        segment_urls.append(seg_line)
                                self.log(f"Extracted {len(segment_urls)} segment URLs, playlist duration: {playlist_duration:.2f}s", 'info')
                            else:
                                self.log(f"Failed to fetch variant playlist: HTTP {resp.status}", 'warning')

        if segment_urls:
            # Direct segment download approach - avoids ffmpeg HLS networking
            # which stalls on large 4K streams with CloudFront signed URLs
            temp_dir = tempfile.mkdtemp(prefix='hls_segments_')
            try:
                total_segments = len(segment_urls)
                self.log(f"Downloading {total_segments} HLS segments directly", 'info')

                segment_paths = {}
                completed_count = 0
                total_bytes = 0
                progress_lock = asyncio.Lock()

                seg_timeout = aiohttp.ClientTimeout(total=120, connect=15, sock_read=60)
                connector = aiohttp.TCPConnector(limit=5)

                async with aiohttp.ClientSession(timeout=seg_timeout, connector=connector) as dl_session:
                    semaphore = asyncio.Semaphore(5)

                    async def _dl_segment(idx, seg_url):
                        """Fetch one segment to temp_dir, up to 3 attempts."""
                        nonlocal completed_count, total_bytes
                        seg_path = _os.path.join(temp_dir, f'segment-{idx:05d}.ts')

                        async with semaphore:
                            for attempt in range(3):
                                try:
                                    async with dl_session.get(seg_url) as seg_resp:
                                        if seg_resp.status != 200:
                                            raise Exception(f"HTTP {seg_resp.status}")
                                        with open(seg_path, 'wb') as sf:
                                            async for chunk in seg_resp.content.iter_chunked(65536):
                                                sf.write(chunk)
                                    break
                                except Exception as e:
                                    if attempt == 2:
                                        raise Exception(f"Segment {idx}/{total_segments} failed after 3 attempts: {e}")
                                    self.log(f"Segment {idx} attempt {attempt+1} failed: {e}, retrying", 'warning')
                                    try:
                                        _os.unlink(seg_path)
                                    except OSError:
                                        pass
                                    await asyncio.sleep(2 ** attempt)

                        seg_size = _os.path.getsize(seg_path)
                        segment_paths[idx] = seg_path

                        async with progress_lock:
                            completed_count += 1
                            total_bytes += seg_size

                            # Update progress every ~5%
                            update_interval = max(1, total_segments // 20)
                            if completed_count % update_interval == 0 or completed_count == total_segments:
                                pct = completed_count / total_segments
                                est_total = int(total_bytes / pct) if pct > 0.01 else 0
                                if att_id in self._active_downloads:
                                    self._active_downloads[att_id]['progress'] = total_bytes
                                    self._active_downloads[att_id]['size'] = est_total
                                if creator_id and total_files:
                                    self._update_download_status(creator_id, total_files)
                                self.log(f"Segments: {completed_count}/{total_segments} ({self._format_bytes(total_bytes)})", 'debug')

                    # Download all segments with limited concurrency
                    tasks = [asyncio.ensure_future(_dl_segment(i, u)) for i, u in enumerate(segment_urls)]
                    try:
                        await asyncio.gather(*tasks)
                    except Exception:
                        # gather() leaves the other tasks running after the
                        # first failure; stop them before the session closes
                        # and temp_dir is removed underneath them.
                        for task in tasks:
                            task.cancel()
                        await asyncio.gather(*tasks, return_exceptions=True)
                        raise

                self.log(f"All {total_segments} segments downloaded ({self._format_bytes(total_bytes)})", 'info')

                # Create concat list for ffmpeg
                concat_path = _os.path.join(temp_dir, 'concat.txt')
                with open(concat_path, 'w') as cf:
                    for i in range(total_segments):
                        cf.write(f"file '{segment_paths[i]}'\n")

                # Remux segments to MP4 with ffmpeg (local files only, no network)
                self.log(f"Remuxing {total_segments} segments to MP4...", 'info')
                remux_cmd = [
                    'ffmpeg', '-y',
                    '-f', 'concat',
                    '-safe', '0',
                    '-i', concat_path,
                    '-c', 'copy',
                    '-bsf:a', 'aac_adtstoasc',
                    '-avoid_negative_ts', 'make_zero',
                    '-fflags', '+genpts',
                ]
                # Trim to exact playlist duration to remove HLS segment padding
                if playlist_duration > 0:
                    remux_cmd.extend(['-t', f'{playlist_duration:.6f}'])
                remux_cmd.extend([
                    '-progress', 'pipe:1', '-nostats',
                    str(dest_path)
                ])
                process = await asyncio.create_subprocess_exec(
                    *remux_cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )
                # Drain stderr while stdout is being read: if ffmpeg fills the
                # stderr pipe buffer it blocks, and the loop below would wait
                # forever for the next progress line.
                stderr_task = asyncio.ensure_future(process.stderr.read())
                try:
                    # Track remux progress
                    remux_last_update = 0
                    duration_us = int(playlist_duration * 1_000_000) if playlist_duration > 0 else 0
                    while True:
                        line = await process.stdout.readline()
                        if not line:
                            break
                        line_str = line.decode('utf-8', errors='replace').strip()
                        if line_str.startswith('out_time_us=') and duration_us > 0:
                            try:
                                current_us = int(line_str.split('=')[1])
                                if current_us > 0:
                                    pct = min(current_us / duration_us, 1.0)
                                    if pct - remux_last_update >= 0.10:
                                        remux_last_update = pct
                                        self.log(f"Remuxing: {int(pct * 100)}%", 'info')
                                        if att_id in self._active_downloads:
                                            self._active_downloads[att_id]['progress'] = total_bytes
                                            self._active_downloads[att_id]['size'] = total_bytes
                                            self._active_downloads[att_id]['name'] = f"{att_name} (remuxing {int(pct * 100)}%)"
                                        if creator_id and total_files:
                                            self._update_download_status(creator_id, total_files)
                            except (ValueError, ZeroDivisionError):
                                pass

                    stderr_data = await stderr_task
                    await process.wait()
                finally:
                    await _stop_process(process, stderr_task)

                # Restore original name
                if att_id in self._active_downloads:
                    self._active_downloads[att_id]['name'] = att_name

                if process.returncode != 0:
                    error_msg = stderr_data.decode('utf-8', errors='replace')[-2000:]
                    raise Exception(f"ffmpeg remux failed: {error_msg}")

            finally:
                shutil.rmtree(temp_dir, ignore_errors=True)

        else:
            # Standard ffmpeg approach for non-CloudFront HLS or DASH streams
            # Parse total duration from playlist for progress tracking
            total_duration_us = 0
            if use_protocol_whitelist and temp_m3u8_path:
                try:
                    with open(temp_m3u8_path, 'r') as f:
                        for line in f:
                            if line.startswith('#EXTINF:'):
                                duration_str = line.split(':')[1].split(',')[0]
                                total_duration_us += int(float(duration_str) * 1_000_000)
                except Exception as e:
                    self.log(f"Could not parse playlist duration: {e}", 'debug')

            # Build ffmpeg command
            cmd = ['ffmpeg', '-y']

            if use_protocol_whitelist:
                cmd.extend(['-protocol_whitelist', 'file,http,https,tcp,tls'])

            # For DASH/HLS with CloudFront signed params, extract them from the URL
            # and pass as cookies so ffmpeg carries auth to ALL requests (manifest + segments)
            if '?' in input_source and 'CloudFront-' in input_source:
                from urllib.parse import urlparse as _urlparse, parse_qs as _parse_qs, urlunparse as _urlunparse
                parsed_url = _urlparse(input_source)
                domain = parsed_url.hostname
                params = _parse_qs(parsed_url.query)

                # Extract CloudFront params and build cookies
                cf_cookies = []
                remaining_params = []
                for key, values in params.items():
                    val = values[0] if values else ''
                    if key.startswith('CloudFront-'):
                        cf_cookies.append(f"{key}={val}; path=/; domain={domain};\r\n")
                    else:
                        remaining_params.append(f"{key}={val}")

                if cf_cookies:
                    cookie_str = ''.join(cf_cookies)
                    cmd.extend(['-cookies', cookie_str])
                    # Rebuild URL without CloudFront params
                    clean_query = '&'.join(remaining_params)
                    input_source = _urlunparse(parsed_url._replace(query=clean_query))
                    self.log(f"Passing {len(cf_cookies)} CloudFront cookies for {stream_type} stream", 'debug')

            cmd.extend(['-i', input_source])

            if total_duration_us > 0:
                cmd.extend(['-progress', 'pipe:1', '-nostats'])

            cmd.extend([
                '-c', 'copy',
                '-bsf:a', 'aac_adtstoasc',
                '-movflags', '+faststart',
                str(dest_path)
            ])

            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stderr_data = b''
            if total_duration_us > 0 and creator_id and total_files:
                last_update = 0
                stall_timeout = 120

                # Same reasoning as in the remux path: keep stderr drained
                # while progress lines are read from stdout.
                stderr_task = asyncio.ensure_future(process.stderr.read())
                try:
                    while True:
                        try:
                            line = await asyncio.wait_for(process.stdout.readline(), timeout=stall_timeout)
                        except asyncio.TimeoutError:
                            self.log(f"ffmpeg stalled for {stall_timeout}s downloading {att_name}, killing", 'warning')
                            process.kill()
                            await process.wait()
                            raise Exception(f"ffmpeg stalled (no output for {stall_timeout}s)")
                        if not line:
                            break
                        line_str = line.decode('utf-8', errors='replace').strip()

                        if line_str.startswith('out_time_us='):
                            try:
                                current_us = int(line_str.split('=')[1])
                                if current_us > 0:
                                    progress_pct = min(current_us / total_duration_us, 1.0)
                                    try:
                                        actual_size = dest_path.stat().st_size if dest_path.exists() else 0
                                    except OSError:
                                        actual_size = 0
                                    estimated_total = int(actual_size / progress_pct) if progress_pct > 0.01 else 0

                                    if progress_pct - last_update >= 0.02:
                                        last_update = progress_pct
                                        if att_id in self._active_downloads:
                                            self._active_downloads[att_id]['progress'] = actual_size
                                            self._active_downloads[att_id]['size'] = estimated_total
                                        self._update_download_status(creator_id, total_files)
                            except (ValueError, ZeroDivisionError):
                                pass

                    stderr_data = await stderr_task
                    await process.wait()
                finally:
                    await _stop_process(process, stderr_task)
            else:
                # communicate() reads both pipes concurrently
                _, stderr_data = await process.communicate()

            if process.returncode != 0:
                error_msg = stderr_data.decode('utf-8', errors='replace')[-2000:]
                raise Exception(f"ffmpeg failed: {error_msg}")

        # Verify file was created
        if not dest_path.exists():
            raise Exception("Output file was not created")

        file_size = dest_path.stat().st_size
        if file_size == 0:
            dest_path.unlink()
            raise Exception("Output file is empty")

        self.log(f"Downloaded {stream_type} stream: {att_name} ({self._format_bytes(file_size)})", 'info')

        # Calculate file hash
        file_hash = await self._compute_file_hash_async(dest_path)

        # Update database
        self.db.update_attachment_status(
            att_id, 'completed',
            local_path=str(dest_path),
            local_filename=dest_path.name,
            file_hash=file_hash,
            file_size=file_size,
            downloaded_at=datetime.now().isoformat()
        )

        # Get dimensions from the downloaded file (best effort)
        try:
            probe_cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
                         '-show_entries', 'stream=width,height', '-of', 'csv=p=0', str(dest_path)]
            result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
            if result.returncode == 0 and result.stdout.strip():
                parts = result.stdout.strip().split(',')
                if len(parts) == 2:
                    width, height = int(parts[0]), int(parts[1])
                    self.db.update_attachment(att_id, {'width': width, 'height': height})
                    self.log(f"Video dimensions: {width}x{height}", 'debug')
        except Exception as e:
            self.log(f"Could not get video dimensions: {e}", 'debug')

        # Update progress
        if creator_id and total_files:
            self._active_downloads.pop(att_id, None)
            async with self._results_lock:
                self._download_progress['completed'] += 1
                self._download_progress['success'] += 1
            self._update_download_status(creator_id, total_files)

        return DownloadResult(
            success=True,
            file_path=str(dest_path),
            file_hash=file_hash,
            file_size=file_size
        )

    except Exception as e:
        error = f"Stream download failed: {str(e)}"
        self.log(f"Error downloading {att_name}: {error}", 'error')
        self.db.update_attachment_status(att_id, 'failed',
            error_message=error,
            download_attempts=att.get('download_attempts', 0) + 1,
            last_attempt=datetime.now().isoformat()
        )
        if creator_id and total_files:
            self._active_downloads.pop(att_id, None)
            async with self._results_lock:
                self._download_progress['completed'] += 1
                self._download_progress['failed'] += 1
            self._update_download_status(creator_id, total_files)
        return DownloadResult(success=False, error=error)
    finally:
        # Clean up temp playlist file
        if temp_m3u8_path:
            try:
                _os.unlink(temp_m3u8_path)
            except OSError:
                pass
|
|
|
|
async def _download_pending_embeds(self, creator_id: int, base_path: Path, creator: Dict) -> Dict:
|
|
"""Download pending embedded videos using yt-dlp"""
|
|
from .embed_downloader import EmbedDownloader
|
|
|
|
pending = self.db.get_pending_embeds(creator_id=creator_id)
|
|
if not pending:
|
|
return {'downloaded': 0, 'failed': 0}
|
|
|
|
self.log(f"Downloading {len(pending)} embeds for {creator['username']}", 'info')
|
|
|
|
downloader = EmbedDownloader(log_callback=self.log_callback)
|
|
quality = self.config.get('embed_quality', 'best')
|
|
|
|
downloaded = 0
|
|
failed = 0
|
|
|
|
for embed in pending:
|
|
try:
|
|
post = self.db.get_post(embed['post_id'])
|
|
if not post:
|
|
continue
|
|
|
|
# Build output directory
|
|
post_date = post.get('published_at', '')[:10] or 'unknown-date'
|
|
post_title = self._sanitize_filename(post.get('title') or '')[:50]
|
|
post_id_short = post.get('post_id', 'unknown')[:12]
|
|
post_dir = f"{post_title}_{post_id_short}" if post_title else post_id_short
|
|
|
|
output_dir = base_path / creator['platform'] / self._sanitize_filename(creator['username']) / post_date / post_dir
|
|
|
|
self.db.update_embed_status(embed['id'], 'downloading')
|
|
|
|
result = await downloader.download(embed['url'], output_dir, quality=quality)
|
|
|
|
if result['success']:
|
|
self.db.update_embed_status(embed['id'], 'completed',
|
|
local_path=result.get('file_path'),
|
|
local_filename=result.get('filename'),
|
|
file_size=result.get('file_size'),
|
|
duration=result.get('duration'),
|
|
title=result.get('title'),
|
|
downloaded_at=datetime.now().isoformat()
|
|
)
|
|
downloaded += 1
|
|
else:
|
|
self.db.update_embed_status(embed['id'], 'failed',
|
|
error_message=result.get('error'),
|
|
download_attempts=embed.get('download_attempts', 0) + 1
|
|
)
|
|
failed += 1
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading embed {embed['url']}: {e}", 'error')
|
|
self.db.update_embed_status(embed['id'], 'failed',
|
|
error_message=str(e),
|
|
download_attempts=embed.get('download_attempts', 0) + 1
|
|
)
|
|
failed += 1
|
|
|
|
return {'downloaded': downloaded, 'failed': failed}
|
|
|
|
async def retry_failed_downloads(self, attachment_ids: List[int] = None) -> Dict:
    """Re-queue failed attachments and run the per-creator download again.

    Args:
        attachment_ids: Specific attachment ids to retry; only those whose
            status is ``'failed'`` are used. When empty or omitted, every
            failed download below ``retry_max_attempts`` is retried.

    Returns:
        Aggregated ``downloaded`` / ``failed`` / ``skipped`` counts.
    """
    if attachment_ids:
        # Explicit selection: keep only rows that are really in failed state
        looked_up = (self.db.get_attachment(one_id) for one_id in attachment_ids)
        retry_list = [row for row in looked_up if row and row['status'] == 'failed']
    else:
        # No selection: everything still under the attempt limit
        attempt_limit = self.config.get('retry_max_attempts', 3)
        retry_list = self.db.get_failed_downloads(max_attempts=attempt_limit)

    totals = {'downloaded': 0, 'failed': 0, 'skipped': 0}
    if not retry_list:
        return totals

    self.log(f"Retrying {len(retry_list)} failed downloads", 'info')

    # Flip everything back to pending so the normal download path picks it up
    for row in retry_list:
        self.db.update_attachment_status(row['id'], 'pending')

    # One download pass per creator, in first-seen order
    creator_ids = list(dict.fromkeys(row['creator_db_id'] for row in retry_list))

    for cid in creator_ids:
        outcome = await self.download_pending_for_creator(cid)
        for bucket in totals:
            totals[bucket] += outcome.get(bucket, 0)

    return totals
|
|
|
|
def _build_file_path(self, base_path: Path, creator: Dict, post: Dict, attachment: Dict) -> Path:
|
|
"""Build destination file path following directory structure"""
|
|
# /paid-content/onlyfans/creatorname/2024-01-15/Post-Title_abc123/001_originalname.jpg
|
|
|
|
platform = creator['platform']
|
|
username = self._sanitize_filename(creator['username'])
|
|
|
|
# Date directory
|
|
post_date = post.get('published_at', '')[:10] or 'unknown-date'
|
|
|
|
# Post directory - always use post_id for consistency
|
|
post_id = post.get('post_id', 'unknown')
|
|
post_dir = post_id
|
|
|
|
# Filename - for Fansly use just the media ID (already unique)
|
|
# For other platforms, use index prefix to avoid collisions
|
|
index = attachment.get('attachment_index', 0) + 1
|
|
original_name = attachment.get('name', '')
|
|
|
|
if original_name:
|
|
# Sanitize the original filename
|
|
sanitized_name = self._sanitize_filename(original_name)
|
|
# Fansly media IDs are unique, no index needed
|
|
if platform == 'fansly':
|
|
filename = sanitized_name
|
|
else:
|
|
# Add index prefix for other platforms
|
|
filename = f"{index:03d}_{sanitized_name}"
|
|
else:
|
|
# Fallback to index + extension if no name
|
|
ext = attachment.get('extension') or 'bin'
|
|
if not ext.startswith('.'):
|
|
ext = '.' + ext
|
|
if platform == 'fansly':
|
|
filename = f"attachment_{index}{ext}"
|
|
else:
|
|
filename = f"{index:03d}{ext}"
|
|
|
|
return base_path / platform / username / post_date / post_dir / filename
|
|
|
|
def _sanitize_filename(self, name: str) -> str:
|
|
"""Sanitize string for use in filename/directory"""
|
|
if not name:
|
|
return 'unnamed'
|
|
# Remove/replace invalid characters
|
|
name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
|
|
name = re.sub(r'\s+', '-', name.strip())
|
|
return name or 'unnamed'
|
|
|
|
def _build_message_file_path(self, base_path: Path, creator: Dict, message: Dict, attachment: Dict) -> Path:
|
|
"""Build destination file path for message attachments"""
|
|
# /paid-content/onlyfans/creatorname/messages/2024-01-15/001_originalname.jpg
|
|
platform = creator['platform']
|
|
username = self._sanitize_filename(creator['username'])
|
|
|
|
msg_date = (message.get('sent_at') or '')[:10] or 'unknown-date'
|
|
|
|
index = attachment.get('attachment_index', 0) + 1
|
|
original_name = attachment.get('name', '')
|
|
|
|
if original_name:
|
|
sanitized_name = self._sanitize_filename(original_name)
|
|
filename = f"{index:03d}_{sanitized_name}"
|
|
else:
|
|
ext = attachment.get('extension') or 'bin'
|
|
if not ext.startswith('.'):
|
|
ext = '.' + ext
|
|
filename = f"{index:03d}{ext}"
|
|
|
|
return base_path / platform / username / 'messages' / msg_date / filename
|
|
|
|
async def _sync_messages_for_creator(self, creator: Dict, client, platform: str) -> int:
|
|
"""
|
|
Sync messages for a creator.
|
|
|
|
Args:
|
|
creator: Creator dict from database
|
|
client: OnlyFansClient or FanslyDirectClient instance
|
|
platform: 'onlyfans' or 'fansly'
|
|
|
|
Returns:
|
|
Count of new messages
|
|
"""
|
|
creator_id = creator['id']
|
|
new_messages = 0
|
|
|
|
try:
|
|
self.log(f"Syncing messages for {creator['username']} ({platform})", 'info')
|
|
|
|
if platform == 'onlyfans':
|
|
of_user_id = creator.get('creator_id', '')
|
|
if not of_user_id:
|
|
return 0
|
|
messages = await client.get_messages(of_user_id)
|
|
elif platform == 'fansly':
|
|
# Find the chat group for this creator
|
|
chat_list = await client.get_chat_list()
|
|
creator_account_id = creator.get('creator_id', '')
|
|
group_id = None
|
|
for chat in chat_list:
|
|
if str(chat.get('partner_account_id')) == str(creator_account_id):
|
|
group_id = chat['group_id']
|
|
break
|
|
if not group_id:
|
|
self.log(f"No chat group found for {creator['username']}", 'debug')
|
|
return 0
|
|
messages = await client.get_messages(group_id, creator_account_id)
|
|
else:
|
|
return 0
|
|
|
|
if not messages:
|
|
self.log(f"No messages for {creator['username']}", 'debug')
|
|
return 0
|
|
|
|
self.log(f"Processing {len(messages)} messages for {creator['username']}", 'info')
|
|
|
|
for msg in messages:
|
|
msg_data = msg.to_dict()
|
|
msg_db_id, is_new = self.db.upsert_message(creator_id, msg_data)
|
|
if is_new:
|
|
new_messages += 1
|
|
|
|
# Upsert attachments for this message
|
|
for idx, attachment in enumerate(msg.attachments):
|
|
att_data = attachment.to_dict()
|
|
att_data['attachment_index'] = idx
|
|
self.db.upsert_message_attachment(msg_db_id, att_data)
|
|
|
|
self.log(f"Synced {len(messages)} messages ({new_messages} new) for {creator['username']}", 'info')
|
|
return new_messages
|
|
|
|
except Exception as e:
|
|
self.log(f"Error syncing messages for {creator['username']}: {e}", 'error')
|
|
return 0
|
|
|
|
def _extract_embeds(self, content: str) -> List[Tuple[str, str, str]]:
|
|
"""Extract embedded video URLs from post content"""
|
|
if not content:
|
|
return []
|
|
|
|
embeds = []
|
|
for pattern, platform in self.EMBED_PATTERNS:
|
|
for match in re.finditer(pattern, content):
|
|
url = match.group(0)
|
|
video_id = match.group(1)
|
|
embeds.append((url, platform, video_id))
|
|
|
|
return embeds
|
|
|
|
async def _compute_file_hash_async(self, file_path: Path) -> Optional[str]:
|
|
"""Compute SHA256 hash of file asynchronously"""
|
|
if not file_path.exists():
|
|
return None
|
|
|
|
sha256 = hashlib.sha256()
|
|
|
|
async with aiofiles.open(file_path, 'rb') as f:
|
|
while chunk := await f.read(65536):
|
|
sha256.update(chunk)
|
|
|
|
return sha256.hexdigest()
|
|
|
|
def _format_bytes(self, size: int) -> str:
|
|
"""Format bytes to human readable string"""
|
|
if size is None:
|
|
return "0 B"
|
|
for unit in ['B', 'KB', 'MB', 'GB']:
|
|
if abs(size) < 1024.0:
|
|
return f"{size:.1f} {unit}"
|
|
size /= 1024.0
|
|
return f"{size:.1f} TB"
|
|
|
|
@staticmethod
|
|
def _get_platform_display_name(creator: Dict) -> str:
|
|
"""Get clean platform display name for notifications."""
|
|
_PLATFORM_NAMES = {
|
|
'onlyfans_direct': 'OnlyFans',
|
|
'onlyfans': 'OnlyFans',
|
|
'fansly_direct': 'Fansly',
|
|
'fansly': 'Fansly',
|
|
'pornhub': 'Pornhub',
|
|
'youtube': 'YouTube',
|
|
'twitch': 'Twitch',
|
|
'coomer': 'Coomer',
|
|
'kemono': 'Kemono',
|
|
}
|
|
key = creator.get('service_id') or creator.get('platform') or 'Unknown'
|
|
return _PLATFORM_NAMES.get(key, key.replace('_', ' ').title())
|
|
|
|
def _send_creator_notification(self, creator: Dict, new_posts: int, downloaded: int,
|
|
downloaded_file_info: List[Dict], scheduled: bool = False,
|
|
new_messages: int = 0):
|
|
"""Send push notification and create DB record for a creator sync.
|
|
|
|
Args:
|
|
creator: Creator dict with id, username, service_id, platform
|
|
new_posts: Number of new posts found
|
|
downloaded: Number of files downloaded
|
|
downloaded_file_info: List of dicts with file info
|
|
scheduled: Whether this is a scheduled sync
|
|
new_messages: Number of new messages found
|
|
"""
|
|
if not (scheduled and (downloaded > 0 or new_messages > 0)):
|
|
return
|
|
|
|
# If creator has a tagged-user filter, recount to only include matching posts
|
|
filter_tagged = creator.get('filter_tagged_users', '') or ''
|
|
filter_tagged = filter_tagged.strip()
|
|
if filter_tagged and filter_tagged != '[]':
|
|
try:
|
|
import json as _json
|
|
filter_users = _json.loads(filter_tagged)
|
|
if isinstance(filter_users, list) and filter_users:
|
|
placeholders = ','.join(['?'] * len(filter_users))
|
|
with self.db.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
# Count new posts that have at least one matching tagged user
|
|
cursor.execute(f"""
|
|
SELECT COUNT(DISTINCT p.id)
|
|
FROM paid_content_posts p
|
|
JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
|
|
WHERE p.creator_id = ?
|
|
AND tu.username IN ({placeholders})
|
|
AND p.created_at >= datetime('now', '-1 hour')
|
|
""", (creator['id'], *filter_users))
|
|
filtered_new_posts = cursor.fetchone()[0]
|
|
# Count downloaded attachments from matching posts
|
|
cursor.execute(f"""
|
|
SELECT COUNT(DISTINCT a.id)
|
|
FROM paid_content_attachments a
|
|
JOIN paid_content_posts p ON a.post_id = p.id
|
|
JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
|
|
WHERE p.creator_id = ?
|
|
AND tu.username IN ({placeholders})
|
|
AND a.status = 'downloaded'
|
|
AND a.updated_at >= datetime('now', '-1 hour')
|
|
""", (creator['id'], *filter_users))
|
|
filtered_downloaded = cursor.fetchone()[0]
|
|
self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, "
|
|
f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug')
|
|
new_posts = filtered_new_posts
|
|
downloaded = filtered_downloaded
|
|
# Skip notification entirely if nothing matches the filter
|
|
if downloaded == 0 and new_messages == 0:
|
|
return
|
|
except Exception as e:
|
|
self.log(f"Error applying notification filter: {e}", 'debug')
|
|
|
|
self.log(f"Push check: scheduled={scheduled}, new_posts={new_posts}, downloaded={downloaded}, "
|
|
f"notifier={self.notifier is not None}, push_enabled={self.config.get('push_notifications_enabled')}", 'debug')
|
|
|
|
if not (self.notifier and self.config.get('push_notifications_enabled')):
|
|
# Still create DB notification even if push is disabled
|
|
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
platform = self._get_platform_display_name(creator)
|
|
if downloaded > 0:
|
|
title = f"💎 {downloaded} File{'s' if downloaded != 1 else ''} Downloaded"
|
|
else:
|
|
title = f"💬 {new_messages} New Message{'s' if new_messages != 1 else ''}"
|
|
msg_lines = [f"📱 Platform: {platform}", f"📄 Creator: {creator['username']}"]
|
|
if new_posts > 0:
|
|
msg_lines.append(f"📝 Posts: {new_posts}")
|
|
if new_messages > 0:
|
|
msg_lines.append(f"💬 Messages: {new_messages}")
|
|
msg_lines.append(f"\n⏰ {timestamp}")
|
|
message = "\n".join(msg_lines)
|
|
self.db.create_notification(
|
|
notification_type='new_messages' if downloaded == 0 else 'new_content',
|
|
creator_id=creator['id'],
|
|
title=title,
|
|
message=message,
|
|
download_count=new_posts,
|
|
file_count=downloaded,
|
|
media_files=downloaded_file_info[:5] if downloaded_file_info else None
|
|
)
|
|
return
|
|
|
|
# Count images vs videos vs audio and collect media files for preview
|
|
import random
|
|
from pathlib import Path
|
|
image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
|
|
video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.m4v'}
|
|
audio_extensions = {'.mp3', '.wav', '.flac', '.aac', '.m4a', '.ogg', '.wma'}
|
|
image_files = []
|
|
video_files = []
|
|
image_count = 0
|
|
video_count = 0
|
|
audio_count = 0
|
|
|
|
for file_info in downloaded_file_info:
|
|
file_path = file_info.get('file_path') or file_info.get('local_path')
|
|
if file_path:
|
|
path = Path(file_path)
|
|
ext = path.suffix.lower()
|
|
# Prefer content_type from file_info (set by scraper) over extension
|
|
content_type = file_info.get('content_type', '').lower()
|
|
if content_type == 'audio':
|
|
audio_count += 1
|
|
elif content_type == 'image':
|
|
image_count += 1
|
|
if path.exists():
|
|
image_files.append(str(path))
|
|
elif content_type == 'video':
|
|
video_count += 1
|
|
if path.exists():
|
|
video_files.append(str(path))
|
|
elif ext in audio_extensions:
|
|
audio_count += 1
|
|
elif ext in image_extensions:
|
|
image_count += 1
|
|
if path.exists():
|
|
image_files.append(str(path))
|
|
elif ext in video_extensions:
|
|
video_count += 1
|
|
if path.exists():
|
|
video_files.append(str(path))
|
|
|
|
# Select preview: prefer images, but extract frame from video if no images
|
|
image_path = None
|
|
temp_frame_path = None
|
|
if image_files:
|
|
image_path = random.choice(image_files)
|
|
elif video_files and hasattr(self.notifier, '_extract_random_video_frame'):
|
|
selected_video = random.choice(video_files)
|
|
self.log(f"Extracting frame from video for notification preview: {Path(selected_video).name}", 'debug')
|
|
temp_frame_path = self.notifier._extract_random_video_frame(selected_video)
|
|
if temp_frame_path:
|
|
image_path = temp_frame_path
|
|
self.log(f"Successfully extracted video frame for preview", 'debug')
|
|
|
|
# Build title with counts
|
|
title_parts = []
|
|
if image_count > 0:
|
|
title_parts.append(f"📸 {image_count} Image{'s' if image_count != 1 else ''}")
|
|
if video_count > 0:
|
|
title_parts.append(f"🎬 {video_count} Video{'s' if video_count != 1 else ''}")
|
|
if audio_count > 0:
|
|
title_parts.append(f"🎵 {audio_count} Audio")
|
|
if downloaded > 0 and not title_parts:
|
|
title_parts.append(f"💎 {downloaded} File{'s' if downloaded != 1 else ''}")
|
|
if title_parts:
|
|
title = " + ".join(title_parts) + " Downloaded"
|
|
elif new_messages > 0:
|
|
title = f"💬 {new_messages} New Message{'s' if new_messages != 1 else ''}"
|
|
else:
|
|
title = f"💎 {downloaded} File{'s' if downloaded != 1 else ''} Downloaded"
|
|
|
|
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
platform = self._get_platform_display_name(creator)
|
|
|
|
msg_lines = [f"📱 Platform: {platform}", f"📄 Creator: {creator['username']}"]
|
|
if new_posts > 0:
|
|
msg_lines.append(f"📝 Posts: {new_posts}")
|
|
if new_messages > 0:
|
|
msg_lines.append(f"💬 Messages: {new_messages}")
|
|
msg_lines.append(f"\n⏰ {timestamp}")
|
|
push_message = "\n".join(msg_lines)
|
|
|
|
try:
|
|
self.notifier.send_notification(
|
|
title=title,
|
|
message=push_message,
|
|
priority=-1 if downloaded == 0 else 0,
|
|
image_path=image_path
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to send push notification: {e}", 'warning')
|
|
|
|
# Clean up temp frame file if created
|
|
if temp_frame_path:
|
|
try:
|
|
Path(temp_frame_path).unlink(missing_ok=True)
|
|
except Exception:
|
|
pass
|
|
|
|
# Create notification record in database
|
|
self.db.create_notification(
|
|
notification_type='new_messages' if downloaded == 0 else 'new_content',
|
|
creator_id=creator['id'],
|
|
title=title,
|
|
message=push_message,
|
|
download_count=new_posts,
|
|
file_count=downloaded,
|
|
media_files=downloaded_file_info[:5] if downloaded_file_info else None
|
|
)
|
|
|
|
async def search_creators(self, service_id: str, query: str, platform: str = None) -> List[Dict]:
    """Look up creators on a service by delegating to that service's client."""
    service_client = self._get_client(service_id)
    matches = await service_client.search_creators(query, platform)
    return matches
|
|
|
|
async def add_creator(self, service_id: str, platform: str, creator_id: str,
                      auto_download: bool = True, download_embeds: bool = True) -> Dict:
    """Start tracking a creator.

    Services with their own onboarding routine are forwarded to it; XenForo
    forums go to the XenForo routine; everything else is resolved through
    the generic API client for ``service_id``.

    Returns:
        ``{'success': True, 'creator': {...}}`` on the generic path, or
        ``{'success': False, 'error': ...}`` when the creator is already
        tracked or cannot be found. Delegated services return whatever
        their routine returns.
    """
    already_tracked = self.db.get_creator_by_api_id(service_id, platform, creator_id)
    if already_tracked:
        return {'success': False, 'error': 'Creator already tracked', 'creator': already_tracked}

    # service_id -> name of the dedicated method taking
    # (creator_id, auto_download, download_embeds)
    dedicated_handlers = {
        'youtube': '_add_youtube_creator',
        'twitch': '_add_twitch_creator',
        'fansly_direct': '_add_fansly_direct_creator',
        'onlyfans_direct': '_add_onlyfans_direct_creator',
        'pornhub': '_add_pornhub_creator',
        'xhamster': '_add_xhamster_creator',
        'tiktok': '_add_tiktok_creator',
        'instagram': '_add_instagram_creator',
        'soundgasm': '_add_soundgasm_creator',
        'besteyecandy': '_add_besteyecandy_creator',
        'bellazon': '_add_bellazon_creator',
        'snapchat': '_add_snapchat_creator',
        'reddit': '_add_reddit_creator',
        'coppermine': '_add_coppermine_creator',
    }
    handler_name = dedicated_handlers.get(service_id)
    if handler_name is not None:
        handler = getattr(self, handler_name)
        return await handler(creator_id, auto_download, download_embeds)

    # XenForo forums (HQCelebCorner, PicturePub, etc.) also need the service id
    if service_id in self.XENFORO_FORUMS:
        return await self._add_xenforo_creator(service_id, creator_id, auto_download, download_embeds)

    # Generic path: ask the service API about the creator
    api_client = self._get_client(service_id)
    info = await api_client.get_creator(platform, creator_id)
    if not info:
        return {'success': False, 'error': f'Creator not found on {service_id}'}

    record = info.to_dict()
    record['auto_download'] = 1 if auto_download else 0
    record['download_embeds'] = 1 if download_embeds else 0

    new_id = self.db.add_creator(record)

    return {'success': True, 'creator': {'id': new_id, **record}}
|
|
|
|
async def _add_youtube_creator(self, channel_id_or_url: str,
|
|
auto_download: bool = True, download_embeds: bool = True) -> Dict:
|
|
"""Add a YouTube channel as a creator"""
|
|
youtube = self._get_youtube_client()
|
|
if not youtube.is_available():
|
|
return {'success': False, 'error': 'yt-dlp not available'}
|
|
|
|
# Extract channel ID from URL if necessary
|
|
if 'youtube.com' in channel_id_or_url or 'youtu.be' in channel_id_or_url:
|
|
extracted_id = youtube.extract_channel_id(channel_id_or_url)
|
|
if extracted_id:
|
|
channel_id = extracted_id
|
|
channel_url = channel_id_or_url
|
|
else:
|
|
return {'success': False, 'error': 'Could not extract channel ID from URL'}
|
|
else:
|
|
channel_id = channel_id_or_url
|
|
channel_url = youtube.normalize_channel_url(channel_id)
|
|
|
|
# Fetch channel info
|
|
creator_info = await youtube.get_creator(channel_url)
|
|
if not creator_info:
|
|
return {'success': False, 'error': 'YouTube channel not found'}
|
|
|
|
# Add to database
|
|
creator_data = creator_info.to_dict()
|
|
creator_data['auto_download'] = 1 if auto_download else 0
|
|
creator_data['download_embeds'] = 1 if download_embeds else 0
|
|
|
|
db_id = self.db.add_creator(creator_data)
|
|
|
|
return {
|
|
'success': True,
|
|
'creator': {
|
|
'id': db_id,
|
|
**creator_data
|
|
}
|
|
}
|
|
|
|
async def _add_pornhub_creator(self, creator_id_str: str,
|
|
auto_download: bool = True, download_embeds: bool = True) -> Dict:
|
|
"""Add a Pornhub creator (pornstar/channel/user/model)"""
|
|
pornhub = self._get_pornhub_client()
|
|
if not pornhub.is_available():
|
|
return {'success': False, 'error': 'yt-dlp not available'}
|
|
|
|
# creator_id_str is in 'type/name' format from URL parser
|
|
creator_url = pornhub.normalize_creator_url(creator_id_str)
|
|
|
|
# Fetch creator info
|
|
creator_info = await pornhub.get_creator(creator_url)
|
|
if not creator_info:
|
|
return {'success': False, 'error': 'Pornhub creator not found'}
|
|
|
|
# Add to database
|
|
creator_data = creator_info.to_dict()
|
|
creator_data['auto_download'] = 1 if auto_download else 0
|
|
creator_data['download_embeds'] = 1 if download_embeds else 0
|
|
|
|
db_id = self.db.add_creator(creator_data)
|
|
|
|
return {
|
|
'success': True,
|
|
'creator': {
|
|
'id': db_id,
|
|
**creator_data
|
|
}
|
|
}
|
|
|
|
async def _add_twitch_creator(self, channel_name_or_url: str,
|
|
auto_download: bool = True, download_embeds: bool = True) -> Dict:
|
|
"""Add a Twitch channel as a creator"""
|
|
twitch = self._get_twitch_client()
|
|
if not twitch.is_available():
|
|
return {'success': False, 'error': 'yt-dlp not available'}
|
|
|
|
# Extract channel name from URL if necessary
|
|
if 'twitch.tv' in channel_name_or_url:
|
|
extracted_name = twitch.extract_channel_name(channel_name_or_url)
|
|
if extracted_name:
|
|
channel_name = extracted_name
|
|
else:
|
|
return {'success': False, 'error': 'Could not extract channel name from URL'}
|
|
else:
|
|
channel_name = channel_name_or_url.lower()
|
|
|
|
# Fetch channel info
|
|
channel_url = f"https://www.twitch.tv/{channel_name}/clips"
|
|
creator_info = await twitch.get_creator(channel_url)
|
|
if not creator_info:
|
|
return {'success': False, 'error': 'Twitch channel not found or has no clips'}
|
|
|
|
# Add to database
|
|
creator_data = creator_info.to_dict()
|
|
creator_data['auto_download'] = 1 if auto_download else 0
|
|
creator_data['download_embeds'] = 1 if download_embeds else 0
|
|
|
|
db_id = self.db.add_creator(creator_data)
|
|
|
|
return {
|
|
'success': True,
|
|
'creator': {
|
|
'id': db_id,
|
|
**creator_data
|
|
}
|
|
}
|
|
|
|
def _get_fansly_direct_client(self) -> Optional[FanslyDirectClient]:
    """Return the cached Fansly Direct client, building it on first use.

    The auth token is read from the ``session_cookie`` field of the
    ``fansly_direct`` service record. Returns ``None`` (after logging a
    warning) when no token is configured.
    """
    if self._fansly_direct_client is not None:
        return self._fansly_direct_client

    service_row = self.db.get_service('fansly_direct')
    token = service_row.get('session_cookie') if service_row else None
    if not token:
        self.log("Fansly Direct auth token not configured", 'warning')
        return None

    self._fansly_direct_client = FanslyDirectClient(
        auth_token=token,
        log_callback=self.log_callback
    )
    return self._fansly_direct_client
|
|
|
|
def _apply_auto_tag_rules(self, post_db_id: int, is_new_post: bool):
|
|
"""Apply auto-tag rules to a post (only for new posts)"""
|
|
if not is_new_post:
|
|
return
|
|
try:
|
|
self.db.apply_auto_tags_to_post(post_db_id)
|
|
except Exception:
|
|
pass # Don't let auto-tagging errors break sync
|
|
|
|
def _get_or_create_ppv_tag(self) -> Optional[Dict]:
|
|
"""Get or create the PPV tag for locked content"""
|
|
# Try to get existing PPV tag
|
|
tag = self.db.get_tag_by_slug('ppv')
|
|
if tag:
|
|
return tag
|
|
|
|
# Create PPV tag with a distinct color (orange/gold for premium content)
|
|
tag_id = self.db.create_tag(
|
|
name='PPV',
|
|
color='#f59e0b', # Amber/orange color
|
|
description='Pay-per-view content requiring manual import'
|
|
)
|
|
if tag_id:
|
|
return self.db.get_tag(tag_id)
|
|
return None
|
|
|
|
async def _add_fansly_direct_creator(
|
|
self,
|
|
username: str,
|
|
auto_download: bool = True,
|
|
download_embeds: bool = True
|
|
) -> Dict:
|
|
"""Add a Fansly creator via direct API"""
|
|
client = self._get_fansly_direct_client()
|
|
if not client:
|
|
return {'success': False, 'error': 'Fansly auth token not configured. Please set it in Settings.'}
|
|
|
|
# Check if creator with this username already exists
|
|
existing = self.db.get_creator_by_api_id('fansly_direct', 'fansly', username)
|
|
if existing:
|
|
return {'success': False, 'error': 'Creator already tracked', 'creator': existing}
|
|
|
|
# Fetch account info from Fansly API
|
|
account = await client.get_account_info(username)
|
|
if not account:
|
|
return {'success': False, 'error': f'Fansly account not found: {username}'}
|
|
|
|
# Cache profile images locally
|
|
fansly_creator_id = account.get('account_id') or username
|
|
cached_avatar = await self._cache_profile_image(account.get('avatar_url'), 'fansly', fansly_creator_id, 'avatar') if account.get('avatar_url') else None
|
|
cached_banner = await self._cache_profile_image(account.get('banner_url'), 'fansly', fansly_creator_id, 'banner') if account.get('banner_url') else None
|
|
|
|
# Create creator data
|
|
creator_data = {
|
|
'service_id': 'fansly_direct',
|
|
'platform': 'fansly',
|
|
'creator_id': fansly_creator_id,
|
|
'username': account.get('username') or username,
|
|
'display_name': account.get('display_name'),
|
|
'profile_image_url': cached_avatar or account.get('avatar_url'),
|
|
'banner_image_url': cached_banner or account.get('banner_url'),
|
|
'auto_download': 1 if auto_download else 0,
|
|
'download_embeds': 1 if download_embeds else 0,
|
|
}
|
|
|
|
db_id = self.db.add_creator(creator_data)
|
|
|
|
return {
|
|
'success': True,
|
|
'creator': {
|
|
'id': db_id,
|
|
**creator_data
|
|
}
|
|
}
|
|
|
|
async def _sync_fansly_direct_creator(
|
|
self,
|
|
creator: Dict,
|
|
download: bool = True,
|
|
scheduled: bool = False,
|
|
date_from: str = None,
|
|
date_to: str = None,
|
|
days_back: int = None
|
|
) -> SyncResult:
|
|
"""
|
|
Sync a Fansly creator via direct API.
|
|
|
|
Args:
|
|
creator: Creator dict from database
|
|
download: Whether to download files after syncing
|
|
scheduled: If True, create notifications (for scheduled syncs only)
|
|
date_from: Only fetch posts after this date (ISO format)
|
|
date_to: Only fetch posts before this date (ISO format)
|
|
days_back: Fetch posts from the last N days
|
|
"""
|
|
creator_id = creator['id']
|
|
self.log(f"Syncing Fansly creator via direct API: {creator['username']}", 'info')
|
|
|
|
# Register active sync for polling-based updates
|
|
sync_data = {
|
|
'username': creator['username'],
|
|
'platform': 'fansly',
|
|
'service': 'fansly_direct',
|
|
'status': 'Fetching posts...',
|
|
'phase': 'fetching',
|
|
'failed': 0
|
|
}
|
|
self._register_active_sync(creator_id, sync_data)
|
|
|
|
# Emit WebSocket event
|
|
self._emit_event('paid_content_sync_started', {
|
|
'creator_id': creator_id,
|
|
**sync_data
|
|
})
|
|
|
|
try:
|
|
client = self._get_fansly_direct_client()
|
|
if not client:
|
|
error = "Fansly auth token not configured"
|
|
self._unregister_active_sync(creator_id)
|
|
return SyncResult(success=False, error=error)
|
|
|
|
# Fetch and update creator profile (display name, avatar, banner, bio, etc.)
|
|
try:
|
|
account_info = await client.get_account_info(creator['username'])
|
|
if account_info:
|
|
profile_updates = {}
|
|
if account_info.get('display_name'):
|
|
profile_updates['display_name'] = account_info['display_name']
|
|
if account_info.get('avatar_url'):
|
|
cached = await self._cache_profile_image(account_info['avatar_url'], 'fansly', creator['creator_id'], 'avatar')
|
|
profile_updates['profile_image_url'] = cached or account_info['avatar_url']
|
|
if account_info.get('banner_url'):
|
|
cached = await self._cache_profile_image(account_info['banner_url'], 'fansly', creator['creator_id'], 'banner')
|
|
profile_updates['banner_image_url'] = cached or account_info['banner_url']
|
|
if account_info.get('bio'):
|
|
profile_updates['bio'] = account_info['bio']
|
|
if account_info.get('location'):
|
|
profile_updates['location'] = account_info['location']
|
|
if account_info.get('external_links'):
|
|
profile_updates['external_links'] = account_info['external_links']
|
|
if profile_updates:
|
|
self.db.update_creator(creator_id, profile_updates)
|
|
self.log(f"Updated creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
|
|
except Exception as e:
|
|
self.log(f"Failed to update creator profile: {e}", 'warning')
|
|
|
|
# Determine date filter
|
|
# Priority: explicit date_from/date_to > days_back > scheduled default (3 days) > last_post_date
|
|
since_date = date_from
|
|
until_date = date_to
|
|
|
|
if days_back:
|
|
from datetime import timedelta
|
|
since_date = (datetime.now() - timedelta(days=days_back)).isoformat()
|
|
elif not since_date:
|
|
from datetime import timedelta
|
|
if scheduled:
|
|
# Scheduled syncs only check last 3 days for efficiency
|
|
since_date = (datetime.now() - timedelta(days=3)).isoformat()
|
|
else:
|
|
# Manual syncs use incremental from last post date
|
|
since_date = creator.get('last_post_date')
|
|
|
|
# Progress callback
|
|
def progress_callback(page: int, total_posts: int):
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Fetched {total_posts} posts (page {page})...',
|
|
'posts_fetched': total_posts,
|
|
'page': page
|
|
})
|
|
self._emit_event('paid_content_sync_progress', {
|
|
'creator_id': creator_id,
|
|
'username': creator['username'],
|
|
'status': f'Fetched {total_posts} posts (page {page})...',
|
|
'phase': 'fetching',
|
|
'posts_fetched': total_posts,
|
|
'page': page
|
|
})
|
|
|
|
# Fetch posts from Fansly
|
|
posts = await client.get_posts(
|
|
username=creator['username'],
|
|
since_date=since_date,
|
|
until_date=until_date,
|
|
progress_callback=progress_callback
|
|
)
|
|
|
|
if not posts:
|
|
self.log(f"No new posts for {creator['username']}", 'debug')
|
|
self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})
|
|
|
|
# Still download any pending attachments
|
|
downloaded = 0
|
|
failed = 0
|
|
if download and creator.get('auto_download', True):
|
|
pending_count = self.db.get_pending_attachment_count(creator_id)
|
|
if pending_count > 0:
|
|
self.log(f"Downloading {pending_count} pending attachments", 'info')
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Downloading {pending_count} pending files...',
|
|
'phase': 'downloading',
|
|
'total_files': pending_count
|
|
})
|
|
result = await self.download_pending_for_creator(creator_id)
|
|
downloaded = result.get('downloaded', 0)
|
|
failed = result.get('failed', 0)
|
|
|
|
self._unregister_active_sync(creator_id)
|
|
self._emit_event('paid_content_sync_completed', {
|
|
'creator_id': creator_id,
|
|
'username': creator['username'],
|
|
'new_posts': 0,
|
|
'new_attachments': 0,
|
|
'downloaded': downloaded,
|
|
'failed': failed
|
|
})
|
|
return SyncResult(success=True, new_posts=0, new_attachments=0,
|
|
downloaded_files=downloaded, failed_files=failed)
|
|
|
|
self.log(f"Found {len(posts)} posts for {creator['username']}", 'info')
|
|
|
|
# Update status
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Processing {len(posts)} posts...',
|
|
'phase': 'processing',
|
|
'total_posts': len(posts)
|
|
})
|
|
|
|
new_posts = 0
|
|
new_attachments = 0
|
|
|
|
for i, post in enumerate(posts):
|
|
# Update progress periodically
|
|
if (i + 1) % 10 == 0:
|
|
self._update_active_sync(creator_id, {
|
|
'status': f'Processing post {i + 1}/{len(posts)}...',
|
|
'phase': 'processing'
|
|
})
|
|
|
|
# Insert/update post in database
|
|
post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
|
|
if post_db_id:
|
|
if is_new_post:
|
|
new_posts += 1
|
|
|
|
# Check if post has PPV content (attachments without download URL)
|
|
has_ppv = any(att.download_url is None for att in post.attachments)
|
|
|
|
# Insert attachments
|
|
for idx, attachment in enumerate(post.attachments):
|
|
att_data = attachment.to_dict()
|
|
att_data['attachment_index'] = idx
|
|
# Mark PPV attachments as unavailable so they don't enter the download queue
|
|
if attachment.download_url is None:
|
|
att_data['status'] = 'unavailable'
|
|
if self.db.upsert_attachment(post_db_id, att_data):
|
|
new_attachments += 1
|
|
|
|
# Tag as PPV if it has locked content
|
|
if has_ppv:
|
|
ppv_tag = self._get_or_create_ppv_tag()
|
|
if ppv_tag:
|
|
self.db.add_tag_to_post(post_db_id, ppv_tag['id'])
|
|
|
|
self._apply_auto_tag_rules(post_db_id, is_new_post)
|
|
|
|
# Update pinned posts in DB (handles posts outside date range too)
|
|
if hasattr(client, '_last_pinned_posts') and client._last_pinned_posts:
|
|
self.db.update_pinned_posts(creator_id, client._last_pinned_posts)
|
|
|
|
# Sync messages
|
|
new_messages = await self._sync_messages_for_creator(creator, client, 'fansly')
|
|
|
|
# Update creator stats
|
|
latest_post_date = max((p.published_at for p in posts if p.published_at), default=None) if posts else None
|
|
self.db.update_creator(creator_id, {
|
|
'last_checked': datetime.now().isoformat(),
|
|
'last_post_date': latest_post_date or creator.get('last_post_date'),
|
|
'post_count': self.db.get_creator_post_count(creator_id)
|
|
})
|
|
|
|
# Download if enabled
|
|
downloaded = 0
|
|
failed = 0
|
|
downloaded_file_info = []
|
|
if download and creator.get('auto_download', True):
|
|
result = await self.download_pending_for_creator(creator_id)
|
|
downloaded = result.get('downloaded', 0)
|
|
failed = result.get('failed', 0)
|
|
downloaded_file_info = result.get('downloaded_file_info', [])
|
|
|
|
# Unregister from active syncs
|
|
self._unregister_active_sync(creator_id)
|
|
|
|
# Emit completed event
|
|
self._emit_event('paid_content_sync_completed', {
|
|
'creator_id': creator_id,
|
|
'username': creator['username'],
|
|
'new_posts': new_posts,
|
|
'new_attachments': new_attachments,
|
|
'downloaded': downloaded,
|
|
'failed': failed
|
|
})
|
|
|
|
# Send push notification for new downloads or messages
|
|
self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled, new_messages=new_messages)
|
|
|
|
return SyncResult(
|
|
success=True,
|
|
new_posts=new_posts,
|
|
new_attachments=new_attachments,
|
|
downloaded_files=downloaded,
|
|
failed_files=failed,
|
|
downloaded_file_info=downloaded_file_info
|
|
)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error syncing Fansly creator {creator['username']}: {e}", 'error')
|
|
self._unregister_active_sync(creator_id)
|
|
self._emit_event('paid_content_sync_error', {
|
|
'creator_id': creator_id,
|
|
'username': creator['username'],
|
|
'error': str(e)
|
|
})
|
|
return SyncResult(success=False, error=str(e))
|
|
|
|
# =========================================================================
|
|
# OnlyFans Direct
|
|
# =========================================================================
|
|
|
|
def _get_onlyfans_direct_client(self) -> Optional[OnlyFansClient]:
|
|
"""Get or create OnlyFans Direct client"""
|
|
if self._onlyfans_direct_client is None:
|
|
import json
|
|
service = self.db.get_service('onlyfans_direct')
|
|
if not service or not service.get('session_cookie'):
|
|
self.log("OnlyFans Direct credentials not configured", 'warning')
|
|
return None
|
|
|
|
# Auth config is stored as JSON in session_cookie
|
|
raw = service['session_cookie']
|
|
try:
|
|
auth_config = json.loads(raw)
|
|
except (json.JSONDecodeError, TypeError):
|
|
self.log("OnlyFans Direct credentials: invalid JSON in session_cookie", 'error')
|
|
return None
|
|
|
|
if not auth_config.get('sess'):
|
|
self.log("OnlyFans Direct: 'sess' cookie not set", 'warning')
|
|
return None
|
|
|
|
signing_url = auth_config.get('signing_url')
|
|
self._onlyfans_direct_client = OnlyFansClient(
|
|
auth_config=auth_config,
|
|
signing_url=signing_url,
|
|
log_callback=self.log_callback,
|
|
)
|
|
return self._onlyfans_direct_client
|
|
|
|
async def _add_onlyfans_direct_creator(
    self,
    username: str,
    auto_download: bool = True,
    download_embeds: bool = True,
) -> Dict:
    """Add an OnlyFans creator via the direct API.

    Args:
        username: OnlyFans username to look up and start tracking.
        auto_download: Stored on the creator row as ``auto_download`` (1/0).
        download_embeds: Stored on the creator row as ``download_embeds`` (1/0).

    Returns:
        ``{'success': True, 'creator': {...}}`` when the creator was added,
        otherwise ``{'success': False, 'error': <message>}`` (with an extra
        ``'creator'`` entry when the creator is already tracked).
    """
    client = self._get_onlyfans_direct_client()
    if not client:
        return {'success': False, 'error': 'OnlyFans credentials not configured. Please set them in Settings.'}

    # Check if already tracked (cheap check by the username as typed,
    # done before any network call)
    existing = self.db.get_creator_by_api_id('onlyfans_direct', 'onlyfans', username)
    if existing:
        return {'success': False, 'error': 'Creator already tracked', 'creator': existing}

    # Fetch user info
    user_info = await client.get_user_info(username)
    if not user_info:
        return {'success': False, 'error': f'OnlyFans user not found: {username}'}

    of_creator_id = user_info.get('user_id') or username

    # The row below is stored under the resolved ID, not the typed username,
    # so the username lookup above cannot see a creator that was previously
    # added under that ID. Re-check with the value that is actually stored.
    # NOTE(review): presumes get_creator_by_api_id matches on the stored
    # creator_id column - confirm against the DB adapter.
    if of_creator_id != username:
        existing = self.db.get_creator_by_api_id('onlyfans_direct', 'onlyfans', of_creator_id)
        if existing:
            return {'success': False, 'error': 'Creator already tracked', 'creator': existing}

    # Cache profile images locally; fall back to the remote URL when caching
    # returns nothing.
    avatar_url = user_info.get('avatar_url')
    banner_url = user_info.get('banner_url')
    cached_avatar = (
        await self._cache_profile_image(avatar_url, 'onlyfans', of_creator_id, 'avatar')
        if avatar_url else None
    )
    cached_banner = (
        await self._cache_profile_image(banner_url, 'onlyfans', of_creator_id, 'banner')
        if banner_url else None
    )

    creator_data = {
        'service_id': 'onlyfans_direct',
        'platform': 'onlyfans',
        'creator_id': of_creator_id,
        'username': user_info.get('username') or username,
        'display_name': user_info.get('display_name'),
        'profile_image_url': cached_avatar or avatar_url,
        'banner_image_url': cached_banner or banner_url,
        'bio': user_info.get('bio'),
        'joined_date': user_info.get('join_date'),
        'auto_download': 1 if auto_download else 0,
        'download_embeds': 1 if download_embeds else 0,
    }

    db_id = self.db.add_creator(creator_data)

    return {
        'success': True,
        'creator': {
            'id': db_id,
            **creator_data,
        },
    }
|
|
|
|
async def _sync_onlyfans_direct_creator(
    self,
    creator: Dict,
    download: bool = True,
    scheduled: bool = False,
    date_from: str = None,
    date_to: str = None,
    days_back: int = None,
) -> SyncResult:
    """
    Sync an OnlyFans creator via direct API.

    Follows the same pattern as _sync_fansly_direct_creator: refresh the
    profile, fetch posts for the selected date window, upsert posts and
    attachments, sync messages, then optionally download pending files.

    Args:
        creator: Creator row; reads ``id``, ``username``, ``creator_id``,
            ``auto_download`` and ``last_post_date``.
        download: When true (and the creator has ``auto_download``), pending
            attachments are downloaded at the end of the sync.
        scheduled: Scheduled syncs without an explicit window only look at
            the last 3 days.
        date_from: Explicit lower bound passed to the client as ``since_date``.
        date_to: Explicit upper bound passed to the client as ``until_date``.
        days_back: When set, overrides ``date_from`` with "now minus N days".

    Returns:
        A ``SyncResult``; ``success=False`` with ``error`` set on failure.
    """
    creator_id = creator['id']
    self.log(f"Syncing OnlyFans creator via direct API: {creator['username']}", 'info')

    # Register active sync
    sync_data = {
        'username': creator['username'],
        'platform': 'onlyfans',
        'service': 'onlyfans_direct',
        'status': 'Fetching posts...',
        'phase': 'fetching',
    }
    self._register_active_sync(creator_id, sync_data)
    self._emit_event('paid_content_sync_started', {
        'creator_id': creator_id,
        **sync_data,
    })

    try:
        client = self._get_onlyfans_direct_client()
        if not client:
            error = "OnlyFans credentials not configured"
            self._unregister_active_sync(creator_id)
            return SyncResult(success=False, error=error)

        # Fetch and update creator profile. Best-effort: a failure here is
        # logged and the sync carries on.
        user_info = None
        try:
            user_info = await client.get_user_info(creator['username'])
            if user_info:
                profile_updates = {}
                if user_info.get('display_name'):
                    profile_updates['display_name'] = user_info['display_name']
                if user_info.get('avatar_url'):
                    cached = await self._cache_profile_image(user_info['avatar_url'], 'onlyfans', creator['creator_id'], 'avatar')
                    profile_updates['profile_image_url'] = cached or user_info['avatar_url']
                if user_info.get('banner_url'):
                    cached = await self._cache_profile_image(user_info['banner_url'], 'onlyfans', creator['creator_id'], 'banner')
                    profile_updates['banner_image_url'] = cached or user_info['banner_url']
                if user_info.get('bio'):
                    profile_updates['bio'] = user_info['bio']
                if profile_updates:
                    self.db.update_creator(creator_id, profile_updates)
                    self.log(f"Updated creator profile for {creator['username']}: {list(profile_updates.keys())}", 'debug')
        except Exception as e:
            self.log(f"Failed to update creator profile: {e}", 'warning')

        # Determine date filter
        since_date = date_from
        until_date = date_to

        if days_back:
            from datetime import timedelta
            since_date = (datetime.now() - timedelta(days=days_back)).isoformat()
        elif not since_date:
            if scheduled:
                from datetime import timedelta
                since_date = (datetime.now() - timedelta(days=3)).isoformat()
            # Manual resync: fetch all posts (no date filter) so old
            # data from previous sources (e.g. Coomer) gets overwritten

        # We need user_id for the posts endpoint; get it from creator or fetch it
        of_user_id = creator.get('creator_id', '')
        if not of_user_id or of_user_id == creator['username']:
            # Reuse the profile response fetched above; only hit the API
            # again when that fetch failed or returned nothing.
            info = user_info or await client.get_user_info(creator['username'])
            resolved_id = info.get('user_id') if info else None
            if not resolved_id:
                self._unregister_active_sync(creator_id)
                return SyncResult(success=False, error=f"Could not resolve user ID for {creator['username']}")
            of_user_id = resolved_id

        # Progress callback
        def progress_callback(page: int, total_posts: int):
            status = f'Fetched {total_posts} posts (page {page})...'
            self._update_active_sync(creator_id, {
                'status': status,
                'posts_fetched': total_posts,
                'page': page,
            })
            self._emit_event('paid_content_sync_progress', {
                'creator_id': creator_id,
                'username': creator['username'],
                'status': status,
                'phase': 'fetching',
                'posts_fetched': total_posts,
                'page': page,
            })

        # Fetch posts
        posts = await client.get_posts(
            user_id=of_user_id,
            username=creator['username'],
            since_date=since_date,
            until_date=until_date,
            progress_callback=progress_callback,
        )

        if not posts:
            self.log(f"No new posts for {creator['username']}", 'debug')
            self.db.update_creator(creator_id, {'last_checked': datetime.now().isoformat()})

            # NOTE(review): this early return skips the message sync and the
            # Coomer upgrade below - confirm that is intended.

            # Still download pending attachments
            downloaded = 0
            failed = 0
            if download and creator.get('auto_download', True):
                pending_count = self.db.get_pending_attachment_count(creator_id)
                if pending_count > 0:
                    self.log(f"Downloading {pending_count} pending attachments", 'info')
                    self._update_active_sync(creator_id, {
                        'status': f'Downloading {pending_count} pending files...',
                        'phase': 'downloading',
                        'total_files': pending_count,
                    })
                    result = await self.download_pending_for_creator(creator_id)
                    downloaded = result.get('downloaded', 0)
                    failed = result.get('failed', 0)

            self._unregister_active_sync(creator_id)
            self._emit_event('paid_content_sync_completed', {
                'creator_id': creator_id,
                'username': creator['username'],
                'new_posts': 0,
                'new_attachments': 0,
                'downloaded': downloaded,
                'failed': failed,
            })
            return SyncResult(success=True, new_posts=0, new_attachments=0,
                              downloaded_files=downloaded, failed_files=failed)

        self.log(f"Found {len(posts)} posts for {creator['username']}", 'info')

        self._update_active_sync(creator_id, {
            'status': f'Processing {len(posts)} posts...',
            'phase': 'processing',
            'total_posts': len(posts),
        })

        new_posts = 0
        new_attachments = 0

        for i, post in enumerate(posts):
            # Update progress every 10 posts
            if (i + 1) % 10 == 0:
                self._update_active_sync(creator_id, {
                    'status': f'Processing post {i + 1}/{len(posts)}...',
                    'phase': 'processing',
                })

            post_db_id, is_new_post = self.db.upsert_post(creator_id, post.to_dict())
            if post_db_id:
                if is_new_post:
                    new_posts += 1

                # Check for PPV content
                has_ppv = any(att.download_url is None or getattr(att, 'is_preview', False) for att in post.attachments)

                for idx, attachment in enumerate(post.attachments):
                    att_data = attachment.to_dict()
                    att_data['attachment_index'] = idx
                    # Mark PPV attachments as unavailable so they don't enter the download queue
                    if attachment.download_url is None:
                        att_data['status'] = 'unavailable'
                    if self.db.upsert_attachment(post_db_id, att_data):
                        new_attachments += 1

                if has_ppv:
                    ppv_tag = self._get_or_create_ppv_tag()
                    if ppv_tag:
                        self.db.add_tag_to_post(post_db_id, ppv_tag['id'])

                self._apply_auto_tag_rules(post_db_id, is_new_post)

        # Sync messages
        new_messages = await self._sync_messages_for_creator(creator, client, 'onlyfans')

        # Update creator stats
        latest_post_date = max(
            (p.published_at for p in posts if p.published_at), default=None
        ) if posts else None
        self.db.update_creator(creator_id, {
            'last_checked': datetime.now().isoformat(),
            'last_post_date': latest_post_date or creator.get('last_post_date'),
            'post_count': self.db.get_creator_post_count(creator_id),
        })

        # Upgrade DRM preview frames from Coomer (actual video clips).
        # Non-critical: failures are logged and the sync continues.
        try:
            upgraded = await self._upgrade_drm_from_coomer(creator_id, creator['username'], 'onlyfans')
            if upgraded > 0:
                self.log(f"Upgraded {upgraded} preview frames to video from Coomer", 'info')
        except Exception as e:
            self.log(f"Coomer fallback check failed (non-critical): {e}", 'warning')

        # Download if enabled
        downloaded = 0
        failed = 0
        downloaded_file_info = []
        if download and creator.get('auto_download', True):
            result = await self.download_pending_for_creator(creator_id)
            downloaded = result.get('downloaded', 0)
            failed = result.get('failed', 0)
            downloaded_file_info = result.get('downloaded_file_info', [])

        self._unregister_active_sync(creator_id)

        self._emit_event('paid_content_sync_completed', {
            'creator_id': creator_id,
            'username': creator['username'],
            'new_posts': new_posts,
            'new_attachments': new_attachments,
            'downloaded': downloaded,
            'failed': failed,
        })

        self._send_creator_notification(creator, new_posts, downloaded, downloaded_file_info, scheduled=scheduled, new_messages=new_messages)

        return SyncResult(
            success=True,
            new_posts=new_posts,
            new_attachments=new_attachments,
            downloaded_files=downloaded,
            failed_files=failed,
            downloaded_file_info=downloaded_file_info,
        )

    except Exception as e:
        self.log(f"Error syncing OnlyFans creator {creator['username']}: {e}", 'error')
        self._unregister_active_sync(creator_id)
        self._emit_event('paid_content_sync_error', {
            'creator_id': creator_id,
            'username': creator['username'],
            'error': str(e),
        })
        return SyncResult(success=False, error=str(e))
|
|
|
|
async def _upgrade_drm_from_coomer(self, creator_id: int, username: str, platform: str = 'onlyfans') -> int:
    """Swap DRM preview-frame images for the preview video clips archived on Coomer.

    Via the direct API a DRM video only yields a still-frame image. This
    method looks for completed attachments typed as ``video`` whose extension
    is an image format, asks Coomer for the same post, and re-queues each
    matched attachment as a pending download of the Coomer video.

    Before that, attachments whose Coomer download previously failed are
    reverted to their preview-frame state.

    The whole check is skipped when ``last_coomer_check`` on the creator is
    less than 24 hours old.

    Returns:
        Number of attachments re-pointed at a Coomer video.
    """
    # Cooldown - only query Coomer once per day per creator
    creator_row = self.db.get_creator(creator_id)
    if creator_row and creator_row.get('last_coomer_check'):
        from datetime import timedelta
        try:
            previous_check = datetime.fromisoformat(creator_row['last_coomer_check'])
            elapsed = datetime.now().astimezone() - previous_check
            if elapsed < timedelta(hours=24):
                self.log(f"Coomer fallback: skipping (last checked {creator_row['last_coomer_check']})", 'debug')
                return 0
        except (ValueError, TypeError):
            pass  # Unparseable or naive timestamp: fall through and run the check

    # Revert failed Coomer upgrades back to the original preview-frame state.
    # server_path is preserved as /onlyfans/{media_id} during upgrade, so the
    # original name/extension can be rebuilt from it. Status 'skipped' lets
    # the next OF sync refresh the download_url and reset to pending.
    with self.db.unified_db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT a.id, a.name, a.server_path, p.post_id
            FROM paid_content_attachments a
            JOIN paid_content_posts p ON a.post_id = p.id
            WHERE p.creator_id = ?
            AND a.file_type = 'video'
            AND a.status = 'failed'
            AND a.name LIKE '%\\_source.mp4' ESCAPE '\\'
        """, (creator_id,))
        failed_upgrades = [dict(row) for row in cursor.fetchall()]

    if failed_upgrades:
        for failed_att in failed_upgrades:
            # e.g. /onlyfans/3878367393 -> 3878367393.jpg
            server_path = failed_att['server_path']
            media_id = server_path.rsplit('/', 1)[-1] if server_path else ''
            restored_name = f"{media_id}.jpg" if media_id else failed_att['name']
            self.db.update_attachment(failed_att['id'], {
                'name': restored_name,
                'extension': 'jpg',
                'status': 'skipped',
                'download_url': None,
                'local_path': None,
                'local_filename': None,
                'file_hash': None,
                'perceptual_hash': None,
                'download_attempts': 0,
                'error_message': 'Coomer CDN unavailable - will restore on next sync',
            })
        self.log(f"Reverted {len(failed_upgrades)} failed Coomer downloads to preview frames", 'info')

    # Preview frames: file_type='video' but the stored extension is an image format
    with self.db.unified_db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT a.id, a.name, a.local_path, a.extension, p.post_id, p.id as post_db_id
            FROM paid_content_attachments a
            JOIN paid_content_posts p ON a.post_id = p.id
            WHERE p.creator_id = ?
            AND a.file_type = 'video'
            AND a.status = 'completed'
            AND lower(a.extension) IN ('jpg', 'jpeg', 'png', 'webp', 'gif')
        """, (creator_id,))
        preview_frames = [dict(row) for row in cursor.fetchall()]

    if not preview_frames:
        return 0

    self.log(f"Found {len(preview_frames)} DRM preview frames to check against Coomer", 'info')

    # Get Coomer client
    try:
        coomer_client = self._get_client('coomer')
    except Exception as e:
        self.log(f"Could not initialize Coomer client: {e}", 'warning')
        return 0

    # One Coomer lookup per post: bucket the frames by post id
    frames_by_post = {}
    for frame in preview_frames:
        frames_by_post.setdefault(str(frame['post_id']), []).append(frame)

    upgraded = 0
    total_posts = len(frames_by_post)

    for checked, (post_id, frames) in enumerate(frames_by_post.items(), start=1):
        if checked % 10 == 0:
            self.log(f"Coomer fallback: checked {checked}/{total_posts} posts...", 'info')

        try:
            coomer_post = await coomer_client.get_post(platform, username, post_id)
            if not coomer_post:
                continue

            # Video attachments in the Coomer post
            video_atts = [
                att for att in coomer_post.attachments
                if att.file_type == 'video' and att.extension in ('mp4', 'mov', 'webm', 'mkv', 'm4v')
            ]
            if not video_atts:
                continue

            # Pair frames with Coomer videos positionally; surplus frames
            # (more frames than videos) are left untouched.
            for frame, coomer_att in zip(frames, video_atts):
                download_url = coomer_att.download_url
                if not download_url and coomer_att.server_path:
                    download_url = coomer_client.get_attachment_url(coomer_att.server_path)
                if not download_url:
                    continue

                # Remove the old preview frame from disk
                if frame['local_path']:
                    old_path = Path(frame['local_path'])
                    if old_path.exists():
                        try:
                            old_path.unlink()
                            self.log(f"Deleted preview frame: {old_path.name}", 'debug')
                        except OSError as e:
                            self.log(f"Could not delete preview frame {old_path}: {e}", 'warning')

                # Re-point the attachment at the Coomer video. server_path is
                # deliberately not touched so the revert step above can
                # restore the preview frame if this download fails.
                self.db.update_attachment(frame['id'], {
                    'download_url': download_url,
                    'name': coomer_att.name,
                    'extension': coomer_att.extension or 'mp4',
                    'status': 'pending',
                    'local_path': None,
                    'local_filename': None,
                    'file_hash': None,
                    'perceptual_hash': None,
                    'download_attempts': 0,
                    'error_message': None,
                })
                upgraded += 1

        except Exception as e:
            self.log(f"Coomer fallback error for post {post_id}: {e}", 'debug')
            continue

    if upgraded > 0:
        self.log(f"Upgraded {upgraded} preview frames to video from Coomer", 'info')

    # Record the check time so the cooldown above applies for the next 24h
    self.db.update_creator(creator_id, {
        'last_coomer_check': datetime.now().astimezone().isoformat()
    })

    return upgraded
|
|
|
|
def _compute_perceptual_hash(self, file_path: Path) -> Optional[str]:
|
|
"""
|
|
Compute perceptual hash for an image file.
|
|
|
|
Uses dhash (difference hash) which is effective for detecting
|
|
visually similar images even with minor modifications.
|
|
|
|
Returns hex string or None if not an image or error.
|
|
"""
|
|
try:
|
|
# Only process images
|
|
suffix = file_path.suffix.lower()
|
|
if suffix not in {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.heic', '.heif', '.avif'}:
|
|
return None
|
|
|
|
try:
|
|
import imagehash
|
|
from PIL import Image
|
|
except ImportError:
|
|
self.log("imagehash/PIL not available for perceptual hashing", 'debug')
|
|
return None
|
|
|
|
with Image.open(file_path) as img:
|
|
# Convert to RGB if necessary
|
|
if img.mode != 'RGB':
|
|
img = img.convert('RGB')
|
|
|
|
# Compute dhash with 16x16 = 256 bits
|
|
phash = str(imagehash.dhash(img, hash_size=16))
|
|
return phash
|
|
|
|
except Exception as e:
|
|
self.log(f"Failed to compute perceptual hash for {file_path.name}: {e}", 'debug')
|
|
return None
|
|
|
|
def _hamming_distance(self, hash1: str, hash2: str) -> int:
|
|
"""Calculate hamming distance between two hex hash strings"""
|
|
if not hash1 or not hash2 or len(hash1) != len(hash2):
|
|
return 999 # Return high value for invalid comparison
|
|
|
|
try:
|
|
# Convert hex to integers and XOR
|
|
h1 = int(hash1, 16)
|
|
h2 = int(hash2, 16)
|
|
xor = h1 ^ h2
|
|
|
|
# Count set bits (differences)
|
|
return bin(xor).count('1')
|
|
except (ValueError, TypeError):
|
|
return 999
|
|
|
|
def _check_perceptual_duplicate(self, file_path: Path, phash: str, att_id: int) -> Optional[Dict]:
|
|
"""
|
|
Check if file is a perceptual duplicate of existing content.
|
|
|
|
Returns dict with duplicate info if found, None otherwise.
|
|
"""
|
|
if not phash:
|
|
return None
|
|
|
|
# Check if perceptual detection is enabled
|
|
if not self.config.get('perceptual_duplicate_detection', True):
|
|
return None
|
|
|
|
threshold = self.config.get('perceptual_threshold', 12)
|
|
|
|
# Get all existing perceptual hashes
|
|
existing = self.db.get_attachments_with_phash()
|
|
|
|
for existing_att in existing:
|
|
if existing_att['id'] == att_id:
|
|
continue # Skip self
|
|
|
|
existing_phash = existing_att.get('perceptual_hash')
|
|
if not existing_phash:
|
|
continue
|
|
|
|
distance = self._hamming_distance(phash, existing_phash)
|
|
if distance <= threshold:
|
|
return {
|
|
'attachment_id': existing_att['id'],
|
|
'local_path': existing_att.get('local_path'),
|
|
'distance': distance,
|
|
'creator': existing_att.get('username'),
|
|
'post_title': existing_att.get('post_title')
|
|
}
|
|
|
|
return None
|
|
|
|
def _extract_dimensions(self, file_path: Path, file_type: str) -> Tuple[Optional[int], Optional[int], Optional[int]]:
|
|
"""
|
|
Extract dimensions (width, height) and duration (for videos) from a file.
|
|
Returns (width, height, duration) tuple. Duration is in seconds for videos, None for images.
|
|
"""
|
|
width, height, duration = None, None, None
|
|
|
|
try:
|
|
if file_type == 'image' and HAS_PIL:
|
|
with Image.open(file_path) as img:
|
|
width, height = img.size
|
|
elif file_type in ('video', 'audio'):
|
|
# Use ffprobe to get dimensions (video) and duration (video/audio)
|
|
cmd = [
|
|
'ffprobe', '-v', 'quiet',
|
|
'-print_format', 'json',
|
|
'-show_streams', '-show_format',
|
|
]
|
|
if file_type == 'video':
|
|
cmd += ['-select_streams', 'v:0']
|
|
cmd.append(str(file_path))
|
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
|
|
if result.returncode == 0:
|
|
import json
|
|
data = json.loads(result.stdout)
|
|
if file_type == 'video' and data.get('streams'):
|
|
stream = data['streams'][0]
|
|
width = stream.get('width')
|
|
height = stream.get('height')
|
|
if data.get('format'):
|
|
duration_str = data['format'].get('duration')
|
|
if duration_str:
|
|
duration = int(float(duration_str))
|
|
except Exception as e:
|
|
self.log(f"Error extracting dimensions from {file_path.name}: {e}", 'warning')
|
|
|
|
return width, height, duration
|
|
|
|
def _generate_thumbnail(self, file_path: Path, file_type: str, max_size: tuple = (300, 300)) -> Optional[bytes]:
    """
    Build a thumbnail for an image or video file.

    The file's own extension takes precedence over ``file_type``: anything
    with an image extension is handled as an image first, with ffmpeg as a
    fallback when PIL cannot read it.

    Returns JPEG bytes, or None when PIL is unavailable, the type is not
    image/video, or generation fails.
    """
    if not HAS_PIL:
        return None

    try:
        # Trust the real extension - preview frames are jpg files even
        # though their file_type may say 'video'
        extension = file_path.suffix.lower().lstrip('.')
        if extension in {'jpg', 'jpeg', 'png', 'webp', 'gif', 'bmp', 'tiff'}:
            file_type = 'image'

        if file_type == 'image':
            # Native size (max_size=None) is not supported for images - use 800x800
            image_thumb = self._generate_image_thumbnail(file_path, max_size or (800, 800))
            if image_thumb:
                return image_thumb
            # PIL failed - file may be a video with image extension (e.g. Instagram stories)
            self.log(f"PIL failed for {file_path.name}, trying ffmpeg (may be video with image extension)", 'debug')
            return self._generate_video_thumbnail(file_path, max_size, seek_time='00:00:01')

        if file_type == 'video':
            # Seek position depends on the platform directory in the path:
            #   pornhub         -> 10s, past the intro branding
            #   onlyfans/fansly -> first frame (no intro logos)
            #   anything else   -> None, i.e. the video helper's default
            location = str(file_path)
            seek_time = None
            if '/pornhub/' in location:
                seek_time = '00:00:10'
            elif '/onlyfans/' in location or '/fansly/' in location:
                seek_time = '00:00:00'
            return self._generate_video_thumbnail(file_path, max_size, seek_time=seek_time)
    except Exception as e:
        self.log(f"Error generating thumbnail for {file_path.name}: {e}", 'warning')

    return None
|
|
|
|
def _generate_image_thumbnail(self, file_path: Path, max_size: tuple = (300, 300)) -> Optional[bytes]:
    """Return a JPEG thumbnail of an image file, or None on failure.

    The image is converted to RGB, shrunk in place to fit within
    ``max_size`` and encoded as JPEG (quality 85).
    """
    try:
        with Image.open(file_path) as source:
            # Every non-RGB mode (RGBA, P, LA, ...) is converted to RGB
            # before the JPEG encode
            picture = source if source.mode == 'RGB' else source.convert('RGB')

            picture.thumbnail(max_size, Image.Resampling.LANCZOS)

            encoded = BytesIO()
            picture.save(encoded, format='JPEG', quality=85, optimize=True)
            return encoded.getvalue()
    except Exception as e:
        self.log(f"Error generating image thumbnail: {e}", 'warning')
        return None
|
|
|
|
def _generate_video_thumbnail(self, file_path: Path, max_size: tuple = (300, 300), seek_time: str = None) -> Optional[bytes]:
|
|
"""Generate thumbnail for video file using ffmpeg.
|
|
|
|
Args:
|
|
seek_time: Override seek position (e.g. '00:00:30'). If None, defaults to 5 seconds
|
|
to skip intro logos (e.g. Pornhub/Modelhub branding).
|
|
"""
|
|
try:
|
|
# Create temp file for thumbnail
|
|
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
|
|
tmp_path = tmp.name
|
|
|
|
# Use ffmpeg to extract frame at specified time
|
|
# -ss before -i for fast input seeking (keyframe-based)
|
|
# Default to 5 seconds to skip platform intro logos
|
|
# max_size=None means native resolution (no scaling)
|
|
scale_args = ['-vf', f'scale={max_size[0]}:{max_size[1]}:force_original_aspect_ratio=decrease'] if max_size else []
|
|
cmd = [
|
|
'ffmpeg', '-y',
|
|
'-ss', seek_time or '00:00:05',
|
|
'-i', str(file_path),
|
|
'-vframes', '1',
|
|
*scale_args,
|
|
'-f', 'image2',
|
|
tmp_path
|
|
]
|
|
|
|
result = subprocess.run(cmd, capture_output=True, timeout=30)
|
|
|
|
# Retry without seeking if first attempt failed or produced no output
|
|
# (ffmpeg returns 0 even when seeking past end of short videos)
|
|
if result.returncode != 0 or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) < 100:
|
|
if os.path.exists(tmp_path):
|
|
os.unlink(tmp_path)
|
|
# Try without seeking (for very short videos)
|
|
cmd = [
|
|
'ffmpeg', '-y',
|
|
'-i', str(file_path),
|
|
'-vframes', '1',
|
|
*scale_args,
|
|
'-f', 'image2',
|
|
tmp_path
|
|
]
|
|
result = subprocess.run(cmd, capture_output=True, timeout=30)
|
|
|
|
if result.returncode == 0 and os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 100:
|
|
with open(tmp_path, 'rb') as f:
|
|
thumbnail_data = f.read()
|
|
os.unlink(tmp_path)
|
|
return thumbnail_data
|
|
|
|
# Clean up temp file if it exists
|
|
if os.path.exists(tmp_path):
|
|
os.unlink(tmp_path)
|
|
|
|
except subprocess.TimeoutExpired:
|
|
self.log(f"Timeout generating video thumbnail for {file_path.name}", 'warning')
|
|
except Exception as e:
|
|
self.log(f"Error generating video thumbnail: {e}", 'warning')
|
|
|
|
return None
|
|
|
|
async def _download_youtube_thumbnail(self, video_id: str, max_size: tuple = (300, 300)) -> Optional[bytes]:
    """
    Download a YouTube thumbnail for a video and return it as resized JPEG bytes.

    Candidate URLs are tried in descending quality order (maxresdefault,
    sddefault, hqdefault, mqdefault). The first candidate that answers with
    HTTP 200, has a body of at least 1000 bytes and can be decoded by PIL wins.

    Args:
        video_id: YouTube video ID interpolated into the i.ytimg.com URLs.
        max_size: (width, height) bound handed to PIL's ``Image.thumbnail``.

    Returns:
        JPEG-encoded bytes, or None when PIL is unavailable or no candidate
        could be downloaded and decoded.
    """
    if not HAS_PIL:
        self.log("PIL not available for thumbnail processing", 'warning')
        return None

    # YouTube thumbnail URLs in order of quality preference
    thumbnail_urls = [
        f"https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg",  # 1280x720
        f"https://i.ytimg.com/vi/{video_id}/sddefault.jpg",  # 640x480
        f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",  # 480x360
        f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg",  # 320x180
    ]

    try:
        async with aiohttp.ClientSession() as session:
            for url in thumbnail_urls:
                try:
                    async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                        if resp.status != 200:
                            continue
                        image_data = await resp.read()

                    # Very small bodies are treated as placeholders for missing
                    # thumbnails rather than real images.
                    if len(image_data) < 1000:
                        continue

                    # Resize to standard thumbnail size
                    with Image.open(BytesIO(image_data)) as img:
                        # JPEG output needs RGB; convert palette/alpha modes first.
                        rgb = img if img.mode == 'RGB' else img.convert('RGB')
                        rgb.thumbnail(max_size, Image.Resampling.LANCZOS)

                        buffer = BytesIO()
                        rgb.save(buffer, format='JPEG', quality=85, optimize=True)
                        return buffer.getvalue()
                except Exception as e:
                    # Best-effort per candidate: record why it failed, then
                    # fall through to the next (lower quality) URL.
                    self.log(f"YouTube thumbnail candidate failed ({url}): {e}", 'debug')
                    continue

        self.log(f"Could not download YouTube thumbnail for {video_id}", 'warning')
        return None

    except Exception as e:
        self.log(f"Error downloading YouTube thumbnail: {e}", 'warning')
        return None
|
|
|
|
def backfill_dimensions(self, batch_size: int = 100, log_callback: Callable = None) -> Dict:
    """
    Scan completed attachments with missing dimensions and extract them from files.

    Each attachment is attempted at most once per call. Attachments whose file
    is missing, or whose dimensions cannot be extracted, are counted and then
    skipped on later queries so the loop always terminates.

    Args:
        batch_size: Number of not-yet-attempted attachments to aim for per query.
        log_callback: Optional callback; when truthy it is stored on
            ``self.log_callback`` before any work starts.

    Returns:
        Stats dict with keys ``total_missing``, ``processed``, ``updated``,
        ``failed`` and ``not_found``.
    """
    if log_callback:
        self.log_callback = log_callback

    stats = {
        'total_missing': self.db.count_attachments_missing_dimensions(),
        'processed': 0,
        'updated': 0,
        'failed': 0,
        'not_found': 0
    }

    self.log(f"Starting dimension backfill: {stats['total_missing']} attachments missing dimensions", 'info')

    # IDs already attempted in this run, so a row is never processed twice.
    seen_ids = set()

    while True:
        # NOTE(review): rows that could not be fixed presumably keep matching
        # the "missing dimensions" query - confirm against the DB adapter.
        # Widening the limit by the number of unresolved rows keeps fresh rows
        # reachable even when the unresolved ones are returned first.
        unresolved = stats['not_found'] + stats['failed']
        attachments = self.db.get_attachments_missing_dimensions(limit=batch_size + unresolved)
        fresh = [att for att in (attachments or []) if att['id'] not in seen_ids]
        if not fresh:
            # Nothing new came back: everything left has already been tried.
            break

        for att in fresh:
            seen_ids.add(att['id'])
            stats['processed'] += 1
            local_path = att.get('local_path')

            if not local_path:
                stats['not_found'] += 1
                continue

            file_path = Path(local_path)
            if not file_path.exists():
                stats['not_found'] += 1
                self.log(f"File not found: {local_path}", 'debug')
                continue

            try:
                width, height, duration = self._extract_dimensions(file_path, att.get('file_type', ''))
                if width and height:
                    updates = {'width': width, 'height': height}
                    if duration:
                        updates['duration'] = duration
                    self.db.update_attachment(att['id'], updates)
                    stats['updated'] += 1
                    self.log(f"Updated dimensions for attachment {att['id']}: {width}x{height}", 'debug')
                else:
                    stats['failed'] += 1
                    self.log(f"Could not extract dimensions from {file_path.name}", 'debug')
            except Exception as e:
                stats['failed'] += 1
                self.log(f"Error processing attachment {att['id']}: {e}", 'warning')

        # Log progress; clamp at zero in case rows were added after the initial count.
        remaining = max(0, stats['total_missing'] - stats['processed'])
        self.log(f"Progress: {stats['processed']}/{stats['total_missing']} processed, {stats['updated']} updated, {remaining} remaining", 'info')

    self.log(f"Dimension backfill complete: {stats['updated']} updated, {stats['failed']} failed, {stats['not_found']} files not found", 'info')
    return stats
|
|
|
|
async def backfill_truncated_content(self, batch_size: int = 50, creator_id: int = None) -> Dict:
    """
    Find posts with truncated content (ending with '..') from Coomer/Kemono
    and re-fetch full content from the individual post endpoint.

    A post is selected when its content is empty while its title is not, or
    when either its content or its title ends with '..'. For each selected
    post the client's ``get_post`` is awaited; the returned content (or, if
    that is empty, the returned title) replaces the stored content. If the
    API yields nothing and the stored content is empty, the stored title is
    copied into content. Existing non-empty content is never replaced by the
    locally stored title.

    Args:
        batch_size: Number of posts to process per batch. Currently not
            referenced by the implementation; kept for interface compatibility.
        creator_id: Optional - limit to a specific creator

    Returns:
        Stats dict with keys ``total_truncated``, ``updated``, ``failed``
        and ``skipped``.
    """
    from collections import defaultdict

    stats = {
        'total_truncated': 0,
        'updated': 0,
        'failed': 0,
        'skipped': 0,
    }

    # Find posts with truncated or missing content
    # Coomer OnlyFans posts store the truncated text in 'title' and have empty 'content'
    # Also catch posts ending with '..' (truncated substring)
    with self.db.unified_db.get_connection() as conn:
        cursor = conn.cursor()
        # SQL precedence: AND binds tighter than OR, so the filter reads as
        # (empty content AND non-empty title) OR content LIKE OR title LIKE.
        query = """
            SELECT p.id, p.post_id, p.content, p.title, p.creator_id,
                   c.platform, c.creator_id as api_creator_id, c.service_id, c.username
            FROM paid_content_posts p
            JOIN paid_content_creators c ON p.creator_id = c.id
            WHERE c.service_id IN ('coomer', 'kemono')
              AND (
                  (p.content IS NULL OR p.content = '') AND p.title IS NOT NULL AND p.title != ''
                  OR p.content LIKE '%..'
                  OR p.title LIKE '%..'
              )
        """
        params = []
        if creator_id:
            query += " AND p.creator_id = ?"
            params.append(creator_id)
        query += " ORDER BY p.id"
        cursor.execute(query, params)
        truncated_posts = [dict(row) for row in cursor.fetchall()]

    stats['total_truncated'] = len(truncated_posts)
    self.log(f"Found {len(truncated_posts)} posts with truncated content", 'info')

    if not truncated_posts:
        return stats

    # Group by service_id to use the right client
    by_service = defaultdict(list)
    for post in truncated_posts:
        by_service[post['service_id']].append(post)

    for service_id, posts in by_service.items():
        try:
            client = self._get_client(service_id)
        except Exception as e:
            self.log(f"Could not get client for {service_id}: {e}", 'error')
            stats['failed'] += len(posts)
            continue

        for i, post in enumerate(posts):
            try:
                existing_content = post.get('content') or ''

                full_post = await client.get_post(
                    post['platform'],
                    post['api_creator_id'],
                    post['post_id']
                )
                new_content = None
                if full_post:
                    # Use full content from API if available
                    new_content = full_post.content
                    # If API content is also empty, try the full post title
                    if not new_content and full_post.title:
                        new_content = full_post.title

                # If API didn't help, at least copy title to content locally,
                # but only when there is no stored content. Otherwise a real
                # body that merely ends in '..' would be overwritten by the
                # title (and the title then cleared below), losing data.
                if not new_content and not existing_content and post.get('title'):
                    new_content = post['title']

                if new_content and new_content != existing_content:
                    updates = {'content': new_content}
                    # Clear the title if it was just the truncated content
                    # (OnlyFans posts don't have real titles)
                    if post.get('title') and new_content.startswith(post['title'].rstrip('.')):
                        updates['title'] = None
                    self.db.update_post(post['id'], updates)
                    stats['updated'] += 1
                    self.log(
                        f"Updated content for post {post['post_id']} ({post['username']})",
                        'debug'
                    )
                else:
                    stats['skipped'] += 1
            except Exception as e:
                stats['failed'] += 1
                self.log(f"Error fetching full content for post {post['post_id']}: {e}", 'warning')

            # Progress logging
            if (i + 1) % 25 == 0:
                self.log(
                    f"Content backfill progress: {i + 1}/{len(posts)} for {service_id} "
                    f"({stats['updated']} updated, {stats['failed']} failed)",
                    'info'
                )

    self.log(
        f"Content backfill complete: {stats['updated']} updated, "
        f"{stats['failed']} failed, {stats['skipped']} skipped "
        f"(of {stats['total_truncated']} truncated)",
        'info'
    )
    return stats
|