"""
TikTok Client for Paid Content - Uses yt-dlp for listing and gallery-dl for downloading

Adapts the hybrid approach from modules/tiktok_module.py into the paid content client pattern.
"""

import asyncio
import html as html_module
import http.cookiejar
import json
import os
import re
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple

import aiohttp

from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment


class TikTokClient(LoggingMixin):
    """
    Client for fetching TikTok creator information and videos.

    Uses yt-dlp for listing (fast flat-playlist) and gallery-dl for
    downloading (handles carousels/slideshows properly).

    Cookies are read from the ``scrapers`` table of ``unified_db`` (when one
    is supplied), written to a temporary Netscape cookie file for the external
    tools, and merged back into the database after listing/downloading.
    Call :meth:`cleanup` when done to remove the temporary cookie files.
    """

    SERVICE_ID = 'tiktok'
    PLATFORM = 'tiktok'

    # Extensions skipped when collecting downloaded media (audio-only tracks).
    _AUDIO_EXTS = ('.mp3', '.m4a', '.aac', '.wav', '.ogg')
    # Extensions that count as still images when detecting a photo carousel.
    _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}

    def __init__(self, unified_db=None, log_callback=None):
        """
        Locate the external tools and set up logging.

        Args:
            unified_db: Optional database wrapper exposing ``get_connection()``
                and ``save_scraper_cookies()``; used for cookie storage and
                as a fallback source of known video URLs.
            log_callback: Optional callback forwarded to the logging mixin.
        """
        self._init_logger('PaidContent', log_callback, default_module='TikTok')
        self.ytdlp_path = self._find_executable('yt-dlp')
        self.gallery_dl_path = self._find_executable('gallery-dl')
        self.unified_db = unified_db
        self._cookies_file = None
        # Cookie files that are no longer current but may still be in use by a
        # running subprocess; they are removed in cleanup().
        self._stale_cookie_files: List[str] = []
        self._last_pinned_posts = {}

        if not self.ytdlp_path:
            self.log("yt-dlp not found, TikTok listing will be disabled", 'warning')
        if not self.gallery_dl_path:
            self.log("gallery-dl not found, TikTok downloading will be disabled", 'warning')

    def _find_executable(self, name: str) -> Optional[str]:
        """
        Find an executable by name.

        Well-known install locations are checked first, then ``PATH``.

        Returns:
            Absolute path of the executable, or None when it cannot be found.
        """
        common_paths = [
            f'/opt/media-downloader/venv/bin/{name}',
            f'/usr/local/bin/{name}',
            f'/usr/bin/{name}',
            f'/opt/homebrew/bin/{name}',
            os.path.expanduser(f'~/.local/bin/{name}'),
        ]
        for path in common_paths:
            if os.path.isfile(path) and os.access(path, os.X_OK):
                return path
        # shutil.which performs the PATH lookup in-process (no `which` binary needed).
        return shutil.which(name)

    def is_available(self) -> bool:
        """Check if both yt-dlp and gallery-dl are available"""
        return self.ytdlp_path is not None and self.gallery_dl_path is not None

    def cleanup(self):
        """Clean up any temporary files (current and superseded cookie files)."""
        candidates = list(self._stale_cookie_files)
        if self._cookies_file:
            candidates.append(self._cookies_file)
        for path in candidates:
            try:
                os.unlink(path)
            except OSError:
                # Best effort: the file may already be gone.
                pass
        self._stale_cookie_files.clear()
        self._cookies_file = None

    @staticmethod
    def _write_netscape_cookies(cookies_list: List[Dict]) -> str:
        """
        Write cookies to a new temporary file in Netscape cookie-file format.

        The temporary file is removed again if writing fails, so callers never
        see a partially written file.

        Returns:
            Path of the written file.
        """
        fd, tmp_path = tempfile.mkstemp(suffix='.txt', prefix='tiktok_cookies_')
        try:
            with os.fdopen(fd, 'w') as f:
                f.write("# Netscape HTTP Cookie File\n")
                for cookie in cookies_list:
                    domain = cookie.get('domain', '')
                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                    path = cookie.get('path', '/')
                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                    # `or 0` tolerates an explicit null; float() tolerates "123.0".
                    expiry = str(int(float(cookie.get('expirationDate') or 0)))
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
        except Exception:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
        return tmp_path

    def _get_cookies_file(self) -> Optional[str]:
        """
        Get path to cookies file, creating from database if needed.

        Looks up the ``tiktok`` and then ``tiktok_client`` rows of the
        ``scrapers`` table and uses the first one that holds a non-empty
        cookie list (either a bare list or a dict with a ``cookies`` key).

        Returns:
            Path to a Netscape-format cookie file, or None when no database
            is configured, no cookies are stored, or loading fails.
        """
        if self._cookies_file and os.path.exists(self._cookies_file):
            return self._cookies_file

        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                # Check for tiktok scraper cookies
                for scraper_id in ('tiktok', 'tiktok_client'):
                    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                    row = cursor.fetchone()
                    if not (row and row[0]):
                        continue

                    data = json.loads(row[0])
                    if isinstance(data, dict) and 'cookies' in data:
                        cookies_list = data['cookies']
                    elif isinstance(data, list):
                        cookies_list = data
                    else:
                        cookies_list = []

                    if cookies_list:
                        # Only publish the path once the file is fully written.
                        self._cookies_file = self._write_netscape_cookies(cookies_list)
                        self.log(f"Loaded {len(cookies_list)} TikTok cookies", 'debug')
                        return self._cookies_file
        except Exception as e:
            self.log(f"Could not load TikTok cookies: {e}", 'debug')

        return None

    def _save_cookies_back(self):
        """
        Read updated cookies from temp file and save back to database.

        yt-dlp and gallery-dl update the cookies file with refreshed tokens
        from TikTok (e.g. msToken), so we need to persist those changes.
        Cookies from the file override stored cookies with the same
        (name, domain); stored cookies not present in the file are kept.
        Failures are logged at debug level and otherwise ignored.
        """
        if not self._cookies_file or not os.path.exists(self._cookies_file):
            return
        if not self.unified_db:
            return

        try:
            jar = http.cookiejar.MozillaCookieJar(self._cookies_file)
            jar.load(ignore_discard=True, ignore_expires=True)

            updated_cookies = [
                {
                    'name': cookie.name,
                    'value': cookie.value,
                    'domain': cookie.domain,
                    'path': cookie.path,
                    'secure': cookie.secure,
                    'expirationDate': cookie.expires or 0,
                }
                for cookie in jar
            ]
            if not updated_cookies:
                return

            # Merge updated cookies back to DB
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('tiktok',))
                row = cursor.fetchone()
                if row and row[0]:
                    existing_data = json.loads(row[0])
                    existing_cookies = (
                        existing_data if isinstance(existing_data, list)
                        else existing_data.get('cookies', [])
                    )
                    # Merge: updated cookies override existing by name+domain
                    cookie_map = {(c.get('name'), c.get('domain')): c for c in existing_cookies}
                    for c in updated_cookies:
                        cookie_map[(c['name'], c['domain'])] = c
                    final_cookies = list(cookie_map.values())
                else:
                    final_cookies = updated_cookies

            self.unified_db.save_scraper_cookies('tiktok', final_cookies, merge=False)
            self.log(f"Saved {len(final_cookies)} refreshed cookies back to DB", 'debug')

            # Clear cached file so next use gets fresh cookies from DB. The old
            # file is not deleted here because another subprocess may still be
            # reading it; cleanup() removes it later.
            self._stale_cookie_files.append(self._cookies_file)
            self._cookies_file = None
        except Exception as e:
            self.log(f"Failed to save cookies back: {e}", 'debug')

    def _get_base_cmd(self) -> List[str]:
        """Get base yt-dlp command with cookies if available."""
        cmd = [self.ytdlp_path]
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])
        return cmd

    @staticmethod
    async def _run_command(cmd: List[str]) -> Tuple[int, bytes, bytes]:
        """Run a command, capturing output; returns (returncode, stdout, stderr)."""
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await proc.communicate()
        return proc.returncode, stdout, stderr

    @staticmethod
    def _iter_json_objects(raw: bytes) -> Iterator[Dict]:
        """Yield each JSON object found in newline-delimited output, skipping bad lines."""
        for line in raw.decode('utf-8', errors='replace').strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Anything that is not a JSON object cannot carry metadata fields.
            if isinstance(parsed, dict):
                yield parsed

    @classmethod
    def _first_creator_name(cls, raw: bytes, default: str) -> str:
        """Return the display name from the first JSON object in yt-dlp output, else *default*."""
        for data in cls._iter_json_objects(raw):
            return (data.get('channel') or data.get('uploader')
                    or data.get('playlist_title') or default)
        return default

    @staticmethod
    def extract_username(url: str) -> Optional[str]:
        """Extract username from TikTok URL (the part after ``tiktok.com/@``), or None."""
        match = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
        if match:
            return match.group(1)
        return None

    @staticmethod
    def normalize_creator_url(username: str) -> str:
        """Convert username to a consistent URL format; full URLs are returned unchanged."""
        if username.startswith('http://') or username.startswith('https://'):
            return username
        username = username.lstrip('@')
        return f"https://www.tiktok.com/@{username}"

    async def _video_url_from_oembed(self, username: str) -> Optional[str]:
        """Look for a video URL of *username* in TikTok's oembed response; None if absent."""
        oembed_url = f"https://www.tiktok.com/oembed?url=https://www.tiktok.com/@{username}"
        async with aiohttp.ClientSession() as session:
            async with session.get(oembed_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status != 200:
                    return None
                data = await resp.json()

        embed_html = data.get('html') or ''
        # Extract video URL from embed iframe
        match = re.search(r'cite="(https://www\.tiktok\.com/@[^"]+/video/\d+)"', embed_html)
        if match:
            return match.group(1)
        match = re.search(r'data-video-id="(\d+)"', embed_html)
        if match:
            return f"https://www.tiktok.com/@{username}/video/{match.group(1)}"

        # oembed thumbnail_url sometimes contains the video ID
        thumb = data.get('thumbnail_url') or ''
        vid_match = re.search(r'/video/(\d+)', thumb)
        if vid_match:
            return f"https://www.tiktok.com/@{username}/video/{vid_match.group(1)}"
        return None

    def _video_url_from_db(self, username: str) -> Optional[str]:
        """Return a previously stored TikTok download URL for *username*, if any."""
        if not self.unified_db:
            return None
        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT a.download_url FROM paid_content_attachments a
                    JOIN paid_content_posts p ON a.post_id = p.id
                    JOIN paid_content_creators c ON p.creator_id = c.id
                    WHERE c.username = ? AND a.download_url LIKE '%tiktok.com%'
                    LIMIT 1
                """, (username,))
                row = cursor.fetchone()
                if row and row[0]:
                    return row[0]
        except Exception:
            # Best effort: the lookup is only a fallback source.
            pass
        return None

    async def _resolve_channel_id(self, username: str) -> Optional[str]:
        """
        Resolve a TikTok username to a channel_id (secUid).

        When yt-dlp can't extract the secondary user ID from the profile page,
        we try to find a video URL from TikTok's oembed response (or, failing
        that, from previously stored attachments in the database) and then
        extract the channel_id from that video's metadata via yt-dlp.

        Returns:
            The channel_id, or None when it cannot be resolved.
        """
        if not self.ytdlp_path:
            return None

        try:
            # Step 1: Get a video URL from this user via the oembed embed HTML.
            # A failure here must not prevent the database fallback below.
            video_url = None
            try:
                video_url = await self._video_url_from_oembed(username)
            except (aiohttp.ClientError, asyncio.TimeoutError,
                    ValueError, AttributeError, TypeError) as e:
                self.log(f"oembed lookup failed for @{username}: {e}", 'debug')

            if not video_url:
                # Step 1b: Check if we have any existing video URLs in the database
                video_url = self._video_url_from_db(username)

            if not video_url:
                self.log(f"No video URL found for @{username} to resolve channel_id", 'debug')
                return None

            # Step 2: Use yt-dlp to get the channel_id from the single video
            self.log(f"Resolving channel_id from video: {video_url}", 'debug')
            cmd = self._get_base_cmd() + [
                '-j',
                '--no-warnings',
                '--no-download',
                '--socket-timeout', '30',
                video_url
            ]
            returncode, stdout, _ = await self._run_command(cmd)

            if returncode == 0:
                for video_data in self._iter_json_objects(stdout):
                    channel_id = video_data.get('channel_id') or video_data.get('playlist_id')
                    if channel_id:
                        self.log(f"Resolved @{username} channel_id: {str(channel_id)[:30]}...", 'info')
                        return channel_id
        except Exception as e:
            self.log(f"Failed to resolve channel_id for @{username}: {e}", 'debug')

        return None

    async def get_creator_info(self, url: str) -> Optional[Dict]:
        """
        Get creator information using yt-dlp + profile page scraping.

        The display name comes from yt-dlp's first playlist entry (with a
        ``tiktokuser:<channel_id>`` retry when the user ID cannot be
        extracted); avatar and bio come from the profile page. The scraped
        nickname is used only if yt-dlp did not provide a display name.

        Returns:
            Dict with ``creator_id``, ``creator_name``, ``creator_url``,
            ``profile_image_url`` and ``bio``; None if *url* has no username.
        """
        username = self.extract_username(url)
        if not username:
            return None

        profile_url = self.normalize_creator_url(username)
        creator_name = username

        # Try yt-dlp for display name from video metadata
        if self.ytdlp_path:
            listing_args = [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--playlist-items', '1',
                '--socket-timeout', '30',
            ]
            try:
                returncode, stdout, stderr = await self._run_command(
                    self._get_base_cmd() + listing_args + [profile_url]
                )
                if returncode == 0:
                    creator_name = self._first_creator_name(stdout, username)
                else:
                    # Fallback: try tiktokuser: scheme if secondary user ID extraction fails
                    err_text = stderr.decode('utf-8', errors='replace')
                    if 'secondary user ID' in err_text or 'Unable to extract' in err_text:
                        channel_id = await self._resolve_channel_id(username)
                        if channel_id:
                            fb_returncode, fb_stdout, _ = await self._run_command(
                                self._get_base_cmd() + listing_args + [f"tiktokuser:{channel_id}"]
                            )
                            if fb_returncode == 0:
                                creator_name = self._first_creator_name(fb_stdout, username)
            except Exception as e:
                self.log(f"Failed to get creator info via yt-dlp: {e}", 'debug')

        # Scrape profile page for avatar and bio
        profile_image = None
        bio = None
        try:
            profile_image, bio, page_name = await self._scrape_profile_page(profile_url)
            if page_name and creator_name == username:
                creator_name = page_name
        except Exception as e:
            self.log(f"Failed to scrape profile page: {e}", 'debug')

        return {
            'creator_id': username,
            'creator_name': creator_name,
            'creator_url': profile_url,
            'profile_image_url': profile_image,
            'bio': bio,
        }

    async def _fetch_profile_with_cookies(self, url: str) -> Optional[str]:
        """
        Fetch TikTok profile page using curl_cffi with cookies from database.

        Returns:
            The page HTML when the response is 200 and contains profile data
            (``avatarLarger``); None otherwise, including when no cookies are
            available or curl_cffi cannot be imported.
        """
        cookies_file = self._get_cookies_file()
        if not cookies_file:
            return None

        try:
            from curl_cffi import requests as cf_requests

            # Load cookies from the Netscape file
            jar = http.cookiejar.MozillaCookieJar(cookies_file)
            jar.load(ignore_discard=True, ignore_expires=True)

            # Try multiple browser versions for curl_cffi compatibility
            session = None
            for browser in ("chrome136", "chrome131", "chrome"):
                try:
                    session = cf_requests.Session(impersonate=browser)
                    break
                except Exception:
                    continue
            if session is None:
                session = cf_requests.Session()

            try:
                for cookie in jar:
                    session.cookies.set(cookie.name, cookie.value, domain=cookie.domain)

                # NOTE(review): this is a synchronous request inside a coroutine;
                # it blocks the event loop for up to the timeout.
                resp = session.get(url, timeout=15)
                if resp.status_code == 200 and 'avatarLarger' in resp.text:
                    self.log("Fetched TikTok profile with cookies (curl_cffi)", 'debug')
                    return resp.text
                if 'captcha' in resp.text.lower():
                    self.log("TikTok profile still returned captcha with cookies", 'debug')
            finally:
                # Always release the session, including on the success return.
                session.close()
        except Exception as e:
            self.log(f"curl_cffi profile fetch failed: {e}", 'debug')

        return None

    @staticmethod
    def _decode_json_string(raw: str) -> str:
        """Decode a raw JSON string body (handles \\uXXXX escapes); return it unchanged on failure."""
        try:
            return json.loads(f'"{raw}"')
        except (json.JSONDecodeError, ValueError):
            return raw

    async def _scrape_profile_page(self, url: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """
        Scrape TikTok profile page for avatar and bio from embedded JSON data.

        TikTok embeds user data in __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag.
        That JSON is tried first; any field it does not yield is then looked
        up with a regex over the raw HTML. As a side effect, pinned post IDs
        found in the JSON replace ``self._last_pinned_posts``.

        Returns (profile_image_url, bio, display_name); each may be None.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        profile_image = None
        bio = None
        display_name = None

        try:
            page_html = None
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status == 200:
                        page_html = await resp.text()

            # If we got a captcha page, try curl_cffi with cookies
            if not page_html or ('captcha' in page_html.lower() and 'avatarLarger' not in page_html):
                page_html = await self._fetch_profile_with_cookies(url)

            if not page_html:
                return (None, None, None)

            # Try structured JSON first (__UNIVERSAL_DATA_FOR_REHYDRATION__)
            rehydration_match = re.search(
                r'<script[^>]*id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
                page_html, re.DOTALL
            )
            if rehydration_match:
                try:
                    rdata = json.loads(rehydration_match.group(1))
                    user_detail = (rdata.get('__DEFAULT_SCOPE__', {})
                                   .get('webapp.user-detail', {}))
                    user = user_detail.get('userInfo', {}).get('user', {})
                    if user:
                        avatar_val = user.get('avatarLarger') or user.get('avatarMedium')
                        if avatar_val and not avatar_val.endswith('.mp4'):
                            profile_image = avatar_val
                            self.log("Found TikTok profile avatar (rehydration)", 'debug')

                        sig_val = user.get('signature', '')
                        if sig_val and sig_val.strip():
                            bio = sig_val.strip()
                            self.log("Found TikTok bio (rehydration)", 'debug')

                        nick_val = user.get('nickname')
                        if nick_val:
                            display_name = nick_val
                            self.log(f"Found TikTok display name (rehydration): {display_name}", 'debug')

                        # Extract pinned post IDs
                        # NOTE(review): when no pinned list is present the previous
                        # value of _last_pinned_posts is left untouched - confirm
                        # callers expect that.
                        pinned_list = user_detail.get('pinnedList', [])
                        if pinned_list:
                            self._last_pinned_posts = {}
                            for item in pinned_list:
                                vid = str(item.get('id', ''))
                                if vid:
                                    self._last_pinned_posts[vid] = {'pinned_at': None}
                            if self._last_pinned_posts:
                                self.log(f"Found {len(self._last_pinned_posts)} pinned TikTok posts", 'debug')
                except (ValueError, KeyError, AttributeError, TypeError):
                    # Unexpected JSON shape: fall through to the regex fallback
                    # instead of abandoning the scrape.
                    pass

            # Fallback: regex extraction from raw HTML
            # Use json.loads to decode values (handles \uXXXX, surrogate pairs, and raw UTF-8)
            if not profile_image:
                avatar_match = re.search(r'"avatarLarger":"([^"]+)"', page_html)
                if not avatar_match:
                    avatar_match = re.search(r'"avatarMedium":"([^"]+)"', page_html)
                if avatar_match:
                    avatar_url = self._decode_json_string(avatar_match.group(1))
                    if avatar_url and not avatar_url.endswith('.mp4'):
                        profile_image = avatar_url
                        self.log("Found TikTok profile avatar", 'debug')

            if not bio:
                sig_match = re.search(r'"signature":"([^"]*)"', page_html)
                if sig_match:
                    raw_bio = self._decode_json_string(sig_match.group(1))
                    if raw_bio and raw_bio.strip():
                        bio = raw_bio.strip()
                        self.log("Found TikTok bio", 'debug')

            if not display_name:
                nick_match = re.search(r'"nickname":"([^"]+)"', page_html)
                if nick_match:
                    display_name = self._decode_json_string(nick_match.group(1))
                    self.log(f"Found TikTok display name: {display_name}", 'debug')

            # Extract banner/cover from "coverLarger" field
            # (stored separately, not returned here but could be used later)

        except asyncio.TimeoutError:
            self.log("TikTok profile page request timed out", 'debug')
        except Exception as e:
            self.log(f"Error scraping TikTok profile: {e}", 'debug')

        return (profile_image, bio, display_name)

    async def get_creator_videos(self, url: str, since_date: Optional[str] = None,
                                 max_videos: Optional[int] = None,
                                 progress_callback=None) -> List[Dict]:
        """
        Get all videos from a TikTok profile using yt-dlp --flat-playlist -j.

        Uses JSON output to properly handle multi-line descriptions/titles.

        Args:
            url: Profile URL containing ``tiktok.com/@<username>``.
            since_date: Optional ISO date/datetime; entries with an older
                ``upload_date`` are skipped (entries without a date are kept).
            max_videos: Optional cap on the number of returned videos.
            progress_callback: Optional callable invoked with the running
                count after each accepted video.

        Returns:
            List of dicts with ``video_id``, ``title``, ``description``,
            ``upload_date`` (``YYYY-MM-DD`` or None), ``url`` and ``username``.
            Empty list on any failure.
        """
        if not self.ytdlp_path:
            return []

        username = self.extract_username(url)
        if not username:
            return []

        profile_url = self.normalize_creator_url(username)

        try:
            # Use yt-dlp flat-playlist with JSON output for full metadata
            listing_args = [
                '--flat-playlist',
                '-j',
                '--no-warnings',
                '--socket-timeout', '30',
            ]
            self.log(f"Fetching TikTok videos for @{username}", 'info')
            returncode, stdout, stderr = await self._run_command(
                self._get_base_cmd() + listing_args + [profile_url]
            )

            if returncode != 0:
                error = stderr.decode('utf-8', errors='replace')
                # Fallback: if yt-dlp can't extract secondary user ID, try tiktokuser: scheme
                if 'secondary user ID' not in error and 'Unable to extract' not in error:
                    self.log(f"Failed to list TikTok videos: {error}", 'warning')
                    return []

                self.log(f"yt-dlp can't extract user ID for @{username}, trying channel_id fallback", 'info')
                channel_id = await self._resolve_channel_id(username)
                if not channel_id:
                    self.log(f"Could not resolve channel_id for @{username}", 'warning')
                    return []

                fb_returncode, stdout, stderr = await self._run_command(
                    self._get_base_cmd() + listing_args + [f"tiktokuser:{channel_id}"]
                )
                if fb_returncode != 0:
                    fb_error = stderr.decode('utf-8', errors='replace')
                    self.log(f"Fallback also failed for @{username}: {fb_error}", 'warning')
                    return []
                self.log(f"Fallback tiktokuser: succeeded for @{username}", 'info')

            # Parse since_date for filtering (compared as YYYYMMDD strings)
            cutoff_str = None
            if since_date:
                try:
                    if 'T' in since_date:
                        cutoff_dt = datetime.fromisoformat(since_date.replace('Z', '+00:00').replace('+00:00', ''))
                    else:
                        cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
                    cutoff_str = cutoff_dt.strftime('%Y%m%d')
                except (ValueError, IndexError):
                    pass

            videos = []
            for data in self._iter_json_objects(stdout):
                video_id = str(data.get('id', ''))
                if not video_id:
                    continue

                upload_date = data.get('upload_date', '')
                title = data.get('title', '')
                description = data.get('description', '')

                # Skip posts where yt-dlp returned no metadata at all
                # When cookies are expired, yt-dlp returns no date, no title,
                # and no description. Real posts with empty captions still have
                # upload_date, so we use that as the key signal.
                if not upload_date and not title and not description:
                    self.log(f"Skipping TikTok {video_id}: no metadata (cookies may be expired)", 'debug')
                    continue

                title = title or description or f"TikTok video #{video_id}"
                description = description or title

                # Filter by date if cutoff specified
                if cutoff_str and upload_date and upload_date < cutoff_str:
                    continue

                # Format upload_date to ISO
                formatted_date = None
                if upload_date and len(upload_date) == 8 and upload_date.isdigit():
                    formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"

                video_url = data.get('url') or f"https://www.tiktok.com/@{username}/video/{video_id}"

                videos.append({
                    'video_id': video_id,
                    'title': title,
                    'description': description,
                    'upload_date': formatted_date,
                    'url': video_url,
                    'username': username,
                })

                if progress_callback:
                    progress_callback(len(videos))

                if max_videos and len(videos) >= max_videos:
                    break

            self.log(f"Found {len(videos)} TikTok videos for @{username}", 'info')
            self._save_cookies_back()
            return videos

        except Exception as e:
            self.log(f"Error getting TikTok videos: {e}", 'error')
            self._save_cookies_back()
            return []

    async def download_video(self, video_url: str, output_dir: Path, username: str = '') -> Dict:
        """
        Download a TikTok video/carousel using gallery-dl.

        gallery-dl handles both regular videos and carousel/slideshow posts.
        Only files that did not exist in *output_dir* before the run are
        reported; ``.json`` metadata and audio-only files are excluded.

        Args:
            video_url: URL of the post to download.
            output_dir: Directory to download into (created if missing).
            username: Unused by this method; kept for interface compatibility.

        Returns:
            On success: dict with ``success`` True, ``file_path``,
            ``filename``, ``file_size`` (of the first file by name),
            ``all_files``, ``file_count`` and ``is_carousel``.
            On failure: dict with ``success`` False and ``error``.
        """
        if not self.gallery_dl_path:
            return {'success': False, 'error': 'gallery-dl not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            cmd = [
                self.gallery_dl_path,
                '--write-metadata',
                '-D', str(output_dir),
                '-f', '{id}_{num}.{extension}',
            ]

            # Add cookies for age-restricted / login-required content
            cookies_file = self._get_cookies_file()
            if cookies_file:
                cmd.extend(['--cookies', cookies_file])

            cmd.append(video_url)

            self.log(f"Downloading TikTok: {video_url}", 'debug')

            # Snapshot existing files before download so we only pick up new ones
            existing_files = {f.name for f in output_dir.iterdir() if f.is_file()}

            returncode, _, stderr = await self._run_command(cmd)

            # Find newly downloaded files (exclude .json metadata and audio-only files)
            skipped_exts = ('.json',) + self._AUDIO_EXTS
            downloaded_files = [
                f for f in output_dir.iterdir()
                if f.is_file()
                and f.name not in existing_files
                and f.suffix.lower() not in skipped_exts
            ]

            if returncode != 0:
                # gallery-dl exit code 4 = partial failure (e.g. slideshow images OK but audio failed)
                # If we got media files, treat as success
                if downloaded_files:
                    self.log(f"gallery-dl partial failure (code {returncode}) but {len(downloaded_files)} files downloaded", 'debug')
                else:
                    error_msg = stderr.decode('utf-8', errors='replace').strip()
                    if 'not available' in error_msg.lower() or '404' in error_msg:
                        error_msg = 'Video not available (deleted or private)'
                    elif len(error_msg) > 200:
                        error_msg = error_msg[:200] + '...'
                    return {'success': False, 'error': error_msg}

            if not downloaded_files:
                return {'success': False, 'error': 'No files downloaded'}

            # Sort by name to maintain carousel order (e.g. id_1.jpg, id_2.jpg)
            downloaded_files.sort(key=lambda f: f.name)
            primary_file = downloaded_files[0]

            # Determine if this is a photo carousel (multiple images)
            is_carousel = len(downloaded_files) > 1 and all(
                f.suffix.lower() in self._IMAGE_EXTS for f in downloaded_files
            )

            self._save_cookies_back()

            return {
                'success': True,
                'file_path': str(primary_file),
                'filename': primary_file.name,
                'file_size': primary_file.stat().st_size,
                'all_files': [str(f) for f in downloaded_files],
                'file_count': len(downloaded_files),
                'is_carousel': is_carousel,
            }

        except Exception as e:
            self.log(f"Error downloading TikTok video: {e}", 'error')
            self._save_cookies_back()
            return {'success': False, 'error': str(e)}

    async def get_creator(self, url: str) -> Optional[Creator]:
        """Get Creator object from URL; None when creator info cannot be determined."""
        info = await self.get_creator_info(url)
        if not info:
            return None

        username = info.get('creator_id', '')

        return Creator(
            creator_id=username,
            service_id='tiktok',
            platform='tiktok',
            # NOTE(review): `username` is populated with the display name
            # (creator_name), not the @handle - confirm this is intended, since
            # the channel_id DB fallback matches creators by handle.
            username=info.get('creator_name', username),
            display_name=info.get('creator_name'),
            profile_image_url=info.get('profile_image_url'),
            bio=info.get('bio'),
        )

    async def get_posts(self, url: str, since_date: Optional[str] = None,
                        max_videos: Optional[int] = None,
                        progress_callback=None) -> List[Post]:
        """
        Get TikTok videos as Post objects.

        Arguments are passed straight to :meth:`get_creator_videos`. Each post
        gets one placeholder video attachment whose URLs point at the post.
        """
        videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
        username = self.extract_username(url) or ''

        posts = []
        for video in videos:
            # Each TikTok post could be video or carousel
            # We create a single attachment for now; the actual download determines type
            attachment = Attachment(
                name=f"{video['video_id']}.mp4",
                file_type='video',
                extension='.mp4',
                server_path=video['url'],
                download_url=video['url'],
            )

            post = Post(
                post_id=video['video_id'],
                service_id='tiktok',
                platform='tiktok',
                creator_id=username,
                title=None,
                content=video.get('description') or video.get('title', ''),
                published_at=video.get('upload_date'),
                attachments=[attachment],
            )
            posts.append(post)

        return posts