827
modules/paid_content/tiktok_client.py
Normal file
827
modules/paid_content/tiktok_client.py
Normal file
@@ -0,0 +1,827 @@
|
||||
"""
|
||||
TikTok Client for Paid Content - Uses yt-dlp for listing and gallery-dl for downloading
|
||||
|
||||
Adapts the hybrid approach from modules/tiktok_module.py into the paid content client pattern.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html as html_module
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class TikTokClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching TikTok creator information and videos.
|
||||
|
||||
Uses yt-dlp for listing (fast flat-playlist) and gallery-dl for downloading
|
||||
(handles carousels/slideshows properly).
|
||||
"""
|
||||
|
||||
SERVICE_ID = 'tiktok'
|
||||
PLATFORM = 'tiktok'
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
|
||||
self._init_logger('PaidContent', log_callback, default_module='TikTok')
|
||||
|
||||
self.ytdlp_path = self._find_executable('yt-dlp')
|
||||
self.gallery_dl_path = self._find_executable('gallery-dl')
|
||||
self.unified_db = unified_db
|
||||
self._cookies_file = None
|
||||
self._last_pinned_posts = {}
|
||||
|
||||
if not self.ytdlp_path:
|
||||
self.log("yt-dlp not found, TikTok listing will be disabled", 'warning')
|
||||
if not self.gallery_dl_path:
|
||||
self.log("gallery-dl not found, TikTok downloading will be disabled", 'warning')
|
||||
|
||||
def _find_executable(self, name: str) -> Optional[str]:
|
||||
"""Find an executable by name"""
|
||||
common_paths = [
|
||||
f'/opt/media-downloader/venv/bin/{name}',
|
||||
f'/usr/local/bin/{name}',
|
||||
f'/usr/bin/{name}',
|
||||
f'/opt/homebrew/bin/{name}',
|
||||
os.path.expanduser(f'~/.local/bin/{name}'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', name], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if both yt-dlp and gallery-dl are available"""
|
||||
return self.ytdlp_path is not None and self.gallery_dl_path is not None
|
||||
|
||||
def cleanup(self):
|
||||
"""Clean up any temporary files"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
try:
|
||||
os.unlink(self._cookies_file)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
    def _get_cookies_file(self) -> Optional[str]:
        """Get path to cookies file, creating from database if needed.

        Looks up the 'tiktok' (or legacy 'tiktok_client') scraper row in the
        unified DB, converts its JSON cookie list into a Netscape-format
        temp file, caches the path on the instance, and returns it.

        Returns:
            Path to a Netscape cookies file, or None when no DB handle is
            set or no usable cookies are stored.
        """
        # Reuse the previously written temp file if it still exists on disk.
        if self._cookies_file and os.path.exists(self._cookies_file):
            return self._cookies_file

        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                # Check for tiktok scraper cookies
                for scraper_id in ('tiktok', 'tiktok_client'):
                    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                    row = cursor.fetchone()
                    if row and row[0]:
                        # Stored JSON may be either a bare list of cookies or
                        # a dict wrapping the list under a 'cookies' key.
                        data = json.loads(row[0])
                        if isinstance(data, dict) and 'cookies' in data:
                            cookies_list = data['cookies']
                        elif isinstance(data, list):
                            cookies_list = data
                        else:
                            cookies_list = []

                        if cookies_list:
                            import tempfile
                            fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='tiktok_cookies_')
                            with os.fdopen(fd, 'w') as f:
                                # Header line required by Netscape-format readers.
                                f.write("# Netscape HTTP Cookie File\n")
                                for cookie in cookies_list:
                                    domain = cookie.get('domain', '')
                                    # Leading dot on the domain means the cookie
                                    # applies to subdomains as well.
                                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                                    path = cookie.get('path', '/')
                                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                                    expiry = str(int(cookie.get('expirationDate', 0)))
                                    name = cookie.get('name', '')
                                    value = cookie.get('value', '')
                                    f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
                            self.log(f"Loaded {len(cookies_list)} TikTok cookies", 'debug')
                            return self._cookies_file
        except Exception as e:
            # Cookie loading is best-effort; callers fall back to no cookies.
            self.log(f"Could not load TikTok cookies: {e}", 'debug')

        return None
|
||||
|
||||
    def _save_cookies_back(self):
        """Read updated cookies from temp file and save back to database.

        yt-dlp and gallery-dl update the cookies file with refreshed tokens
        from TikTok (e.g. msToken), so we need to persist those changes.
        Existing DB cookies are merged with the refreshed ones; a refreshed
        cookie replaces any stored cookie with the same (name, domain) pair.
        Failures are logged at debug level and otherwise ignored.
        """
        # Nothing to persist unless a cookies temp file and a DB both exist.
        if not self._cookies_file or not os.path.exists(self._cookies_file):
            return
        if not self.unified_db:
            return

        try:
            import http.cookiejar
            jar = http.cookiejar.MozillaCookieJar(self._cookies_file)
            # Include session/expired entries so no refreshed token is dropped.
            jar.load(ignore_discard=True, ignore_expires=True)

            updated_cookies = []
            for cookie in jar:
                updated_cookies.append({
                    'name': cookie.name,
                    'value': cookie.value,
                    'domain': cookie.domain,
                    'path': cookie.path,
                    'secure': cookie.secure,
                    'expirationDate': cookie.expires or 0,
                })

            if not updated_cookies:
                return

            # Merge updated cookies back to DB
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('tiktok',))
                row = cursor.fetchone()

                if row and row[0]:
                    existing_data = json.loads(row[0])
                    # DB may hold a bare list or a dict with a 'cookies' key.
                    existing_cookies = existing_data if isinstance(existing_data, list) else existing_data.get('cookies', [])
                    # Merge: updated cookies override existing by name+domain
                    cookie_map = {(c.get('name'), c.get('domain')): c for c in existing_cookies}
                    for c in updated_cookies:
                        cookie_map[(c['name'], c['domain'])] = c
                    final_cookies = list(cookie_map.values())
                else:
                    final_cookies = updated_cookies

            # merge=False: we already merged above, so overwrite the row.
            self.unified_db.save_scraper_cookies('tiktok', final_cookies, merge=False)
            self.log(f"Saved {len(final_cookies)} refreshed cookies back to DB", 'debug')

            # Clear cached file so next use gets fresh cookies from DB
            self._cookies_file = None
        except Exception as e:
            self.log(f"Failed to save cookies back: {e}", 'debug')
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available."""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
@staticmethod
|
||||
def extract_username(url: str) -> Optional[str]:
|
||||
"""Extract username from TikTok URL"""
|
||||
match = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_creator_url(username: str) -> str:
|
||||
"""Convert username to a consistent URL format"""
|
||||
if username.startswith('http://') or username.startswith('https://'):
|
||||
return username
|
||||
username = username.lstrip('@')
|
||||
return f"https://www.tiktok.com/@{username}"
|
||||
|
||||
    async def _resolve_channel_id(self, username: str) -> Optional[str]:
        """Resolve a TikTok username to a channel_id (secUid).

        When yt-dlp can't extract the secondary user ID from the profile page,
        we try to find a video URL from TikTok's embed/RSS and then extract
        the channel_id (secUid) from that video's metadata via yt-dlp.

        Args:
            username: Bare TikTok handle (without '@').

        Returns:
            The channel_id string, or None when no video could be located or
            yt-dlp did not report a channel_id.
        """
        if not self.ytdlp_path:
            return None

        try:
            # Step 1: Get a video URL from this user via the oembed embed HTML
            video_url = None
            async with aiohttp.ClientSession() as session:
                # The oembed HTML often contains a video ID we can use
                oembed_url = f"https://www.tiktok.com/oembed?url=https://www.tiktok.com/@{username}"
                async with session.get(oembed_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        embed_html = data.get('html', '')
                        # Extract video URL from embed iframe
                        match = re.search(r'cite="(https://www\.tiktok\.com/@[^"]+/video/\d+)"', embed_html)
                        if not match:
                            # Secondary pattern: bare video id attribute.
                            match = re.search(r'data-video-id="(\d+)"', embed_html)
                            if match:
                                video_url = f"https://www.tiktok.com/@{username}/video/{match.group(1)}"
                        else:
                            video_url = match.group(1)

                        if not video_url:
                            # oembed thumbnail_url sometimes contains the video ID
                            thumb = data.get('thumbnail_url', '')
                            vid_match = re.search(r'/video/(\d+)', thumb)
                            if vid_match:
                                video_url = f"https://www.tiktok.com/@{username}/video/{vid_match.group(1)}"

            if not video_url:
                # Step 1b: Check if we have any existing video URLs in the database
                if self.unified_db:
                    try:
                        with self.unified_db.get_connection() as conn:
                            cursor = conn.cursor()
                            cursor.execute("""
                                SELECT a.download_url FROM paid_content_attachments a
                                JOIN paid_content_posts p ON a.post_id = p.id
                                JOIN paid_content_creators c ON p.creator_id = c.id
                                WHERE c.username = ? AND a.download_url LIKE '%tiktok.com%'
                                LIMIT 1
                            """, (username,))
                            row = cursor.fetchone()
                            if row and row[0]:
                                video_url = row[0]
                    except Exception:
                        # DB lookup is opportunistic; fall through to give up.
                        pass

            if not video_url:
                self.log(f"No video URL found for @{username} to resolve channel_id", 'debug')
                return None

            # Step 2: Use yt-dlp to get the channel_id from the single video
            self.log(f"Resolving channel_id from video: {video_url}", 'debug')
            cmd = self._get_base_cmd() + [
                '-j',
                '--no-warnings',
                '--no-download',
                '--socket-timeout', '30',
                video_url
            ]

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await result.communicate()

            if result.returncode == 0:
                # yt-dlp emits one JSON object per line; take the first that
                # carries a channel_id (playlist_id is an observed fallback key).
                for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                    if not line.strip():
                        continue
                    try:
                        video_data = json.loads(line)
                        channel_id = video_data.get('channel_id') or video_data.get('playlist_id')
                        if channel_id:
                            self.log(f"Resolved @{username} channel_id: {channel_id[:30]}...", 'info')
                            return channel_id
                    except json.JSONDecodeError:
                        continue

        except Exception as e:
            self.log(f"Failed to resolve channel_id for @{username}: {e}", 'debug')

        return None
|
||||
|
||||
    async def get_creator_info(self, url: str) -> Optional[Dict]:
        """Get creator information using yt-dlp + profile page scraping.

        Args:
            url: Any TikTok profile URL (or URL containing '@username').

        Returns:
            Dict with creator_id, creator_name, creator_url,
            profile_image_url and bio, or None when no username could be
            extracted from the URL.
        """
        username = self.extract_username(url)
        if not username:
            return None

        profile_url = self.normalize_creator_url(username)
        # Fall back to the raw handle until a nicer display name is found.
        creator_name = username

        # Try yt-dlp for display name from video metadata
        if self.ytdlp_path:
            try:
                cmd = self._get_base_cmd() + [
                    '--no-warnings',
                    '--flat-playlist',
                    '-j',
                    '--playlist-items', '1',
                    '--socket-timeout', '30',
                    profile_url
                ]

                result = await asyncio.create_subprocess_exec(
                    *cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                stdout, stderr = await result.communicate()

                if result.returncode == 0:
                    # One JSON object per line; the first parseable one wins.
                    for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                        if not line:
                            continue
                        try:
                            data = json.loads(line)
                            creator_name = (data.get('channel') or data.get('uploader')
                                            or data.get('playlist_title') or username)
                            break
                        except json.JSONDecodeError:
                            continue
                else:
                    # Fallback: try tiktokuser: scheme if secondary user ID extraction fails
                    err_text = stderr.decode('utf-8', errors='replace')
                    if 'secondary user ID' in err_text or 'Unable to extract' in err_text:
                        channel_id = await self._resolve_channel_id(username)
                        if channel_id:
                            fb_cmd = self._get_base_cmd() + [
                                '--no-warnings', '--flat-playlist',
                                '-j', '--playlist-items', '1', '--socket-timeout', '30',
                                f"tiktokuser:{channel_id}"
                            ]
                            fb_result = await asyncio.create_subprocess_exec(
                                *fb_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
                            )
                            fb_stdout, _ = await fb_result.communicate()
                            if fb_result.returncode == 0:
                                for line in fb_stdout.decode('utf-8', errors='replace').strip().split('\n'):
                                    if not line:
                                        continue
                                    try:
                                        data = json.loads(line)
                                        creator_name = (data.get('channel') or data.get('uploader')
                                                        or data.get('playlist_title') or username)
                                        break
                                    except json.JSONDecodeError:
                                        continue
            except Exception as e:
                self.log(f"Failed to get creator info via yt-dlp: {e}", 'debug')

        # Scrape profile page for avatar and bio
        profile_image = None
        bio = None
        try:
            profile_image, bio, page_name = await self._scrape_profile_page(profile_url)
            # Prefer the scraped display name only if yt-dlp gave us nothing
            # better than the bare username.
            if page_name and creator_name == username:
                creator_name = page_name
        except Exception as e:
            self.log(f"Failed to scrape profile page: {e}", 'debug')

        return {
            'creator_id': username,
            'creator_name': creator_name,
            'creator_url': profile_url,
            'profile_image_url': profile_image,
            'bio': bio,
        }
|
||||
|
||||
async def _fetch_profile_with_cookies(self, url: str) -> Optional[str]:
|
||||
"""Fetch TikTok profile page using curl_cffi with cookies from database."""
|
||||
cookies_file = self._get_cookies_file()
|
||||
if not cookies_file:
|
||||
return None
|
||||
|
||||
try:
|
||||
from curl_cffi import requests as cf_requests
|
||||
import http.cookiejar
|
||||
|
||||
# Load cookies from the Netscape file
|
||||
jar = http.cookiejar.MozillaCookieJar(cookies_file)
|
||||
jar.load(ignore_discard=True, ignore_expires=True)
|
||||
|
||||
# Try multiple browser versions for curl_cffi compatibility
|
||||
for _browser in ("chrome136", "chrome131", "chrome"):
|
||||
try:
|
||||
session = cf_requests.Session(impersonate=_browser)
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
else:
|
||||
session = cf_requests.Session()
|
||||
for cookie in jar:
|
||||
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain)
|
||||
|
||||
resp = session.get(url, timeout=15)
|
||||
if resp.status_code == 200 and 'avatarLarger' in resp.text:
|
||||
self.log("Fetched TikTok profile with cookies (curl_cffi)", 'debug')
|
||||
return resp.text
|
||||
elif 'captcha' in resp.text.lower():
|
||||
self.log("TikTok profile still returned captcha with cookies", 'debug')
|
||||
session.close()
|
||||
except Exception as e:
|
||||
self.log(f"curl_cffi profile fetch failed: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
    async def _scrape_profile_page(self, url: str) -> tuple:
        """
        Scrape TikTok profile page for avatar and bio from embedded JSON data.
        TikTok embeds user data in __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag.
        Returns (profile_image_url, bio, display_name).

        Strategy: plain aiohttp fetch first; if that yields a captcha page,
        retry via curl_cffi with stored cookies; then parse the structured
        rehydration JSON, falling back to raw regex extraction. Also records
        pinned post IDs into self._last_pinned_posts as a side effect.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        profile_image = None
        bio = None
        display_name = None

        try:
            page_html = None
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status == 200:
                        page_html = await resp.text()

            # If we got a captcha page, try curl_cffi with cookies
            if not page_html or ('captcha' in page_html.lower() and 'avatarLarger' not in page_html):
                page_html = await self._fetch_profile_with_cookies(url)
                if not page_html:
                    return (None, None, None)

            # Try structured JSON first (__UNIVERSAL_DATA_FOR_REHYDRATION__)
            rehydration_match = re.search(
                r'<script[^>]*id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
                page_html, re.DOTALL
            )
            if rehydration_match:
                try:
                    rdata = json.loads(rehydration_match.group(1))
                    user_detail = (rdata.get('__DEFAULT_SCOPE__', {})
                                   .get('webapp.user-detail', {}))
                    user = user_detail.get('userInfo', {}).get('user', {})
                    if user:
                        # Prefer the large avatar; skip video avatars (.mp4).
                        avatar_val = user.get('avatarLarger') or user.get('avatarMedium')
                        if avatar_val and not avatar_val.endswith('.mp4'):
                            profile_image = avatar_val
                            self.log("Found TikTok profile avatar (rehydration)", 'debug')
                        sig_val = user.get('signature', '')
                        if sig_val and sig_val.strip():
                            bio = sig_val.strip()
                            self.log("Found TikTok bio (rehydration)", 'debug')
                        nick_val = user.get('nickname')
                        if nick_val:
                            display_name = nick_val
                            self.log(f"Found TikTok display name (rehydration): {display_name}", 'debug')

                    # Extract pinned post IDs
                    pinned_list = user_detail.get('pinnedList', [])
                    if pinned_list:
                        self._last_pinned_posts = {}
                        for item in pinned_list:
                            vid = str(item.get('id', ''))
                            if vid:
                                # pinned_at unknown from this endpoint.
                                self._last_pinned_posts[vid] = {'pinned_at': None}
                        if self._last_pinned_posts:
                            self.log(f"Found {len(self._last_pinned_posts)} pinned TikTok posts", 'debug')
                except (json.JSONDecodeError, KeyError):
                    # Malformed rehydration payload; regex fallback below.
                    pass

            # Fallback: regex extraction from raw HTML
            # Use json.loads to decode values (handles \uXXXX, surrogate pairs, and raw UTF-8)
            if not profile_image:
                avatar_match = re.search(r'"avatarLarger":"([^"]+)"', page_html)
                if not avatar_match:
                    avatar_match = re.search(r'"avatarMedium":"([^"]+)"', page_html)
                if avatar_match:
                    try:
                        avatar_url = json.loads(f'"{avatar_match.group(1)}"')
                    except (json.JSONDecodeError, ValueError):
                        avatar_url = avatar_match.group(1)
                    if avatar_url and not avatar_url.endswith('.mp4'):
                        profile_image = avatar_url
                        self.log("Found TikTok profile avatar", 'debug')

            if not bio:
                sig_match = re.search(r'"signature":"([^"]*)"', page_html)
                if sig_match:
                    try:
                        raw_bio = json.loads(f'"{sig_match.group(1)}"')
                    except (json.JSONDecodeError, ValueError):
                        raw_bio = sig_match.group(1)
                    if raw_bio and raw_bio.strip():
                        bio = raw_bio.strip()
                        self.log("Found TikTok bio", 'debug')

            if not display_name:
                nick_match = re.search(r'"nickname":"([^"]+)"', page_html)
                if nick_match:
                    try:
                        display_name = json.loads(f'"{nick_match.group(1)}"')
                    except (json.JSONDecodeError, ValueError):
                        display_name = nick_match.group(1)
                    self.log(f"Found TikTok display name: {display_name}", 'debug')

            # Extract banner/cover from "coverLarger" field
            # (stored separately, not returned here but could be used later)

        except asyncio.TimeoutError:
            self.log("TikTok profile page request timed out", 'debug')
        except Exception as e:
            self.log(f"Error scraping TikTok profile: {e}", 'debug')

        return (profile_image, bio, display_name)
|
||||
|
||||
    async def get_creator_videos(self, url: str, since_date: str = None,
                                 max_videos: int = None,
                                 progress_callback=None) -> List[Dict]:
        """
        Get all videos from a TikTok profile using yt-dlp --flat-playlist -j.

        Uses JSON output to properly handle multi-line descriptions/titles.
        Returns list of video metadata dicts with video_id and upload_date.

        Args:
            url: Profile URL (anything extract_username() accepts).
            since_date: Optional ISO date/datetime string; older posts are
                filtered out.
            max_videos: Optional cap on the number of results.
            progress_callback: Optional callable invoked with the running
                count of collected videos.
        """
        if not self.ytdlp_path:
            return []

        username = self.extract_username(url)
        if not username:
            return []

        profile_url = self.normalize_creator_url(username)

        try:
            # Use yt-dlp flat-playlist with JSON output for full metadata
            cmd = self._get_base_cmd() + [
                '--flat-playlist',
                '-j',
                '--no-warnings',
                '--socket-timeout', '30',
                profile_url
            ]

            self.log(f"Fetching TikTok videos for @{username}", 'info')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                error = stderr.decode('utf-8', errors='replace')

                # Fallback: if yt-dlp can't extract secondary user ID, try tiktokuser: scheme
                if 'secondary user ID' in error or 'Unable to extract' in error:
                    self.log(f"yt-dlp can't extract user ID for @{username}, trying channel_id fallback", 'info')
                    channel_id = await self._resolve_channel_id(username)
                    if channel_id:
                        fallback_cmd = self._get_base_cmd() + [
                            '--flat-playlist',
                            '-j',
                            '--no-warnings',
                            '--socket-timeout', '30',
                            f"tiktokuser:{channel_id}"
                        ]
                        fb_result = await asyncio.create_subprocess_exec(
                            *fallback_cmd,
                            stdout=asyncio.subprocess.PIPE,
                            stderr=asyncio.subprocess.PIPE
                        )
                        # Rebind stdout so the parsing loop below consumes the
                        # fallback output.
                        stdout, stderr = await fb_result.communicate()
                        if fb_result.returncode == 0:
                            self.log(f"Fallback tiktokuser: succeeded for @{username}", 'info')
                        else:
                            fb_error = stderr.decode('utf-8', errors='replace')
                            self.log(f"Fallback also failed for @{username}: {fb_error}", 'warning')
                            return []
                    else:
                        self.log(f"Could not resolve channel_id for @{username}", 'warning')
                        return []
                else:
                    self.log(f"Failed to list TikTok videos: {error}", 'warning')
                    return []

            lines = stdout.decode('utf-8', errors='replace').strip().split('\n')

            # Parse since_date for filtering
            cutoff_str = None
            if since_date:
                try:
                    if 'T' in since_date:
                        # Strip trailing timezone markers before parsing.
                        cutoff_dt = datetime.fromisoformat(since_date.replace('Z', '+00:00').replace('+00:00', ''))
                    else:
                        cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
                    # yt-dlp reports dates as YYYYMMDD; compare in that form.
                    cutoff_str = cutoff_dt.strftime('%Y%m%d')
                except (ValueError, IndexError):
                    pass

            videos = []
            for line in lines:
                if not line.strip():
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue

                video_id = str(data.get('id', ''))
                if not video_id:
                    continue

                upload_date = data.get('upload_date', '')
                title = data.get('title', '')
                description = data.get('description', '')

                # Skip posts where yt-dlp returned no metadata at all
                # When cookies are expired, yt-dlp returns no date, no title,
                # and no description. Real posts with empty captions still have
                # upload_date, so we use that as the key signal.
                if not upload_date and not title and not description:
                    self.log(f"Skipping TikTok {video_id}: no metadata (cookies may be expired)", 'debug')
                    continue

                title = title or description or f"TikTok video #{video_id}"
                description = description or title

                # Filter by date if cutoff specified
                if cutoff_str and upload_date and upload_date < cutoff_str:
                    continue

                # Format upload_date to ISO
                formatted_date = None
                if upload_date and len(upload_date) == 8 and upload_date.isdigit():
                    formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"

                video_url = data.get('url') or f"https://www.tiktok.com/@{username}/video/{video_id}"

                videos.append({
                    'video_id': video_id,
                    'title': title,
                    'description': description,
                    'upload_date': formatted_date,
                    'url': video_url,
                    'username': username,
                })

                if progress_callback:
                    progress_callback(len(videos))

                if max_videos and len(videos) >= max_videos:
                    break

            self.log(f"Found {len(videos)} TikTok videos for @{username}", 'info')
            # Persist any tokens yt-dlp refreshed in the cookies file.
            self._save_cookies_back()
            return videos

        except Exception as e:
            self.log(f"Error getting TikTok videos: {e}", 'error')
            self._save_cookies_back()
            return []
|
||||
|
||||
    async def download_video(self, video_url: str, output_dir: Path, username: str = '') -> Dict:
        """
        Download a TikTok video/carousel using gallery-dl.

        gallery-dl handles both regular videos and carousel/slideshow posts.
        Returns dict with success status and list of downloaded files.

        Args:
            video_url: Direct URL of the TikTok post.
            output_dir: Directory to download into (created if missing).
            username: Unused here; accepted for interface symmetry with
                other clients.

        Returns:
            On success: {'success': True, 'file_path', 'filename',
            'file_size', 'all_files', 'file_count', 'is_carousel'}.
            On failure: {'success': False, 'error': <message>}.
        """
        if not self.gallery_dl_path:
            return {'success': False, 'error': 'gallery-dl not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            cmd = [
                self.gallery_dl_path,
                '--write-metadata',
                '-D', str(output_dir),
                '-f', '{id}_{num}.{extension}',
            ]

            # Add cookies for age-restricted / login-required content
            cookies_file = self._get_cookies_file()
            if cookies_file:
                cmd.extend(['--cookies', cookies_file])

            cmd.append(video_url)

            self.log(f"Downloading TikTok: {video_url}", 'debug')

            # Snapshot existing files before download so we only pick up new ones
            existing_files = set(f.name for f in output_dir.iterdir() if f.is_file())

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            # Find newly downloaded files (exclude .json metadata and audio-only files)
            downloaded_files = []
            for f in output_dir.iterdir():
                if f.is_file() and f.name not in existing_files and f.suffix.lower() not in ('.json',):
                    # Skip audio-only files
                    if f.suffix.lower() in ('.mp3', '.m4a', '.aac', '.wav', '.ogg'):
                        continue
                    downloaded_files.append(f)

            if result.returncode != 0:
                # gallery-dl exit code 4 = partial failure (e.g. slideshow images OK but audio failed)
                # If we got media files, treat as success
                if downloaded_files:
                    self.log(f"gallery-dl partial failure (code {result.returncode}) but {len(downloaded_files)} files downloaded", 'debug')
                else:
                    error_msg = stderr.decode('utf-8', errors='replace').strip()
                    # Normalize the most common failure into a friendly message.
                    if 'not available' in error_msg.lower() or '404' in error_msg:
                        error_msg = 'Video not available (deleted or private)'
                    elif len(error_msg) > 200:
                        error_msg = error_msg[:200] + '...'
                    return {'success': False, 'error': error_msg}

            if not downloaded_files:
                return {'success': False, 'error': 'No files downloaded'}

            # Sort by name to maintain carousel order (e.g. id_1.jpg, id_2.jpg)
            downloaded_files.sort(key=lambda f: f.name)
            primary_file = downloaded_files[0]

            # Determine if this is a photo carousel (multiple images)
            image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
            is_carousel = len(downloaded_files) > 1 and all(
                f.suffix.lower() in image_exts for f in downloaded_files
            )

            # Persist any tokens gallery-dl refreshed in the cookies file.
            self._save_cookies_back()
            return {
                'success': True,
                'file_path': str(primary_file),
                'filename': primary_file.name,
                'file_size': primary_file.stat().st_size,
                'all_files': [str(f) for f in downloaded_files],
                'file_count': len(downloaded_files),
                'is_carousel': is_carousel,
            }

        except Exception as e:
            self.log(f"Error downloading TikTok video: {e}", 'error')
            self._save_cookies_back()
            return {'success': False, 'error': str(e)}
|
||||
|
||||
async def get_creator(self, url: str) -> Optional[Creator]:
|
||||
"""Get Creator object from URL"""
|
||||
info = await self.get_creator_info(url)
|
||||
if not info:
|
||||
return None
|
||||
|
||||
username = info.get('creator_id', '')
|
||||
|
||||
return Creator(
|
||||
creator_id=username,
|
||||
service_id='tiktok',
|
||||
platform='tiktok',
|
||||
username=info.get('creator_name', username),
|
||||
display_name=info.get('creator_name'),
|
||||
profile_image_url=info.get('profile_image_url'),
|
||||
bio=info.get('bio'),
|
||||
)
|
||||
|
||||
async def get_posts(self, url: str, since_date: str = None,
|
||||
max_videos: int = None, progress_callback=None) -> List[Post]:
|
||||
"""Get TikTok videos as Post objects"""
|
||||
videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
|
||||
|
||||
username = self.extract_username(url) or ''
|
||||
|
||||
posts = []
|
||||
for video in videos:
|
||||
# Each TikTok post could be video or carousel
|
||||
# We create a single attachment for now; the actual download determines type
|
||||
attachment = Attachment(
|
||||
name=f"{video['video_id']}.mp4",
|
||||
file_type='video',
|
||||
extension='.mp4',
|
||||
server_path=video['url'],
|
||||
download_url=video['url'],
|
||||
)
|
||||
|
||||
post = Post(
|
||||
post_id=video['video_id'],
|
||||
service_id='tiktok',
|
||||
platform='tiktok',
|
||||
creator_id=username,
|
||||
title=None,
|
||||
content=video.get('description') or video.get('title', ''),
|
||||
published_at=video.get('upload_date'),
|
||||
attachments=[attachment],
|
||||
)
|
||||
posts.append(post)
|
||||
|
||||
return posts
|
||||
Reference in New Issue
Block a user