Files
media-downloader/modules/tiktok_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

603 lines
26 KiB
Python
Executable File

#!/usr/bin/env python3
"""
TikTok Download Module - Downloads TikTok videos with proper timestamp extraction
"""
import os
import re
import json
import subprocess
import sqlite3
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from modules.base_module import LoggingMixin
class TikTokDownloader(LoggingMixin):
    """Downloads TikTok videos and extracts metadata including timestamps"""

    def __init__(self, base_path: Optional[Path] = None, log_callback=None,
                 use_database: bool = True, unified_db=None):
        """
        Initialize TikTok downloader

        Args:
            base_path: Base path for downloads (defaults to the current working directory)
            log_callback: Optional callback for logging (tag, level, message)
            use_database: Whether to use database for tracking downloads
            unified_db: UnifiedDatabase instance (required)

        Raises:
            ValueError: If unified_db is not provided — standalone databases
                are no longer supported.
        """
        # Initialize logging via mixin
        self._init_logger('TikTok', log_callback, default_module='Download')
        self.base_path = Path(base_path) if base_path else Path.cwd()
        self.file_timestamps = {}  # Map of filename -> datetime
        self.use_database = use_database
        # Always use unified database adapter; fail fast when it is missing
        # rather than silently falling back to a per-module SQLite file.
        if not unified_db:
            raise ValueError("TikTok module requires unified_db - standalone database is no longer supported")
        # NOTE(review): imported lazily — presumably to avoid import cycles
        # or optional-dependency issues at module load time; confirm.
        from modules.tiktok_db_adapter import TikTokDatabaseAdapter
        self.db = TikTokDatabaseAdapter(unified_db)
        self.use_unified_db = True
        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)
        self.pending_downloads = []  # Track downloads for deferred database recording
def _is_already_downloaded(self, video_id: str, username: str = None) -> bool:
    """Return True when this video was already downloaded before.

    A username enables the precise per-user lookup; without one we fall
    back to the global video-id check. With database tracking disabled,
    nothing is ever treated as a duplicate.
    """
    if not self.use_database:
        return False
    found = (self.db.is_downloaded(video_id, username)
             if username
             else self.db.is_already_downloaded(video_id))
    return found
def _record_download(self, video_id: str, username: str, filename: str,
                     post_date: Optional[datetime] = None, metadata: Dict = None,
                     deferred: bool = False):
    """Record a successful download, immediately or deferred.

    Args:
        video_id: Platform video id.
        username: Profile the video belongs to.
        filename: Full path (or bare name) of the downloaded file.
        post_date: Original post timestamp, if known.
        metadata: Extra metadata (e.g. title/description) to persist.
        deferred: If True, don't record to database now - append to the
            pending_downloads list for later recording after the file
            move is complete.

    Returns:
        True for a deferred record; otherwise the adapter's result, or
        None when database tracking is disabled.
    """
    # Keep both forms: the DB stores the bare filename, while the full
    # path is needed later for hashing/moving.
    # (Fixed: dropped a redundant function-local `from pathlib import Path`
    # that shadowed the module-level import.)
    file_path = str(filename)              # Full path
    filename_only = Path(filename).name    # Just the filename
    # If deferred, store for later recording instead of recording now
    if deferred:
        self.pending_downloads.append({
            'video_id': video_id,
            'username': username,
            'filename': filename_only,
            'post_date': post_date.isoformat() if post_date else None,
            'file_path': file_path,
            'metadata': metadata
        })
        self.log(f"Deferred recording for {video_id}", "debug")
        return True
    if not self.use_database:
        return
    return self.db.record_download(
        video_id=video_id,
        username=username,
        filename=filename_only,
        post_date=post_date,
        metadata=metadata,
        file_path=file_path
    )
def get_pending_downloads(self):
    """Return a shallow copy of the deferred download records."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Reset the deferred-download list once the records are persisted."""
    self.pending_downloads = []
def extract_date_from_info(self, info_dict: Dict) -> Optional[datetime]:
    """
    Extract upload date from yt-dlp info dictionary.

    Preference order: 'timestamp', 'release_timestamp',
    'modified_timestamp' (all Unix epoch seconds, treated as UTC), then
    the date-only 'upload_date' (YYYYMMDD) as a last resort since it
    loses the time of day.

    Args:
        info_dict: yt-dlp info dictionary

    Returns:
        Naive datetime (UTC wall-clock, tzinfo stripped) or None.
    """
    from datetime import timezone
    # The three epoch fields share identical conversion logic; try them
    # in order of reliability instead of repeating the branch three times.
    for key, label in (('timestamp', 'full timestamp'),
                       ('release_timestamp', 'release timestamp'),
                       ('modified_timestamp', 'modified timestamp')):
        epoch = info_dict.get(key)
        if not epoch:
            continue
        try:
            dt_utc = datetime.fromtimestamp(epoch, tz=timezone.utc)
            # Strip tzinfo so the result mixes cleanly with the naive
            # datetimes used elsewhere in this module.
            dt = dt_utc.replace(tzinfo=None)
            self.log(f"Extracted {label} (UTC): {dt}", "debug")
            return dt
        except Exception:
            continue
    # Fall back to upload_date (YYYYMMDD format - only has date, no time).
    upload_date = info_dict.get('upload_date')
    if upload_date and len(upload_date) == 8:
        try:
            dt = datetime.strptime(upload_date, '%Y%m%d')
            self.log(f"Only date available (no time): {dt.date()}", "warning")
            return dt
        except Exception:
            pass
    return None
def download_profile(self,
                     username: str,
                     number_of_days: int = 7,
                     full_profile: bool = False,
                     output_dir: Path = None,
                     defer_database: bool = False) -> Tuple[Dict[str, datetime], List[Path]]:
    """
    Download TikTok profile videos.

    Hybrid strategy: yt-dlp (--flat-playlist) quickly enumerates video ids
    with upload dates, then gallery-dl downloads each post individually so
    carousel photo posts are handled too.

    Args:
        username: TikTok username (without @)
        number_of_days: Number of days to download (ignored if full_profile=True)
        full_profile: If True, download entire profile
        output_dir: Output directory (uses base_path/username if not specified)
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete

    Returns:
        Tuple of (file_timestamps dict mapping filename -> datetime,
        list of downloaded file Paths)
    """
    self.defer_database = defer_database  # Store for use in _record_download
    username = username.lstrip('@')
    output_dir = output_dir or self.base_path / username
    output_dir.mkdir(parents=True, exist_ok=True)
    self.log(f"Downloading TikTok profile: @{username}", "info")
    self.activity_manager.update_status("Checking videos")

    # Step 1: Use yt-dlp to quickly get the list of video IDs with dates
    profile_url = f"https://www.tiktok.com/@{username}"
    list_cmd = [
        "yt-dlp",
        "--flat-playlist",                    # Don't download, just list
        "--print", "%(upload_date)s %(id)s",  # Print date and ID
        "--quiet",
        "--no-warnings",
        profile_url
    ]
    self.log("Getting video list with yt-dlp...", "debug")
    try:
        result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=60)
        lines = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()]
        # Parse and filter by date if needed
        video_ids = []
        if not full_profile and number_of_days:
            from datetime import timedelta
            cutoff_date = datetime.now() - timedelta(days=number_of_days)
            cutoff_str = cutoff_date.strftime('%Y%m%d')
            for line in lines:
                parts = line.split()
                if len(parts) >= 2:
                    upload_date, video_id = parts[0], parts[1]
                    # YYYYMMDD strings compare correctly as text
                    if upload_date >= cutoff_str:
                        video_ids.append(video_id)
        else:
            # No filter, take all
            video_ids = [line.split()[1] for line in lines if len(line.split()) >= 2]
        self.log(f"Found {len(video_ids)} posts to download", "info")
    except Exception as e:
        self.log(f"Failed to get video list: {e}", "error")
        return {}, []
    if not video_ids:
        self.log("No videos found matching criteria", "info")
        return {}, []

    # Set initial progress so dashboard shows 0/N immediately
    self.activity_manager.update_status(
        "Downloading videos",
        progress_current=0,
        progress_total=len(video_ids)
    )
    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'tiktok:{username}', 'scraping')
    checkpoint.start(total_items=len(video_ids))
    if checkpoint.is_recovering():
        self.log(f"TikTok @{username}: recovering — skipping already-downloaded videos", "info")

    # Step 2: Download each video individually with gallery-dl
    # (fast per video, and unlike yt-dlp it handles photo carousels)
    for i, video_id in enumerate(video_ids, 1):
        # Update progress at start of each iteration (fires even on skips)
        self.activity_manager.update_status(
            "Downloading videos",
            progress_current=i,
            progress_total=len(video_ids)
        )
        # Skip if already completed in a previous crashed run
        if checkpoint.is_completed(video_id):
            continue
        checkpoint.set_current(video_id)
        # Skip if already downloaded
        if self._is_already_downloaded(video_id, username):
            self.log(f"[{i}/{len(video_ids)}] Skipping already downloaded: {video_id}", "debug")
            checkpoint.mark_completed(video_id)
            continue
        video_url = f"https://www.tiktok.com/@{username}/video/{video_id}"
        self.log(f"[{i}/{len(video_ids)}] Downloading {video_id}", "debug")
        cmd = [
            "gallery-dl",
            "--write-metadata",
            "-D", str(output_dir),
            "-f", "{date:%Y%m%d}_{desc}_{id}_{num}.{extension}",
            video_url
        ]
        try:
            self.log(f"Calling gallery-dl for {video_id}", "debug")
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            self.log(f"gallery-dl returned: code={result.returncode}, stdout lines={len(result.stdout.splitlines()) if result.stdout else 0}", "debug")
            if result.returncode != 0 and result.stderr:
                stderr = result.stderr
                if "not available" in stderr.lower() or "404" in stderr:
                    self.log(f"Video {video_id} not available (deleted or private)", "warning")
                else:
                    self.log(f"Failed to download {video_id}: {stderr[:100]}", "warning")
        except subprocess.TimeoutExpired:
            self.log(f"Timeout downloading {video_id}", "warning")
        except Exception as e:
            self.log(f"Error downloading {video_id}: {e}", "warning")
        checkpoint.mark_completed(video_id)
    checkpoint.finish()

    # Post-process: Rename files with long descriptions and remove audio-only files
    for file in output_dir.glob("*"):
        if file.is_file() and not file.suffix == '.json':
            # Remove audio-only files (.mp3, .m4a, .aac, ...)
            if file.suffix.lower() in ['.mp3', '.m4a', '.aac', '.wav', '.ogg']:
                self.log(f"Removing audio-only file: {file.name}", "debug")
                file.unlink()
                # Also remove corresponding JSON (gallery-dl names it name.ext.json)
                json_file = file.with_suffix(file.suffix + '.json')
                if json_file.exists():
                    json_file.unlink()
                continue
            # Truncate long filenames (max 255 chars for Linux)
            if len(file.name) > 200:  # Leave some margin
                # Parse filename: YYYYMMDD_description_ID_NUM.ext
                parts = file.name.rsplit('_', 2)  # Split from right to preserve ID and num
                if len(parts) == 3:
                    date_and_desc, video_id, num_and_ext = parts
                    # Split date from description
                    date_part = date_and_desc[:8]   # YYYYMMDD
                    desc_part = date_and_desc[9:]   # Everything after date_
                    # Calculate max description length
                    # Format: DATE_DESC_ID_NUM.EXT
                    fixed_length = len(date_part) + len(video_id) + len(num_and_ext) + 3  # 3 underscores
                    max_desc_len = 200 - fixed_length
                    if len(desc_part) > max_desc_len:
                        truncated_desc = desc_part[:max_desc_len-3] + "..."
                        new_name = f"{date_part}_{truncated_desc}_{video_id}_{num_and_ext}"
                        new_path = file.parent / new_name
                        self.log(f"Truncating long filename: {file.name[:50]}... -> {new_name[:50]}...", "debug")
                        file.rename(new_path)
                        # Rename corresponding JSON file too (it still has the old stem)
                        json_file = Path(str(file) + '.json')
                        if json_file.exists():
                            new_json = Path(str(new_path) + '.json')
                            json_file.rename(new_json)

    # Process downloaded files and extract timestamps from JSON
    downloaded_files = []
    file_timestamps = {}
    processed_ids = set()  # IDs found already-downloaded in the DB this run
    started_ids = set()    # IDs we've started processing in THIS run
    for json_file in output_dir.glob("*.json"):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                info = json.load(f)
            # Get video ID
            video_id = info.get('id', '')
            # Extract timestamp from gallery-dl's createTime field (needed for all files)
            timestamp = None
            create_time = info.get('createTime')
            if create_time:
                try:
                    timestamp = datetime.fromtimestamp(int(create_time))
                    self.log(f"Extracted timestamp {timestamp} from createTime", "debug")
                except Exception:
                    # Fall back to old yt-dlp method if createTime not usable
                    timestamp = self.extract_date_from_info(info)
            # gallery-dl names JSON files as: filename.ext.json
            # So we need to remove the .json extension to get the media file
            media_file = Path(str(json_file)[:-5])  # Remove .json extension
            if not media_file.exists():
                self.log(f"Media file not found for {json_file.name}", "warning")
                json_file.unlink()
                continue
            video_file = media_file  # Use same variable name for compatibility
            # Check if already downloaded - but only check ONCE per video_id per run
            # (Don't check again for carousel photos #2, #3 after we've started #1)
            if video_id and video_id not in started_ids:
                if self._is_already_downloaded(video_id, username):
                    self.log(f"Skipping already downloaded post: {video_id}", "debug")
                    # Mark as processed so we don't check again for this ID's other files
                    processed_ids.add(video_id)
                    # Just remove JSON file, keep media files (they're already processed)
                    json_file.unlink()
                    continue
                # Mark that we've started processing this video_id
                started_ids.add(video_id)
            # Skip if this video_id was marked as already downloaded
            if video_id in processed_ids:
                json_file.unlink()
                continue
            # ALWAYS add file to downloaded list and apply timestamp (even for carousel photos #2, #3)
            downloaded_files.append(video_file)
            if timestamp:
                file_timestamps[video_file.name] = timestamp
                self.log(f"Extracted timestamp {timestamp} for {video_file.name}", "debug")
            # Check for duplicate hash before recording (hash blacklist persists even if original deleted)
            file_hash = self.db.get_file_hash(str(video_file)) if self.db else None
            if file_hash:
                existing = self.db.get_download_by_file_hash(file_hash)
                if existing and existing.get('file_path') and str(video_file) != existing.get('file_path'):
                    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                    self.log(f"⚠ Duplicate content detected (hash match): {video_file.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                    # Delete the duplicate regardless of whether original file still exists
                    try:
                        video_file.unlink()
                        self.log(f"Deleted duplicate (hash blacklist): {video_file.name}", "debug")
                        # BUGFIX: the file was already appended above - drop it
                        # from the results so callers don't try to timestamp or
                        # move a file that no longer exists.
                        downloaded_files.remove(video_file)
                        file_timestamps.pop(video_file.name, None)
                        # Mark as processed so we don't try to download again
                        processed_ids.add(video_id)
                        json_file.unlink()
                        continue
                    except Exception as e:
                        self.log(f"Failed to delete duplicate {video_file.name}: {e}", "warning")
            # Record in database (each file gets its own entry, even for carousels)
            if video_id:
                self._record_download(
                    video_id=video_id,
                    username=username,
                    filename=video_file.name,
                    post_date=timestamp,
                    metadata={"title": info.get('desc', ''), "description": info.get('desc', '')},
                    deferred=self.defer_database
                )
            # Remove JSON file after processing
            json_file.unlink()
        except Exception as e:
            self.log(f"Failed to process {json_file}: {e}", "error")
    self.log(f"Downloaded {len(downloaded_files)} files from @{username}", "info")

    # Apply timestamps to files (os is imported at module level)
    for file_path in downloaded_files:
        filename = file_path.name
        if filename in file_timestamps:
            timestamp = file_timestamps[filename]
            try:
                # Convert datetime to unix timestamp
                unix_time = timestamp.timestamp()
                # Set both access time and modification time
                # BUGFIX: log the actual filename instead of the literal "(unknown)"
                os.utime(str(file_path), (unix_time, unix_time))
                self.log(f"Applied timestamp {timestamp} to {filename}", "debug")
            except Exception as e:
                self.log(f"Failed to apply timestamp to {filename}: {e}", "warning")
    # Store timestamps for later use
    self.file_timestamps.update(file_timestamps)
    return file_timestamps, downloaded_files
def download_video(self, url: str, output_dir: Path = None) -> Tuple[Optional[datetime], Optional[Path]]:
    """
    Download a single TikTok video.

    Photo/image posts (no video stream) are deliberately skipped.

    Args:
        url: TikTok video URL
        output_dir: Output directory (defaults to base_path)

    Returns:
        Tuple of (timestamp or None, downloaded file Path or None)
    """
    output_dir = output_dir or self.base_path
    output_dir.mkdir(parents=True, exist_ok=True)
    self.log(f"Downloading video: {url}", "info")
    # First, get video info without downloading
    cmd_info = [
        "yt-dlp",
        "--dump-json",
        "--no-warnings",
        "--quiet",
        url
    ]
    try:
        # Bound the metadata probe so a hung yt-dlp can't stall the caller
        # forever (mirrors download_profile's timeouts); TimeoutExpired is
        # an Exception subclass, so the except below handles it.
        result = subprocess.run(cmd_info, capture_output=True, text=True, timeout=60)
        if result.returncode != 0:
            self.log(f"Failed to get video info: {result.stderr}", "error")
            return None, None
        info = json.loads(result.stdout)
        timestamp = self.extract_date_from_info(info)
        # Check if this is a photo post (no video, only audio)
        formats = info.get('formats', [])
        has_video = any(f.get('vcodec') != 'none' for f in formats)
        if not has_video and len(formats) > 0:
            # This is a photo/image post - skip it
            self.log("Skipping TikTok photo post (only videos are downloaded)", "info")
            return timestamp, None
        # Download video
        output_template = str(output_dir / "%(upload_date)s_%(title)s_%(id)s.%(ext)s")
        cmd_download = [
            "yt-dlp",
            "--format", "best",  # Explicitly request best video+audio format
            "--no-warnings",
            "--quiet",
            "-o", output_template,
            url
        ]
        # More generous timeout for the actual media transfer
        result = subprocess.run(cmd_download, capture_output=True, text=True, timeout=300)
        if result.returncode != 0:
            self.log(f"Failed to download video: {result.stderr}", "error")
            return timestamp, None
        # Find the downloaded file by substituting the template fields
        expected_name = output_template.replace('%(upload_date)s', info.get('upload_date', 'unknown'))
        expected_name = expected_name.replace('%(title)s', info.get('title', 'video'))
        expected_name = expected_name.replace('%(id)s', info.get('id', ''))
        expected_name = expected_name.replace('%(ext)s', info.get('ext', 'mp4'))
        downloaded_file = Path(expected_name)
        if not downloaded_file.exists():
            # yt-dlp may have sanitized the title; fall back to globbing by id
            pattern = f"*{info.get('id', '')}*.mp4"
            matches = list(output_dir.glob(pattern))
            if matches:
                downloaded_file = matches[0]
        if downloaded_file.exists():
            if timestamp:
                self.file_timestamps[downloaded_file.name] = timestamp
            return timestamp, downloaded_file
        return timestamp, None
    except Exception as e:
        self.log(f"Failed to download video: {e}", "error")
        return None, None
def get_file_timestamps(self) -> Dict[str, datetime]:
    """Return a copy of the filename -> datetime map collected so far."""
    return dict(self.file_timestamps)
def clear_timestamps(self):
    """Forget every collected timestamp (mutates the dict in place)."""
    self.file_timestamps.clear()
def download_tiktok_profile(username: str,
                            days: int = 7,
                            base_path: Path = None,
                            log_callback=None,
                            unified_db=None) -> Dict[str, datetime]:
    """
    Convenience wrapper around TikTokDownloader.download_profile.

    Args:
        username: TikTok username
        days: Number of days to download
        base_path: Base download path
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance (required)

    Returns:
        Dictionary mapping filenames to timestamps

    Raises:
        ValueError: When unified_db is missing.
    """
    # Guard clause: the downloader itself would also reject this, but
    # failing here gives the caller a clearer message.
    if not unified_db:
        raise ValueError("unified_db is required for TikTok downloads")
    downloader = TikTokDownloader(base_path=base_path,
                                  log_callback=log_callback,
                                  unified_db=unified_db)
    timestamps, _files = downloader.download_profile(username, number_of_days=days)
    return timestamps
if __name__ == "__main__":
    # Module smoke test. BUGFIX: the previous demo instantiated
    # TikTokDownloader without unified_db, which __init__ rejects with a
    # ValueError - so the "test" always crashed. Without a real
    # UnifiedDatabase we can only verify that the constructor guard fires.
    import tempfile

    print("TikTok Downloader Module Test")
    print("=" * 60)
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            TikTokDownloader(base_path=Path(tmpdir))
        except ValueError as e:
            # Expected: standalone databases are no longer supported
            print(f"Constructor guard OK: {e}")
        # With a real UnifiedDatabase instance you could exercise a download:
        # downloader = TikTokDownloader(base_path=Path(tmpdir), unified_db=db)
        # timestamps, files = downloader.download_profile("username", number_of_days=1)
    print("Module ready for integration")