#!/usr/bin/env python3
"""
TikTok Download Module - Downloads TikTok videos with proper timestamp extraction
"""

import os
import re
import json
import subprocess
import sqlite3
from pathlib import Path
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Tuple

from modules.base_module import LoggingMixin


class TikTokDownloader(LoggingMixin):
    """Downloads TikTok videos and extracts metadata including timestamps.

    Uses a hybrid strategy for profiles: yt-dlp (fast) to enumerate video IDs,
    then gallery-dl per video (handles photo carousels). Timestamps are pulled
    from gallery-dl/yt-dlp metadata and applied to files via os.utime.
    """

    # Ordered fallback chain of yt-dlp Unix-timestamp fields.
    # Each entry is (info-dict key, label used in the debug log message).
    _TIMESTAMP_FIELDS = (
        ('timestamp', 'full timestamp'),
        ('release_timestamp', 'release timestamp'),
        ('modified_timestamp', 'modified timestamp'),
    )

    def __init__(self, base_path: Path = None, log_callback=None,
                 use_database=True, unified_db=None):
        """
        Initialize TikTok downloader

        Args:
            base_path: Base path for downloads
            log_callback: Optional callback for logging (tag, level, message)
            use_database: Whether to use database for tracking downloads
            unified_db: UnifiedDatabase instance (required)

        Raises:
            ValueError: If unified_db is not provided (standalone DB removed).
        """
        # Initialize logging via mixin
        self._init_logger('TikTok', log_callback, default_module='Download')
        self.base_path = Path(base_path) if base_path else Path.cwd()
        self.file_timestamps: Dict[str, datetime] = {}  # Map of filename -> datetime
        self.use_database = use_database
        # Whether _record_download should defer DB writes; set per-run by
        # download_profile but initialized here so the attribute always exists.
        self.defer_database = False

        # Always use unified database adapter
        if not unified_db:
            raise ValueError("TikTok module requires unified_db - standalone database is no longer supported")
        from modules.tiktok_db_adapter import TikTokDatabaseAdapter
        self.db = TikTokDatabaseAdapter(unified_db)
        self.use_unified_db = True

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        self.pending_downloads: List[Dict] = []  # Track downloads for deferred database recording

    def _is_already_downloaded(self, video_id: str, username: str = None) -> bool:
        """Check if a video has already been downloaded."""
        if not self.use_database:
            return False
        # Pass username for proper database lookup
        if username:
            return self.db.is_downloaded(video_id, username)
        return self.db.is_already_downloaded(video_id)

    def _record_download(self, video_id: str, username: str, filename: str,
                         post_date: Optional[datetime] = None,
                         metadata: Dict = None, deferred: bool = False):
        """Record a successful download in the database.

        Args:
            video_id: TikTok post ID.
            username: Profile the post belongs to.
            filename: Path (or name) of the downloaded media file.
            post_date: Original post datetime, if known.
            metadata: Optional metadata dict (title/description).
            deferred: If True, don't record to database now - add to
                      pending_downloads list for later recording after
                      file move is complete.
        """
        # Extract just the filename from the full path for database
        file_path = str(filename)            # Full path
        filename_only = Path(filename).name  # Just the filename

        # If deferred, store for later recording instead of recording now
        if deferred:
            self.pending_downloads.append({
                'video_id': video_id,
                'username': username,
                'filename': filename_only,
                'post_date': post_date.isoformat() if post_date else None,
                'file_path': file_path,
                'metadata': metadata,
            })
            self.log(f"Deferred recording for {video_id}", "debug")
            return True

        if not self.use_database:
            return
        return self.db.record_download(
            video_id=video_id,
            username=username,
            filename=filename_only,
            post_date=post_date,
            metadata=metadata,
            file_path=file_path,
        )

    def get_pending_downloads(self):
        """Get list of downloads that were deferred for later recording."""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded."""
        self.pending_downloads = []

    def extract_date_from_info(self, info_dict: Dict) -> Optional[datetime]:
        """
        Extract upload date from yt-dlp info dictionary.

        Args:
            info_dict: yt-dlp info dictionary

        Returns:
            datetime object or None
        """
        # Try the Unix-timestamp fields in priority order; each has full
        # date AND time. TikTok provides UTC timestamps.
        # NOTE(review): replace(tzinfo=None) yields a *naive UTC* datetime,
        # not local time, despite the original comment — behavior preserved.
        for key, label in self._TIMESTAMP_FIELDS:
            value = info_dict.get(key)
            if value:
                try:
                    dt_utc = datetime.fromtimestamp(value, tz=timezone.utc)
                    dt = dt_utc.replace(tzinfo=None)  # Strip tzinfo (naive UTC)
                    self.log(f"Extracted {label} (UTC): {dt}", "debug")
                    return dt
                except Exception:
                    pass  # best-effort: fall through to the next field

        # Fall back to upload_date (YYYYMMDD format - only has date, no time)
        # This should be last resort as it loses time information
        upload_date = info_dict.get('upload_date')
        if upload_date and len(upload_date) == 8:
            try:
                dt = datetime.strptime(upload_date, '%Y%m%d')
                self.log(f"Only date available (no time): {dt.date()}", "warning")
                return dt
            except Exception:
                pass
        return None

    def _fetch_video_ids(self, username: str, number_of_days: int,
                         full_profile: bool) -> Optional[List[str]]:
        """List video IDs for a profile via yt-dlp's flat playlist mode.

        Returns the (optionally date-filtered) ID list, or None on failure.
        """
        profile_url = f"https://www.tiktok.com/@{username}"
        list_cmd = [
            "yt-dlp",
            "--flat-playlist",                     # Don't download, just list
            "--print", "%(upload_date)s %(id)s",   # Print date and ID
            "--quiet",
            "--no-warnings",
            profile_url,
        ]
        self.log("Getting video list with yt-dlp...", "debug")
        try:
            result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=60)
            lines = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()]

            video_ids: List[str] = []
            if not full_profile and number_of_days:
                # String comparison of YYYYMMDD dates is chronologically correct.
                cutoff_date = datetime.now() - timedelta(days=number_of_days)
                cutoff_str = cutoff_date.strftime('%Y%m%d')
                for line in lines:
                    parts = line.split()
                    if len(parts) >= 2:
                        upload_date, video_id = parts[0], parts[1]
                        # Only include videos after cutoff date
                        if upload_date >= cutoff_str:
                            video_ids.append(video_id)
            else:
                # No filter, take all
                video_ids = [line.split()[1] for line in lines if len(line.split()) >= 2]

            self.log(f"Found {len(video_ids)} posts to download", "info")
            return video_ids
        except Exception as e:
            self.log(f"Failed to get video list: {e}", "error")
            return None

    def _cleanup_downloaded_files(self, output_dir: Path):
        """Remove audio-only files and truncate over-long filenames in place."""
        for file in output_dir.glob("*"):
            if not file.is_file() or file.suffix == '.json':
                continue

            # Remove audio-only files (.mp3, .m4a, .aac, ...)
            if file.suffix.lower() in ['.mp3', '.m4a', '.aac', '.wav', '.ogg']:
                self.log(f"Removing audio-only file: {file.name}", "debug")
                file.unlink()
                # Also remove corresponding JSON (gallery-dl names it name.ext.json)
                json_file = file.with_suffix(file.suffix + '.json')
                if json_file.exists():
                    json_file.unlink()
                continue

            # Truncate long filenames (max 255 chars for Linux); 200 leaves margin
            if len(file.name) > 200:
                # Parse filename: YYYYMMDD_description_ID_NUM.ext
                parts = file.name.rsplit('_', 2)  # Split from right to preserve ID and num
                if len(parts) == 3:
                    date_and_desc, video_id, num_and_ext = parts
                    date_part = date_and_desc[:8]   # YYYYMMDD
                    desc_part = date_and_desc[9:]   # Everything after date_
                    # Max description length; format: DATE_DESC_ID_NUM.EXT
                    fixed_length = len(date_part) + len(video_id) + len(num_and_ext) + 3  # 3 underscores
                    max_desc_len = 200 - fixed_length
                    if len(desc_part) > max_desc_len:
                        truncated_desc = desc_part[:max_desc_len - 3] + "..."
                        new_name = f"{date_part}_{truncated_desc}_{video_id}_{num_and_ext}"
                        new_path = file.parent / new_name
                        self.log(f"Truncating long filename: {file.name[:50]}... -> {new_name[:50]}...", "debug")
                        file.rename(new_path)
                        # Rename corresponding JSON file too
                        json_file = Path(str(file) + '.json')
                        if json_file.exists():
                            json_file.rename(Path(str(new_path) + '.json'))

    def _process_metadata_files(self, output_dir: Path,
                                username: str) -> Tuple[Dict[str, datetime], List[Path]]:
        """Read each *.json sidecar, extract timestamps, record downloads.

        Returns (file_timestamps dict, list of downloaded media Paths).
        JSON sidecars are deleted as they are consumed.
        """
        downloaded_files: List[Path] = []
        file_timestamps: Dict[str, datetime] = {}
        processed_ids = set()  # IDs found already-downloaded / duplicate this run
        started_ids = set()    # IDs we've started processing in THIS run

        for json_file in output_dir.glob("*.json"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    info = json.load(f)

                video_id = info.get('id', '')

                # Extract timestamp from gallery-dl's createTime field
                timestamp = None
                create_time = info.get('createTime')
                if create_time:
                    try:
                        timestamp = datetime.fromtimestamp(int(create_time))
                        self.log(f"Extracted timestamp {timestamp} from createTime", "debug")
                    except Exception:
                        # Fall back to yt-dlp-style fields if createTime is unusable
                        timestamp = self.extract_date_from_info(info)

                # gallery-dl names JSON files as: filename.ext.json
                media_file = Path(str(json_file)[:-5])  # Strip trailing ".json"
                if not media_file.exists():
                    self.log(f"Media file not found for {json_file.name}", "warning")
                    json_file.unlink()
                    continue
                video_file = media_file

                # Check "already downloaded" only ONCE per video_id per run
                # (don't re-check for carousel photos #2, #3 after photo #1)
                if video_id and video_id not in started_ids:
                    if self._is_already_downloaded(video_id, username):
                        self.log(f"Skipping already downloaded post: {video_id}", "debug")
                        processed_ids.add(video_id)
                        # Just remove JSON file, keep media files
                        json_file.unlink()
                        continue
                    started_ids.add(video_id)

                # Skip remaining files of an ID that was marked processed
                if video_id in processed_ids:
                    json_file.unlink()
                    continue

                # ALWAYS list the file and apply its timestamp (carousel photos too)
                downloaded_files.append(video_file)
                if timestamp:
                    file_timestamps[video_file.name] = timestamp
                    self.log(f"Extracted timestamp {timestamp} for {video_file.name}", "debug")

                # Hash-based duplicate check (blacklist persists even if original deleted)
                file_hash = self.db.get_file_hash(str(video_file)) if self.db else None
                if file_hash:
                    existing = self.db.get_download_by_file_hash(file_hash)
                    if existing and existing.get('file_path') and str(video_file) != existing.get('file_path'):
                        self.log(f"⚠ Duplicate content detected (hash match): {video_file.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                        try:
                            video_file.unlink()
                            self.log(f"Deleted duplicate (hash blacklist): {video_file.name}", "debug")
                            processed_ids.add(video_id)
                            json_file.unlink()
                            continue
                        except Exception as e:
                            # Deletion failed: fall through and record it anyway
                            self.log(f"Failed to delete duplicate {video_file.name}: {e}", "warning")

                # Record in database (each file gets its own entry, even carousels)
                if video_id:
                    self._record_download(
                        video_id=video_id,
                        username=username,
                        filename=video_file.name,
                        post_date=timestamp,
                        metadata={"title": info.get('desc', ''), "description": info.get('desc', '')},
                        deferred=self.defer_database,
                    )

                json_file.unlink()
            except Exception as e:
                self.log(f"Failed to process {json_file}: {e}", "error")

        return file_timestamps, downloaded_files

    def _apply_file_timestamps(self, downloaded_files: List[Path],
                               file_timestamps: Dict[str, datetime]):
        """Set each file's atime/mtime to its extracted post timestamp."""
        for file_path in downloaded_files:
            filename = file_path.name
            if filename not in file_timestamps:
                continue
            timestamp = file_timestamps[filename]
            try:
                unix_time = timestamp.timestamp()
                # Set both access time and modification time
                os.utime(str(file_path), (unix_time, unix_time))
                self.log(f"Applied timestamp {timestamp} to {filename}", "debug")
            except Exception as e:
                self.log(f"Failed to apply timestamp to {filename}: {e}", "warning")

    def download_profile(self, username: str, number_of_days: int = 7,
                         full_profile: bool = False, output_dir: Path = None,
                         defer_database: bool = False) -> Tuple[Dict[str, datetime], List[Path]]:
        """
        Download TikTok profile videos.

        Args:
            username: TikTok username (without @)
            number_of_days: Number of days to download (ignored if full_profile=True)
            full_profile: If True, download entire profile
            output_dir: Output directory (uses base_path/username if not specified)
            defer_database: If True, don't record to database immediately -
                            store in pending_downloads for later recording
                            after file move is complete

        Returns:
            Tuple of (file_timestamps dict, list of downloaded files)
        """
        self.defer_database = defer_database  # Read by _record_download
        username = username.lstrip('@')
        output_dir = output_dir or self.base_path / username
        output_dir.mkdir(parents=True, exist_ok=True)

        self.log(f"Downloading TikTok profile: @{username}", "info")
        self.activity_manager.update_status("Checking videos")

        # HYBRID APPROACH: yt-dlp for the ID list (fast), gallery-dl per video
        # (handles carousels).
        # Step 1: quickly get list of video IDs with dates
        video_ids = self._fetch_video_ids(username, number_of_days, full_profile)
        if video_ids is None:
            return {}, []
        if not video_ids:
            self.log("No videos found matching criteria", "info")
            return {}, []

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading videos",
            progress_current=0,
            progress_total=len(video_ids),
        )

        # Crash recovery checkpoint
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'tiktok:{username}', 'scraping')
        checkpoint.start(total_items=len(video_ids))
        if checkpoint.is_recovering():
            self.log(f"TikTok @{username}: recovering — skipping already-downloaded videos", "info")

        # Step 2: download each video individually with gallery-dl
        for i, video_id in enumerate(video_ids, 1):
            # Update progress at start of each iteration (fires even on skips)
            self.activity_manager.update_status(
                "Downloading videos",
                progress_current=i,
                progress_total=len(video_ids),
            )

            # Skip if already completed in a previous crashed run
            if checkpoint.is_completed(video_id):
                continue
            checkpoint.set_current(video_id)

            # Skip if already downloaded
            if self._is_already_downloaded(video_id, username):
                self.log(f"[{i}/{len(video_ids)}] Skipping already downloaded: {video_id}", "debug")
                checkpoint.mark_completed(video_id)
                continue

            video_url = f"https://www.tiktok.com/@{username}/video/{video_id}"
            self.log(f"[{i}/{len(video_ids)}] Downloading {video_id}", "debug")
            cmd = [
                "gallery-dl",
                "--write-metadata",
                "-D", str(output_dir),
                "-f", "{date:%Y%m%d}_{desc}_{id}_{num}.{extension}",
                video_url,
            ]
            try:
                self.log(f"Calling gallery-dl for {video_id}", "debug")
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
                self.log(f"gallery-dl returned: code={result.returncode}, stdout lines={len(result.stdout.splitlines()) if result.stdout else 0}", "debug")
                if result.returncode != 0 and result.stderr:
                    stderr = result.stderr
                    if "not available" in stderr.lower() or "404" in stderr:
                        self.log(f"Video {video_id} not available (deleted or private)", "warning")
                    else:
                        self.log(f"Failed to download {video_id}: {stderr[:100]}", "warning")
            except subprocess.TimeoutExpired:
                self.log(f"Timeout downloading {video_id}", "warning")
            except Exception as e:
                self.log(f"Error downloading {video_id}: {e}", "warning")
            # NOTE: marked completed even on failure so crash recovery does
            # not retry it forever — preserved from original behavior.
            checkpoint.mark_completed(video_id)

        checkpoint.finish()

        # Post-process: drop audio-only files, truncate over-long names
        self._cleanup_downloaded_files(output_dir)

        # Process downloaded files and extract timestamps from JSON sidecars
        file_timestamps, downloaded_files = self._process_metadata_files(output_dir, username)

        self.log(f"Downloaded {len(downloaded_files)} files from @{username}", "info")

        # Apply timestamps to files
        self._apply_file_timestamps(downloaded_files, file_timestamps)

        # Store timestamps for later use
        self.file_timestamps.update(file_timestamps)
        return file_timestamps, downloaded_files

    def download_video(self, url: str, output_dir: Path = None) -> Tuple[Optional[datetime], Optional[Path]]:
        """
        Download a single TikTok video.

        Args:
            url: TikTok video URL
            output_dir: Output directory

        Returns:
            Tuple of (timestamp, downloaded file path)
        """
        output_dir = output_dir or self.base_path
        output_dir.mkdir(parents=True, exist_ok=True)
        self.log(f"Downloading video: {url}", "info")

        # First, get video info without downloading
        cmd_info = ["yt-dlp", "--dump-json", "--no-warnings", "--quiet", url]
        try:
            # NOTE(review): no timeout here — a hung yt-dlp blocks indefinitely.
            result = subprocess.run(cmd_info, capture_output=True, text=True)
            if result.returncode != 0:
                self.log(f"Failed to get video info: {result.stderr}", "error")
                return None, None

            info = json.loads(result.stdout)
            timestamp = self.extract_date_from_info(info)

            # Check if this is a photo post (no video, only audio)
            formats = info.get('formats', [])
            has_video = any(f.get('vcodec') != 'none' for f in formats)
            if not has_video and len(formats) > 0:
                # This is a photo/image post - skip it
                self.log("Skipping TikTok photo post (only videos are downloaded)", "info")
                return timestamp, None

            # Download video
            output_template = str(output_dir / "%(upload_date)s_%(title)s_%(id)s.%(ext)s")
            cmd_download = [
                "yt-dlp",
                "--format", "best",  # Explicitly request best video+audio format
                "--no-warnings",
                "--quiet",
                "-o", output_template,
                url,
            ]
            result = subprocess.run(cmd_download, capture_output=True, text=True)
            if result.returncode != 0:
                self.log(f"Failed to download video: {result.stderr}", "error")
                return timestamp, None

            # Find the downloaded file by substituting the template fields
            expected_name = output_template.replace('%(upload_date)s', info.get('upload_date', 'unknown'))
            expected_name = expected_name.replace('%(title)s', info.get('title', 'video'))
            expected_name = expected_name.replace('%(id)s', info.get('id', ''))
            expected_name = expected_name.replace('%(ext)s', info.get('ext', 'mp4'))
            downloaded_file = Path(expected_name)

            if not downloaded_file.exists():
                # Try to find it by pattern (title may have been sanitized)
                pattern = f"*{info.get('id', '')}*.mp4"
                matches = list(output_dir.glob(pattern))
                if matches:
                    downloaded_file = matches[0]

            if downloaded_file.exists():
                if timestamp:
                    self.file_timestamps[downloaded_file.name] = timestamp
                return timestamp, downloaded_file
            return timestamp, None
        except Exception as e:
            self.log(f"Failed to download video: {e}", "error")
            return None, None

    def get_file_timestamps(self) -> Dict[str, datetime]:
        """Get the collected file timestamps."""
        return self.file_timestamps.copy()

    def clear_timestamps(self):
        """Clear the stored timestamps."""
        self.file_timestamps.clear()


def download_tiktok_profile(username: str, days: int = 7, base_path: Path = None,
                            log_callback=None, unified_db=None) -> Dict[str, datetime]:
    """
    Simple function interface for downloading TikTok profile.

    Args:
        username: TikTok username
        days: Number of days to download
        base_path: Base download path
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance (required)

    Returns:
        Dictionary mapping filenames to timestamps

    Raises:
        ValueError: If unified_db is not provided.
    """
    if not unified_db:
        raise ValueError("unified_db is required for TikTok downloads")
    downloader = TikTokDownloader(base_path=base_path, log_callback=log_callback,
                                  unified_db=unified_db)
    timestamps, files = downloader.download_profile(username, number_of_days=days)
    return timestamps


if __name__ == "__main__":
    # Test the module
    import tempfile

    print("TikTok Downloader Module Test")
    print("=" * 60)

    with tempfile.TemporaryDirectory() as tmpdir:
        # FIX: constructing without unified_db raises ValueError by design,
        # so the demo must not instantiate the class bare.
        try:
            downloader = TikTokDownloader(base_path=Path(tmpdir))
        except ValueError as e:
            print(f"Note: {e}")
        # You can test with a real TikTok username and a unified_db instance:
        # timestamps, files = downloader.download_profile("username", number_of_days=1)
        print("Module ready for integration")