Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions
--- a/modules/tiktok_module.py
+++ b/modules/tiktok_module.py
@@ -0,0 +1,603 @@
+#!/usr/bin/env python3
+"""
+TikTok Download Module - Downloads TikTok videos with proper timestamp extraction
+"""
+
+import os
+import re
+import json
+import subprocess
+import sqlite3
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Optional, Tuple
+from modules.base_module import LoggingMixin
+
+
+class TikTokDownloader(LoggingMixin):
+    """Downloads TikTok videos and extracts metadata including timestamps"""
+    
+    def __init__(self, base_path: Path = None, log_callback=None, use_database=True, unified_db=None):
+        """
+        Set up logging, the download root and the unified-database helpers.
+
+        Args:
+            base_path: Root directory for downloads (current working directory if omitted)
+            log_callback: Optional callback for logging (tag, level, message)
+            use_database: If False, duplicate checks and immediate recording are skipped
+            unified_db: UnifiedDatabase instance (required)
+
+        Raises:
+            ValueError: If unified_db is missing or falsy
+        """
+        self._init_logger('TikTok', log_callback, default_module='Download')
+
+        self.base_path = Path.cwd() if not base_path else Path(base_path)
+        self.use_database = use_database
+        self.file_timestamps = {}  # filename -> datetime
+
+        # Standalone mode was removed, so a unified database is mandatory
+        if not unified_db:
+            raise ValueError("TikTok module requires unified_db - standalone database is no longer supported")
+
+        # Project imports happen at construction time rather than at module import
+        from modules.tiktok_db_adapter import TikTokDatabaseAdapter
+        self.db = TikTokDatabaseAdapter(unified_db)
+        self.use_unified_db = True
+        from modules.activity_status import get_activity_manager
+        self.activity_manager = get_activity_manager(unified_db)
+        self.pending_downloads = []  # downloads waiting for deferred database recording
+
+    def _is_already_downloaded(self, video_id: str, username: str = None) -> bool:
+        """Return the database's answer on whether video_id was already downloaded"""
+        # With tracking disabled nothing ever counts as downloaded
+        if self.use_database:
+            if not username:
+                return self.db.is_already_downloaded(video_id)
+            # With a username, use the adapter lookup that takes both arguments
+            return self.db.is_downloaded(video_id, username)
+        return False
+    
+    def _record_download(self, video_id: str, username: str, filename: str,
+                        post_date: Optional[datetime] = None, metadata: Dict = None,
+                        deferred: bool = False):
+        """Record a successful download now, or queue it for later recording
+
+        Args:
+            video_id: ID of the downloaded post
+            username: Account the post belongs to
+            filename: Path of the downloaded file; only its last component is stored as the filename
+            post_date: Post date; queued entries keep it as an ISO-8601 string
+            metadata: Extra metadata passed through unchanged
+            deferred: If True, don't record to database now - add to pending_downloads list
+                     for later recording after file move is complete
+
+        Returns:
+            True when queued, None when database use is disabled, otherwise the adapter's result
+        """
+        full_path = str(filename)
+        base_name = Path(filename).name
+        # Queueing takes priority and happens even when use_database is False
+        if deferred:
+            iso_date = post_date.isoformat() if post_date else None
+            entry = {
+                'video_id': video_id,
+                'username': username,
+                'filename': base_name,
+                'post_date': iso_date,
+                'file_path': full_path,
+                'metadata': metadata,
+            }
+            self.pending_downloads.append(entry)
+            self.log(f"Deferred recording for {video_id}", "debug")
+            return True
+
+        if not self.use_database:
+            return None
+        return self.db.record_download(video_id=video_id, username=username, filename=base_name,
+                                       post_date=post_date, metadata=metadata, file_path=full_path)
+
+    def get_pending_downloads(self):
+        """Return a shallow copy of the downloads queued for deferred recording"""
+        return list(self.pending_downloads)
+
+    def clear_pending_downloads(self):
+        """Drop every queued download by rebinding pending_downloads to a new empty list"""
+        self.pending_downloads = []
+    
+    def extract_date_from_info(self, info_dict: Dict) -> Optional[datetime]:
+        """
+        Extract upload date from yt-dlp info dictionary
+
+        The Unix-timestamp fields are tried first because they carry a full date
+        and time: 'timestamp', then 'release_timestamp', then 'modified_timestamp'.
+        'upload_date' (YYYYMMDD) is the last resort since it has no time of day.
+        A field that is missing, falsy (None, 0, '') or cannot be converted is
+        skipped in favour of the next one; conversion failures are logged at
+        debug level.
+
+        Args:
+            info_dict: yt-dlp info dictionary
+
+        Returns:
+            Naive datetime object, or None when no field is usable. Values taken
+            from a Unix timestamp are UTC wall-clock time (tzinfo is stripped, no
+            conversion to local time); an 'upload_date' value is midnight of that day.
+        """
+        from datetime import timezone
+
+        # NOTE(review): the result is UTC wall-clock time, so a caller that treats
+        # it as local time is off by the UTC offset - confirm the intended convention.
+
+        # (info key, word used in the log message), in priority order
+        timestamp_fields = (
+            ('timestamp', 'full'),
+            ('release_timestamp', 'release'),
+            ('modified_timestamp', 'modified'),
+        )
+        for field, label in timestamp_fields:
+            raw_value = info_dict.get(field)
+            # Missing, None and 0 all count as "not provided"
+            if not raw_value:
+                continue
+
+            try:
+                dt_utc = datetime.fromtimestamp(raw_value, tz=timezone.utc)
+            except (TypeError, ValueError, OverflowError, OSError) as e:
+                # Wrong type or out-of-range value: say so, then try the next field
+                self.log(f"Ignoring unusable {field} {raw_value!r}: {e}", "debug")
+                continue
+            # Drop the tzinfo but keep the UTC clock reading
+            dt = dt_utc.replace(tzinfo=None)
+            self.log(f"Extracted {label} timestamp (UTC): {dt}", "debug")
+            return dt
+
+        # Fall back to upload_date (YYYYMMDD format - only has date, no time)
+        # This should be last resort as it loses time information
+        upload_date = info_dict.get('upload_date')
+        # Anything but an 8-character string is ignored; the isinstance() check
+        # keeps a numeric value from raising TypeError in len()
+        if isinstance(upload_date, str) and len(upload_date) == 8:
+            try:
+                dt = datetime.strptime(upload_date, '%Y%m%d')
+            except ValueError as e:
+                self.log(f"Ignoring unparseable upload_date {upload_date!r}: {e}", "debug")
+            else:
+                self.log(f"Only date available (no time): {dt.date()}", "warning")
+                return dt
+
+        return None
+    
+    def download_profile(self,
+                        username: str,
+                        number_of_days: int = 7,
+                        full_profile: bool = False,
+                        output_dir: Path = None,
+                        defer_database: bool = False) -> Tuple[Dict[str, datetime], List[Path]]:
+        """
+        Download TikTok profile posts and collect their timestamps
+
+        yt-dlp lists the post IDs, gallery-dl downloads each post, then the files are
+        cleaned up, checked for duplicates, recorded and stamped with the post time.
+
+        Args:
+            username: TikTok username (a leading @ is stripped)
+            number_of_days: Number of days to download (ignored if full_profile=True)
+            full_profile: If True, download entire profile
+            output_dir: Output directory, str or Path (uses base_path/username if not specified)
+            defer_database: If True, don't record to database immediately - store in
+                           pending_downloads for later recording after file move is complete
+
+        Returns:
+            Tuple of (file_timestamps dict, list of downloaded files); both are empty
+            when the video list cannot be fetched or contains nothing to download
+        """
+        self.defer_database = defer_database  # Read back when each file is recorded below
+        username = username.lstrip('@')
+        output_dir = Path(output_dir) if output_dir else self.base_path / username
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        self.log(f"Downloading TikTok profile: @{username}", "info")
+        self.activity_manager.update_status("Checking videos")
+
+        # HYBRID APPROACH: Use yt-dlp to get ID list (fast), then gallery-dl per video (handles carousels)
+        # Step 1: Use yt-dlp to quickly get list of video IDs with dates
+        profile_url = f"https://www.tiktok.com/@{username}"
+        list_cmd = [
+            "yt-dlp",
+            "--flat-playlist",  # Don't download, just list
+            "--print", "%(upload_date)s %(id)s",    # Print date and ID
+            "--quiet",
+            "--no-warnings",
+            profile_url
+        ]
+
+        self.log("Getting video list with yt-dlp...", "debug")
+
+        try:
+            result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=60)
+            if result.returncode != 0:
+                self.log(f"yt-dlp exited with code {result.returncode}: {result.stderr.strip()[:200]}", "warning")
+            lines = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()]
+
+            # Parse and filter by date if needed
+            video_ids = []
+            if not full_profile and number_of_days:
+                from datetime import timedelta
+                cutoff_date = datetime.now() - timedelta(days=number_of_days)
+                cutoff_str = cutoff_date.strftime('%Y%m%d')
+
+                for line in lines:
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        upload_date, video_id = parts[0], parts[1]
+                        # YYYYMMDD strings compare chronologically (a placeholder like "NA" also passes)
+                        if upload_date >= cutoff_str:
+                            video_ids.append(video_id)
+            else:
+                # No filter, take all
+                video_ids = [line.split()[1] for line in lines if len(line.split()) >= 2]
+
+            self.log(f"Found {len(video_ids)} posts to download", "info")
+        except Exception as e:
+            self.log(f"Failed to get video list: {e}", "error")
+            return {}, []
+
+        if not video_ids:
+            self.log("No videos found matching criteria", "info")
+            return {}, []
+
+        # Set initial progress so dashboard shows 0/N immediately
+        self.activity_manager.update_status(
+            "Downloading videos",
+            progress_current=0,
+            progress_total=len(video_ids)
+        )
+
+        # Crash recovery checkpoint
+        from modules.task_checkpoint import TaskCheckpoint
+        checkpoint = TaskCheckpoint(f'tiktok:{username}', 'scraping')
+        checkpoint.start(total_items=len(video_ids))
+        if checkpoint.is_recovering():
+            self.log(f"TikTok @{username}: recovering — skipping already-downloaded videos", "info")
+
+        # Step 2: Download each video individually with gallery-dl (fast per video, handles carousels)
+        for i, video_id in enumerate(video_ids, 1):
+            # Update progress at start of each iteration (fires even on skips)
+            self.activity_manager.update_status(
+                "Downloading videos",
+                progress_current=i,
+                progress_total=len(video_ids)
+            )
+
+            # Skip if already completed in a previous crashed run
+            if checkpoint.is_completed(video_id):
+                continue
+
+            checkpoint.set_current(video_id)
+
+            # Skip if already downloaded
+            if self._is_already_downloaded(video_id, username):
+                self.log(f"[{i}/{len(video_ids)}] Skipping already downloaded: {video_id}", "debug")
+                checkpoint.mark_completed(video_id)
+                continue
+
+            video_url = f"https://www.tiktok.com/@{username}/video/{video_id}"
+            self.log(f"[{i}/{len(video_ids)}] Downloading {video_id}", "debug")
+
+            cmd = [
+                "gallery-dl",
+                "--write-metadata",
+                "-D", str(output_dir),
+                "-f", "{date:%Y%m%d}_{desc}_{id}_{num}.{extension}",
+                video_url
+            ]
+
+            try:
+                self.log(f"Calling gallery-dl for {video_id}", "debug")
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+                self.log(f"gallery-dl returned: code={result.returncode}, stdout lines={len(result.stdout.splitlines()) if result.stdout else 0}", "debug")
+                if result.returncode != 0 and result.stderr:
+                    stderr = result.stderr
+                    if "not available" in stderr.lower() or "404" in stderr:
+                        self.log(f"Video {video_id} not available (deleted or private)", "warning")
+                    else:
+                        self.log(f"Failed to download {video_id}: {stderr[:100]}", "warning")
+            except subprocess.TimeoutExpired:
+                self.log(f"Timeout downloading {video_id}", "warning")
+            except Exception as e:
+                self.log(f"Error downloading {video_id}: {e}", "warning")
+            # NOTE(review): failed and timed-out downloads are marked completed as well - confirm
+            checkpoint.mark_completed(video_id)
+
+        checkpoint.finish()
+
+        # Post-process: Rename files with long descriptions and remove audio-only files
+        for file in output_dir.glob("*"):
+            if file.is_file() and not file.suffix == '.json':
+                # Remove audio-only files
+                if file.suffix.lower() in ['.mp3', '.m4a', '.aac', '.wav', '.ogg']:
+                    self.log(f"Removing audio-only file: {file.name}", "debug")
+                    file.unlink()
+                    # Also remove corresponding JSON
+                    json_file = file.with_suffix(file.suffix + '.json')
+                    if json_file.exists():
+                        json_file.unlink()
+                    continue
+
+                # Truncate long filenames (max 255 chars for Linux)
+                if len(file.name) > 200:  # Leave some margin
+                    # Parse filename: YYYYMMDD_description_ID_NUM.ext
+                    parts = file.name.rsplit('_', 2)  # Split from right to preserve ID and num
+                    if len(parts) == 3:
+                        date_and_desc, video_id, num_and_ext = parts
+                        # Split date from description
+                        date_part = date_and_desc[:8]  # YYYYMMDD
+                        desc_part = date_and_desc[9:]  # Everything after date_
+
+                        # Budget for the description in DATE_DESC_ID_NUM.EXT (3 underscores)
+                        fixed_length = len(date_part) + len(video_id) + len(num_and_ext) + 3
+                        max_desc_len = 200 - fixed_length
+
+                        if len(desc_part) > max_desc_len:
+                            truncated_desc = desc_part[:max_desc_len-3] + "..."
+                            new_name = f"{date_part}_{truncated_desc}_{video_id}_{num_and_ext}"
+                            new_path = file.parent / new_name
+
+                            self.log(f"Truncating long filename: {file.name[:50]}... -> {new_name[:50]}...", "debug")
+                            file.rename(new_path)
+
+                            # Rename corresponding JSON file too
+                            json_file = Path(str(file) + '.json')
+                            if json_file.exists():
+                                new_json = Path(str(new_path) + '.json')
+                                json_file.rename(new_json)
+
+        # Process downloaded files and extract timestamps from JSON
+        downloaded_files = []
+        file_timestamps = {}
+        processed_ids = set()  # IDs whose files are skipped (already in DB, or duplicate content)
+        started_ids = set()    # IDs that passed the DB check in THIS run
+
+        for json_file in output_dir.glob("*.json"):
+            try:
+                with open(json_file, 'r', encoding='utf-8') as f:
+                    info = json.load(f)
+
+                video_id = info.get('id', '')
+
+                # Prefer gallery-dl's createTime; use the yt-dlp style fields when it is
+                # missing or cannot be converted
+                timestamp = None
+                create_time = info.get('createTime')
+                if create_time:
+                    try:
+                        timestamp = datetime.fromtimestamp(int(create_time))
+                        self.log(f"Extracted timestamp {timestamp} from createTime", "debug")
+                    except Exception:
+                        timestamp = None
+                if timestamp is None:
+                    timestamp = self.extract_date_from_info(info)
+
+                # gallery-dl names metadata "<media file>.json", so dropping ".json" gives the media path
+                media_file = Path(str(json_file)[:-5])
+
+                if not media_file.exists():
+                    self.log(f"Media file not found for {json_file.name}", "warning")
+                    json_file.unlink()
+                    continue
+
+                video_file = media_file  # Use same variable name for compatibility
+
+                # Check if already downloaded - but only check ONCE per video_id per run
+                # (Don't check again for carousel photos #2, #3 after we've started processing #1)
+                if video_id and video_id not in started_ids:
+                    if self._is_already_downloaded(video_id, username):
+                        self.log(f"Skipping already downloaded post: {video_id}", "debug")
+                        # Remember the ID and drop only the JSON; the media file is left in place
+                        processed_ids.add(video_id)
+                        json_file.unlink()
+                        continue
+                    # Mark that we've started processing this video_id
+                    started_ids.add(video_id)
+
+                # Skip if this video_id was marked as already downloaded
+                if video_id in processed_ids:
+                    json_file.unlink()
+                    continue
+
+                # Check for duplicate hash first (hash blacklist persists even if original deleted)
+                file_hash = self.db.get_file_hash(str(video_file)) if self.db else None
+                if file_hash:
+                    existing = self.db.get_download_by_file_hash(file_hash)
+                    if existing and existing.get('file_path') and str(video_file) != existing.get('file_path'):
+                        # Same content was downloaded before: delete this copy whether or not the original still exists
+                        self.log(f"⚠ Duplicate content detected (hash match): {video_file.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
+                        try:
+                            video_file.unlink()
+                            self.log(f"Deleted duplicate (hash blacklist): {video_file.name}", "debug")
+                            # Mark as processed so the post's remaining files are skipped too
+                            processed_ids.add(video_id)
+                            json_file.unlink()
+                            continue
+                        except Exception as e:
+                            self.log(f"Failed to delete duplicate {video_file.name}: {e}", "warning")
+
+                # Only files that survived the duplicate check are reported and timestamped
+                # (this still covers carousel photos #2, #3)
+                downloaded_files.append(video_file)
+                if timestamp:
+                    file_timestamps[video_file.name] = timestamp
+                    self.log(f"Extracted timestamp {timestamp} for {video_file.name}", "debug")
+
+                # Record in database (each file gets its own entry, even for carousels)
+                if video_id:
+                    self._record_download(
+                        video_id=video_id,
+                        username=username,
+                        filename=video_file.name,  # NOTE(review): bare name, so the stored file_path has no directory - confirm
+                        post_date=timestamp,
+                        metadata={"title": info.get('desc', ''), "description": info.get('desc', '')},
+                        deferred=self.defer_database
+                    )
+
+                # Remove JSON file after processing
+                json_file.unlink()
+
+            except Exception as e:
+                self.log(f"Failed to process {json_file}: {e}", "error")
+
+        self.log(f"Downloaded {len(downloaded_files)} files from @{username}", "info")
+
+        # Apply timestamps to files
+        for file_path in downloaded_files:
+            filename = file_path.name
+            if filename in file_timestamps:
+                timestamp = file_timestamps[filename]
+                try:
+                    # Naive datetimes are read as local time; set atime and mtime alike
+                    unix_time = timestamp.timestamp()
+                    os.utime(str(file_path), (unix_time, unix_time))
+                    self.log(f"Applied timestamp {timestamp} to {filename}", "debug")
+                except Exception as e:
+                    self.log(f"Failed to apply timestamp to {filename}: {e}", "warning")
+
+        # Store timestamps for later use
+        self.file_timestamps.update(file_timestamps)
+
+        return file_timestamps, downloaded_files
+    
+    def download_video(self, url: str, output_dir: Path = None) -> Tuple[Optional[datetime], Optional[Path]]:
+        """
+        Download a single TikTok video with yt-dlp
+
+        The post is first inspected with --dump-json; photo posts (formats without
+        any video stream) are skipped. Nothing is checked or recorded in the database.
+
+        Args:
+            url: TikTok video URL
+            output_dir: Output directory, str or Path (uses base_path if not specified)
+
+        Returns:
+            Tuple of (timestamp, downloaded file path). The path is None when nothing
+            was downloaded or the file could not be located; both are None when the
+            metadata lookup fails or an unexpected error occurs.
+        """
+        output_dir = Path(output_dir) if output_dir else self.base_path
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        self.log(f"Downloading video: {url}", "info")
+
+        # First, get video info without downloading
+        cmd_info = ["yt-dlp", "--dump-json", "--no-warnings", "--quiet", url]
+
+        # NOTE(review): neither yt-dlp call has a timeout, unlike download_profile - confirm
+        try:
+            result = subprocess.run(cmd_info, capture_output=True, text=True)
+            if result.returncode != 0:
+                self.log(f"Failed to get video info: {result.stderr}", "error")
+                return None, None
+
+            info = json.loads(result.stdout)
+            timestamp = self.extract_date_from_info(info)
+
+            # Check if this is a photo post (no video, only audio)
+            formats = info.get('formats') or []  # tolerate an explicit null
+            has_video = any(f.get('vcodec') != 'none' for f in formats)
+
+            if not has_video and len(formats) > 0:
+                # This is a photo/image post - skip it
+                self.log("Skipping TikTok photo post (only videos are downloaded)", "info")
+                return timestamp, None
+
+            # Download video
+            output_template = str(output_dir / "%(upload_date)s_%(title)s_%(id)s.%(ext)s")
+            cmd_download = [
+                "yt-dlp",
+                "--format", "best",  # Explicitly request best video+audio format
+                "--no-warnings",
+                "--quiet",
+                "-o", output_template,
+                url
+            ]
+
+            result = subprocess.run(cmd_download, capture_output=True, text=True)
+            if result.returncode != 0:
+                self.log(f"Failed to download video: {result.stderr}", "error")
+                return timestamp, None
+
+            # Guess the output name; "or" also covers keys that are present but null
+            video_id = str(info.get('id') or '')
+            extension = str(info.get('ext') or 'mp4')
+            upload_date = info.get('upload_date') or 'unknown'
+            title = info.get('title') or 'video'
+            downloaded_file = output_dir / f"{upload_date}_{title}_{video_id}.{extension}"
+
+            if not downloaded_file.exists() and video_id:
+                # yt-dlp may sanitize the title, so search by ID (skipped when the ID is empty)
+                wanted = {'.mp4', f'.{extension}'.lower()}
+                matches = sorted(p for p in output_dir.glob(f"*{video_id}*") if p.suffix.lower() in wanted)
+                if matches:
+                    downloaded_file = matches[0]
+
+            if downloaded_file.exists():
+                if timestamp:
+                    self.file_timestamps[downloaded_file.name] = timestamp
+                return timestamp, downloaded_file
+
+            return timestamp, None
+
+        except Exception as e:
+            self.log(f"Failed to download video: {e}", "error")
+            return None, None
+    
+    def get_file_timestamps(self) -> Dict[str, datetime]:
+        """Return a shallow copy of the filename -> datetime map collected so far"""
+        return dict(self.file_timestamps)
+    
+    def clear_timestamps(self):
+        """Empty the stored filename -> datetime map in place"""
+        self.file_timestamps.clear()
+
+
+def download_tiktok_profile(username: str,
+                           days: int = 7,
+                           base_path: Path = None,
+                           log_callback=None,
+                           unified_db=None) -> Dict[str, datetime]:
+    """
+    Convenience wrapper: build a TikTokDownloader and download one profile
+
+    Args:
+        username: TikTok username
+        days: Number of days to download
+        base_path: Base download path
+        log_callback: Optional logging callback
+        unified_db: UnifiedDatabase instance (required)
+
+    Returns:
+        Dictionary mapping filenames to timestamps
+
+    Raises:
+        ValueError: If unified_db is missing or falsy
+    """
+    if not unified_db:
+        raise ValueError("unified_db is required for TikTok downloads")
+    return TikTokDownloader(base_path=base_path, log_callback=log_callback, unified_db=unified_db).download_profile(username, number_of_days=days)[0]
+
+
+if __name__ == "__main__":
+    # Smoke test: without a UnifiedDatabase the constructor raises ValueError, so
+    # report that instead of crashing with a traceback.
+    import tempfile
+
+    print("TikTok Downloader Module Test")
+    print("="*60)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        try:
+            TikTokDownloader(base_path=Path(tmpdir))
+        except ValueError as exc:
+            print(f"Standalone construction rejected as expected: {exc}")
+
+    print("Module ready for integration")