Files
media-downloader/modules/tiktok_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

603 lines
26 KiB
Python
Executable File

#!/usr/bin/env python3
"""
TikTok Download Module - Downloads TikTok videos with proper timestamp extraction
"""
import os
import re
import json
import subprocess
import sqlite3
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from modules.base_module import LoggingMixin
class TikTokDownloader(LoggingMixin):
    """Downloads TikTok videos and extracts metadata including timestamps"""

    def __init__(self, base_path: Optional[Path] = None, log_callback=None,
                 use_database: bool = True, unified_db=None):
        """
        Initialize TikTok downloader

        Args:
            base_path: Base path for downloads (defaults to the current working directory)
            log_callback: Optional callback for logging (tag, level, message)
            use_database: Whether to use database for tracking downloads
            unified_db: UnifiedDatabase instance (required)

        Raises:
            ValueError: If unified_db is not provided — standalone databases
                are no longer supported.
        """
        # Initialize logging via mixin
        self._init_logger('TikTok', log_callback, default_module='Download')
        self.base_path = Path(base_path) if base_path else Path.cwd()
        self.file_timestamps = {}  # Map of filename -> datetime
        self.use_database = use_database
        # Always use unified database adapter; fail fast when it is missing
        # rather than silently falling back to a per-module SQLite file.
        if not unified_db:
            raise ValueError("TikTok module requires unified_db - standalone database is no longer supported")
        # NOTE(review): imported lazily — presumably to avoid import cycles
        # or optional-dependency issues at module load time; confirm.
        from modules.tiktok_db_adapter import TikTokDatabaseAdapter
        self.db = TikTokDatabaseAdapter(unified_db)
        self.use_unified_db = True
        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)
        self.pending_downloads = []  # Track downloads for deferred database recording
def _is_already_downloaded(self, video_id: str, username: str = None) -> bool:
    """Return True when this video was already downloaded before.

    A username enables the precise per-user lookup; without one we fall
    back to the global video-id check. With database tracking disabled,
    nothing is ever treated as a duplicate.
    """
    if not self.use_database:
        return False
    found = (self.db.is_downloaded(video_id, username)
             if username
             else self.db.is_already_downloaded(video_id))
    return found
def _record_download(self, video_id: str, username: str, filename: str,
                     post_date: Optional[datetime] = None, metadata: Dict = None,
                     deferred: bool = False):
    """Record a successful download, immediately or deferred.

    Args:
        video_id: Platform video id.
        username: Profile the video belongs to.
        filename: Full path (or bare name) of the downloaded file.
        post_date: Original post timestamp, if known.
        metadata: Extra metadata (e.g. title/description) to persist.
        deferred: If True, don't record to database now - append to the
            pending_downloads list for later recording after the file
            move is complete.

    Returns:
        True for a deferred record; otherwise the adapter's result, or
        None when database tracking is disabled.
    """
    # Keep both forms: the DB stores the bare filename, while the full
    # path is needed later for hashing/moving.
    # (Fixed: dropped a redundant function-local `from pathlib import Path`
    # that shadowed the module-level import.)
    file_path = str(filename)              # Full path
    filename_only = Path(filename).name    # Just the filename
    # If deferred, store for later recording instead of recording now
    if deferred:
        self.pending_downloads.append({
            'video_id': video_id,
            'username': username,
            'filename': filename_only,
            'post_date': post_date.isoformat() if post_date else None,
            'file_path': file_path,
            'metadata': metadata
        })
        self.log(f"Deferred recording for {video_id}", "debug")
        return True
    if not self.use_database:
        return
    return self.db.record_download(
        video_id=video_id,
        username=username,
        filename=filename_only,
        post_date=post_date,
        metadata=metadata,
        file_path=file_path
    )
def get_pending_downloads(self):
    """Return a shallow copy of the deferred download records."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Reset the deferred-download list once the records are persisted."""
    self.pending_downloads = []
def extract_date_from_info(self, info_dict: Dict) -> Optional[datetime]:
    """
    Extract upload date from yt-dlp info dictionary.

    Preference order: 'timestamp', 'release_timestamp',
    'modified_timestamp' (all Unix epoch seconds, treated as UTC), then
    the date-only 'upload_date' (YYYYMMDD) as a last resort since it
    loses the time of day.

    Args:
        info_dict: yt-dlp info dictionary

    Returns:
        Naive datetime (UTC wall-clock, tzinfo stripped) or None.
    """
    from datetime import timezone
    # The three epoch fields share identical conversion logic; try them
    # in order of reliability instead of repeating the branch three times.
    for key, label in (('timestamp', 'full timestamp'),
                       ('release_timestamp', 'release timestamp'),
                       ('modified_timestamp', 'modified timestamp')):
        epoch = info_dict.get(key)
        if not epoch:
            continue
        try:
            dt_utc = datetime.fromtimestamp(epoch, tz=timezone.utc)
            # Strip tzinfo so the result mixes cleanly with the naive
            # datetimes used elsewhere in this module.
            dt = dt_utc.replace(tzinfo=None)
            self.log(f"Extracted {label} (UTC): {dt}", "debug")
            return dt
        except Exception:
            continue
    # Fall back to upload_date (YYYYMMDD format - only has date, no time).
    upload_date = info_dict.get('upload_date')
    if upload_date and len(upload_date) == 8:
        try:
            dt = datetime.strptime(upload_date, '%Y%m%d')
            self.log(f"Only date available (no time): {dt.date()}", "warning")
            return dt
        except Exception:
            pass
    return None
def download_profile(self,
                     username: str,
                     number_of_days: int = 7,
                     full_profile: bool = False,
                     output_dir: Path = None,
                     defer_database: bool = False) -> Tuple[Dict[str, datetime], List[Path]]:
    """
    Download TikTok profile videos.

    Hybrid strategy: yt-dlp (--flat-playlist) quickly enumerates video ids
    with upload dates, then gallery-dl downloads each post individually so
    carousel photo posts are handled too.

    Args:
        username: TikTok username (without @)
        number_of_days: Number of days to download (ignored if full_profile=True)
        full_profile: If True, download entire profile
        output_dir: Output directory (uses base_path/username if not specified)
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete

    Returns:
        Tuple of (file_timestamps dict mapping filename -> datetime,
        list of downloaded file Paths)
    """
    self.defer_database = defer_database  # Store for use in _record_download
    username = username.lstrip('@')
    output_dir = output_dir or self.base_path / username
    output_dir.mkdir(parents=True, exist_ok=True)
    self.log(f"Downloading TikTok profile: @{username}", "info")
    self.activity_manager.update_status("Checking videos")

    # Step 1: Use yt-dlp to quickly get the list of video IDs with dates
    profile_url = f"https://www.tiktok.com/@{username}"
    list_cmd = [
        "yt-dlp",
        "--flat-playlist",                    # Don't download, just list
        "--print", "%(upload_date)s %(id)s",  # Print date and ID
        "--quiet",
        "--no-warnings",
        profile_url
    ]
    self.log("Getting video list with yt-dlp...", "debug")
    try:
        result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=60)
        lines = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()]
        # Parse and filter by date if needed
        video_ids = []
        if not full_profile and number_of_days:
            from datetime import timedelta
            cutoff_date = datetime.now() - timedelta(days=number_of_days)
            cutoff_str = cutoff_date.strftime('%Y%m%d')
            for line in lines:
                parts = line.split()
                if len(parts) >= 2:
                    upload_date, video_id = parts[0], parts[1]
                    # YYYYMMDD strings compare correctly as text
                    if upload_date >= cutoff_str:
                        video_ids.append(video_id)
        else:
            # No filter, take all
            video_ids = [line.split()[1] for line in lines if len(line.split()) >= 2]
        self.log(f"Found {len(video_ids)} posts to download", "info")
    except Exception as e:
        self.log(f"Failed to get video list: {e}", "error")
        return {}, []
    if not video_ids:
        self.log("No videos found matching criteria", "info")
        return {}, []

    # Set initial progress so dashboard shows 0/N immediately
    self.activity_manager.update_status(
        "Downloading videos",
        progress_current=0,
        progress_total=len(video_ids)
    )
    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'tiktok:{username}', 'scraping')
    checkpoint.start(total_items=len(video_ids))
    if checkpoint.is_recovering():
        self.log(f"TikTok @{username}: recovering — skipping already-downloaded videos", "info")

    # Step 2: Download each video individually with gallery-dl
    # (fast per video, and unlike yt-dlp it handles photo carousels)
    for i, video_id in enumerate(video_ids, 1):
        # Update progress at start of each iteration (fires even on skips)
        self.activity_manager.update_status(
            "Downloading videos",
            progress_current=i,
            progress_total=len(video_ids)
        )
        # Skip if already completed in a previous crashed run
        if checkpoint.is_completed(video_id):
            continue
        checkpoint.set_current(video_id)
        # Skip if already downloaded
        if self._is_already_downloaded(video_id, username):
            self.log(f"[{i}/{len(video_ids)}] Skipping already downloaded: {video_id}", "debug")
            checkpoint.mark_completed(video_id)
            continue
        video_url = f"https://www.tiktok.com/@{username}/video/{video_id}"
        self.log(f"[{i}/{len(video_ids)}] Downloading {video_id}", "debug")
        cmd = [
            "gallery-dl",
            "--write-metadata",
            "-D", str(output_dir),
            "-f", "{date:%Y%m%d}_{desc}_{id}_{num}.{extension}",
            video_url
        ]
        try:
            self.log(f"Calling gallery-dl for {video_id}", "debug")
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            self.log(f"gallery-dl returned: code={result.returncode}, stdout lines={len(result.stdout.splitlines()) if result.stdout else 0}", "debug")
            if result.returncode != 0 and result.stderr:
                stderr = result.stderr
                if "not available" in stderr.lower() or "404" in stderr:
                    self.log(f"Video {video_id} not available (deleted or private)", "warning")
                else:
                    self.log(f"Failed to download {video_id}: {stderr[:100]}", "warning")
        except subprocess.TimeoutExpired:
            self.log(f"Timeout downloading {video_id}", "warning")
        except Exception as e:
            self.log(f"Error downloading {video_id}: {e}", "warning")
        checkpoint.mark_completed(video_id)
    checkpoint.finish()

    # Post-process: Rename files with long descriptions and remove audio-only files
    for file in output_dir.glob("*"):
        if file.is_file() and not file.suffix == '.json':
            # Remove audio-only files (.mp3, .m4a, .aac, ...)
            if file.suffix.lower() in ['.mp3', '.m4a', '.aac', '.wav', '.ogg']:
                self.log(f"Removing audio-only file: {file.name}", "debug")
                file.unlink()
                # Also remove corresponding JSON (gallery-dl names it name.ext.json)
                json_file = file.with_suffix(file.suffix + '.json')
                if json_file.exists():
                    json_file.unlink()
                continue
            # Truncate long filenames (max 255 chars for Linux)
            if len(file.name) > 200:  # Leave some margin
                # Parse filename: YYYYMMDD_description_ID_NUM.ext
                parts = file.name.rsplit('_', 2)  # Split from right to preserve ID and num
                if len(parts) == 3:
                    date_and_desc, video_id, num_and_ext = parts
                    # Split date from description
                    date_part = date_and_desc[:8]   # YYYYMMDD
                    desc_part = date_and_desc[9:]   # Everything after date_
                    # Calculate max description length
                    # Format: DATE_DESC_ID_NUM.EXT
                    fixed_length = len(date_part) + len(video_id) + len(num_and_ext) + 3  # 3 underscores
                    max_desc_len = 200 - fixed_length
                    if len(desc_part) > max_desc_len:
                        truncated_desc = desc_part[:max_desc_len-3] + "..."
                        new_name = f"{date_part}_{truncated_desc}_{video_id}_{num_and_ext}"
                        new_path = file.parent / new_name
                        self.log(f"Truncating long filename: {file.name[:50]}... -> {new_name[:50]}...", "debug")
                        file.rename(new_path)
                        # Rename corresponding JSON file too (it still has the old stem)
                        json_file = Path(str(file) + '.json')
                        if json_file.exists():
                            new_json = Path(str(new_path) + '.json')
                            json_file.rename(new_json)

    # Process downloaded files and extract timestamps from JSON
    downloaded_files = []
    file_timestamps = {}
    processed_ids = set()  # IDs found already-downloaded in the DB this run
    started_ids = set()    # IDs we've started processing in THIS run
    for json_file in output_dir.glob("*.json"):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                info = json.load(f)
            # Get video ID
            video_id = info.get('id', '')
            # Extract timestamp from gallery-dl's createTime field (needed for all files)
            timestamp = None
            create_time = info.get('createTime')
            if create_time:
                try:
                    timestamp = datetime.fromtimestamp(int(create_time))
                    self.log(f"Extracted timestamp {timestamp} from createTime", "debug")
                except Exception:
                    # Fall back to old yt-dlp method if createTime not usable
                    timestamp = self.extract_date_from_info(info)
            # gallery-dl names JSON files as: filename.ext.json
            # So we need to remove the .json extension to get the media file
            media_file = Path(str(json_file)[:-5])  # Remove .json extension
            if not media_file.exists():
                self.log(f"Media file not found for {json_file.name}", "warning")
                json_file.unlink()
                continue
            video_file = media_file  # Use same variable name for compatibility
            # Check if already downloaded - but only check ONCE per video_id per run
            # (Don't check again for carousel photos #2, #3 after we've started #1)
            if video_id and video_id not in started_ids:
                if self._is_already_downloaded(video_id, username):
                    self.log(f"Skipping already downloaded post: {video_id}", "debug")
                    # Mark as processed so we don't check again for this ID's other files
                    processed_ids.add(video_id)
                    # Just remove JSON file, keep media files (they're already processed)
                    json_file.unlink()
                    continue
                # Mark that we've started processing this video_id
                started_ids.add(video_id)
            # Skip if this video_id was marked as already downloaded
            if video_id in processed_ids:
                json_file.unlink()
                continue
            # ALWAYS add file to downloaded list and apply timestamp (even for carousel photos #2, #3)
            downloaded_files.append(video_file)
            if timestamp:
                file_timestamps[video_file.name] = timestamp
                self.log(f"Extracted timestamp {timestamp} for {video_file.name}", "debug")
            # Check for duplicate hash before recording (hash blacklist persists even if original deleted)
            file_hash = self.db.get_file_hash(str(video_file)) if self.db else None
            if file_hash:
                existing = self.db.get_download_by_file_hash(file_hash)
                if existing and existing.get('file_path') and str(video_file) != existing.get('file_path'):
                    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                    self.log(f"⚠ Duplicate content detected (hash match): {video_file.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                    # Delete the duplicate regardless of whether original file still exists
                    try:
                        video_file.unlink()
                        self.log(f"Deleted duplicate (hash blacklist): {video_file.name}", "debug")
                        # BUGFIX: the file was already appended above - drop it
                        # from the results so callers don't try to timestamp or
                        # move a file that no longer exists.
                        downloaded_files.remove(video_file)
                        file_timestamps.pop(video_file.name, None)
                        # Mark as processed so we don't try to download again
                        processed_ids.add(video_id)
                        json_file.unlink()
                        continue
                    except Exception as e:
                        self.log(f"Failed to delete duplicate {video_file.name}: {e}", "warning")
            # Record in database (each file gets its own entry, even for carousels)
            if video_id:
                self._record_download(
                    video_id=video_id,
                    username=username,
                    filename=video_file.name,
                    post_date=timestamp,
                    metadata={"title": info.get('desc', ''), "description": info.get('desc', '')},
                    deferred=self.defer_database
                )
            # Remove JSON file after processing
            json_file.unlink()
        except Exception as e:
            self.log(f"Failed to process {json_file}: {e}", "error")
    self.log(f"Downloaded {len(downloaded_files)} files from @{username}", "info")

    # Apply timestamps to files (os is imported at module level)
    for file_path in downloaded_files:
        filename = file_path.name
        if filename in file_timestamps:
            timestamp = file_timestamps[filename]
            try:
                # Convert datetime to unix timestamp
                unix_time = timestamp.timestamp()
                # Set both access time and modification time
                # BUGFIX: log the actual filename instead of the literal "(unknown)"
                os.utime(str(file_path), (unix_time, unix_time))
                self.log(f"Applied timestamp {timestamp} to {filename}", "debug")
            except Exception as e:
                self.log(f"Failed to apply timestamp to {filename}: {e}", "warning")
    # Store timestamps for later use
    self.file_timestamps.update(file_timestamps)
    return file_timestamps, downloaded_files
def download_video(self, url: str, output_dir: Path = None) -> Tuple[Optional[datetime], Optional[Path]]:
    """
    Download a single TikTok video.

    Photo/image posts (no video stream) are deliberately skipped.

    Args:
        url: TikTok video URL
        output_dir: Output directory (defaults to base_path)

    Returns:
        Tuple of (timestamp or None, downloaded file Path or None)
    """
    output_dir = output_dir or self.base_path
    output_dir.mkdir(parents=True, exist_ok=True)
    self.log(f"Downloading video: {url}", "info")
    # First, get video info without downloading
    cmd_info = [
        "yt-dlp",
        "--dump-json",
        "--no-warnings",
        "--quiet",
        url
    ]
    try:
        # Bound the metadata probe so a hung yt-dlp can't stall the caller
        # forever (mirrors download_profile's timeouts); TimeoutExpired is
        # an Exception subclass, so the except below handles it.
        result = subprocess.run(cmd_info, capture_output=True, text=True, timeout=60)
        if result.returncode != 0:
            self.log(f"Failed to get video info: {result.stderr}", "error")
            return None, None
        info = json.loads(result.stdout)
        timestamp = self.extract_date_from_info(info)
        # Check if this is a photo post (no video, only audio)
        formats = info.get('formats', [])
        has_video = any(f.get('vcodec') != 'none' for f in formats)
        if not has_video and len(formats) > 0:
            # This is a photo/image post - skip it
            self.log("Skipping TikTok photo post (only videos are downloaded)", "info")
            return timestamp, None
        # Download video
        output_template = str(output_dir / "%(upload_date)s_%(title)s_%(id)s.%(ext)s")
        cmd_download = [
            "yt-dlp",
            "--format", "best",  # Explicitly request best video+audio format
            "--no-warnings",
            "--quiet",
            "-o", output_template,
            url
        ]
        # More generous timeout for the actual media transfer
        result = subprocess.run(cmd_download, capture_output=True, text=True, timeout=300)
        if result.returncode != 0:
            self.log(f"Failed to download video: {result.stderr}", "error")
            return timestamp, None
        # Find the downloaded file by substituting the template fields
        expected_name = output_template.replace('%(upload_date)s', info.get('upload_date', 'unknown'))
        expected_name = expected_name.replace('%(title)s', info.get('title', 'video'))
        expected_name = expected_name.replace('%(id)s', info.get('id', ''))
        expected_name = expected_name.replace('%(ext)s', info.get('ext', 'mp4'))
        downloaded_file = Path(expected_name)
        if not downloaded_file.exists():
            # yt-dlp may have sanitized the title; fall back to globbing by id
            pattern = f"*{info.get('id', '')}*.mp4"
            matches = list(output_dir.glob(pattern))
            if matches:
                downloaded_file = matches[0]
        if downloaded_file.exists():
            if timestamp:
                self.file_timestamps[downloaded_file.name] = timestamp
            return timestamp, downloaded_file
        return timestamp, None
    except Exception as e:
        self.log(f"Failed to download video: {e}", "error")
        return None, None
def get_file_timestamps(self) -> Dict[str, datetime]:
    """Return a copy of the filename -> datetime map collected so far."""
    return dict(self.file_timestamps)
def clear_timestamps(self):
    """Forget every collected timestamp (mutates the dict in place)."""
    self.file_timestamps.clear()
def download_tiktok_profile(username: str,
                            days: int = 7,
                            base_path: Path = None,
                            log_callback=None,
                            unified_db=None) -> Dict[str, datetime]:
    """
    Convenience wrapper around TikTokDownloader.download_profile.

    Args:
        username: TikTok username
        days: Number of days to download
        base_path: Base download path
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance (required)

    Returns:
        Dictionary mapping filenames to timestamps

    Raises:
        ValueError: When unified_db is missing.
    """
    # Guard clause: the downloader itself would also reject this, but
    # failing here gives the caller a clearer message.
    if not unified_db:
        raise ValueError("unified_db is required for TikTok downloads")
    downloader = TikTokDownloader(base_path=base_path,
                                  log_callback=log_callback,
                                  unified_db=unified_db)
    timestamps, _files = downloader.download_profile(username, number_of_days=days)
    return timestamps
if __name__ == "__main__":
    # Module smoke test. BUGFIX: the previous demo instantiated
    # TikTokDownloader without unified_db, which __init__ rejects with a
    # ValueError - so the "test" always crashed. Without a real
    # UnifiedDatabase we can only verify that the constructor guard fires.
    import tempfile

    print("TikTok Downloader Module Test")
    print("=" * 60)
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            TikTokDownloader(base_path=Path(tmpdir))
        except ValueError as e:
            # Expected: standalone databases are no longer supported
            print(f"Constructor guard OK: {e}")
        # With a real UnifiedDatabase instance you could exercise a download:
        # downloader = TikTokDownloader(base_path=Path(tmpdir), unified_db=db)
        # timestamps, files = downloader.download_profile("username", number_of_days=1)
    print("Module ready for integration")