Files
media-downloader/modules/semantic_search.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

729 lines
25 KiB
Python

#!/usr/bin/env python3
"""
Semantic Search Module using CLIP
Provides image/video similarity search and natural language search capabilities
"""
import os
import struct
import numpy as np
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
from PIL import Image
import threading
import queue
from datetime import datetime
from modules.universal_logger import get_logger
logger = get_logger('SemanticSearch')
# Global model instance (lazy loaded). All three names are guarded by
# _model_lock so concurrent callers share a single SentenceTransformer load.
_clip_model = None
# Name of the currently loaded model; compared against config to detect changes.
_clip_model_name = None
_model_lock = threading.Lock()
def get_configured_model_name() -> str:
    """Return the CLIP model name configured in application settings.

    Reads the 'semantic_search' settings dict via SettingsManager from the
    shared application database. Falls back to 'clip-ViT-B-32' when the
    settings are missing, malformed, or unreadable.

    Returns:
        Model name string usable with sentence-transformers.
    """
    default_model = 'clip-ViT-B-32'
    try:
        from modules.settings_manager import SettingsManager
        # Use the correct database path (shared app DB next to this package).
        # NOTE: the redundant local `from pathlib import Path` was removed;
        # Path is already imported at module level.
        db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
        settings_manager = SettingsManager(str(db_path))
        semantic_settings = settings_manager.get('semantic_search', {})
        if isinstance(semantic_settings, dict):
            model = semantic_settings.get('model', default_model)
            logger.info(f"Configured CLIP model: {model}")
            return model
        return default_model
    except Exception as e:
        logger.error(f"Failed to get configured model: {e}")
        return default_model
def get_clip_model(model_name: str = None):
    """Get or (re)load the shared CLIP model (thread-safe singleton).

    Args:
        model_name: Explicit sentence-transformers model name; when None,
            the name is read from application settings.

    Returns:
        The loaded SentenceTransformer instance.

    Raises:
        Exception: Propagated from SentenceTransformer when loading fails.
    """
    global _clip_model, _clip_model_name
    if model_name is None:
        model_name = get_configured_model_name()
    # Fix: hold the lock across BOTH the staleness check and the (re)load.
    # The old code checked/reset the globals outside the lock, so two threads
    # could interleave a reset with a load and briefly race on _clip_model.
    with _model_lock:
        # Check if we need to reload (model changed)
        if _clip_model is not None and _clip_model_name != model_name:
            logger.info(f"Model changed from {_clip_model_name} to {model_name}, reloading...")
            _clip_model = None
            _clip_model_name = None
        if _clip_model is None:
            logger.info(f"Loading CLIP model ({model_name})...")
            try:
                from sentence_transformers import SentenceTransformer
                _clip_model = SentenceTransformer(model_name)
                _clip_model_name = model_name
                logger.info(f"CLIP model {model_name} loaded successfully")
            except Exception as e:
                logger.error(f"Failed to load CLIP model: {e}")
                raise
        return _clip_model
def embedding_to_bytes(embedding: np.ndarray) -> bytes:
    """Serialize a numpy embedding as a raw float32 blob for DB storage."""
    as_float32 = embedding.astype(np.float32)
    return as_float32.tobytes()
def bytes_to_embedding(data: bytes) -> np.ndarray:
    """Deserialize a raw float32 blob from the database into a numpy vector."""
    vector = np.frombuffer(data, dtype=np.float32)
    return vector
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Calculate cosine similarity between two embeddings.

    Returns 0.0 when either vector has zero norm; the old code divided by
    zero there, yielding NaN and poisoning downstream similarity sorting.
    """
    denom = float(np.linalg.norm(a)) * float(np.linalg.norm(b))
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)
class SemanticSearch:
    """Semantic search engine using CLIP embeddings.

    Embeddings are generated with a CLIP model (via sentence-transformers),
    stored as float32 blobs in the content_embeddings table, and compared
    with cosine similarity for text, image, and similar-file search.
    """

    SUPPORTED_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
    SUPPORTED_VIDEO_EXTENSIONS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'}

    def __init__(self, unified_db):
        """
        Initialize Semantic Search

        Args:
            unified_db: UnifiedDatabase instance
        """
        self.db = unified_db
        self.logger = get_logger('SemanticSearch')
        self._model = None  # lazily loaded CLIP model, see `model` property

    @property
    def model(self):
        """Lazily load and cache the shared CLIP model."""
        if self._model is None:
            self._model = get_clip_model()
        return self._model

    def _media_kind(self, file_path: str, content_type: str = None) -> Optional[str]:
        """Classify a file as 'image', 'video', or None (unsupported).

        Shared dispatch used by generate_embedding_for_file,
        generate_embeddings_batch, and search_by_file_id so all three agree.

        Args:
            file_path: Path whose extension is checked
            content_type: Optional MIME-like hint (e.g. 'image/jpeg')

        Returns:
            'image', 'video', or None when neither hint nor extension matches.
        """
        ct = (content_type or '').lower()
        ext = Path(file_path).suffix.lower()
        if 'image' in ct or ext in self.SUPPORTED_IMAGE_EXTENSIONS:
            return 'image'
        if 'video' in ct or ext in self.SUPPORTED_VIDEO_EXTENSIONS:
            return 'video'
        return None

    def get_image_embedding(self, image_path: str) -> Optional[np.ndarray]:
        """
        Generate CLIP embedding for an image

        Args:
            image_path: Path to the image file

        Returns:
            Embedding vector or None on error
        """
        try:
            # Load and preprocess image
            with Image.open(image_path) as image:
                # CLIP expects RGB; convert palette/greyscale/alpha modes.
                if image.mode != 'RGB':
                    image = image.convert('RGB')
                # Generate embedding
                embedding = self.model.encode(image, convert_to_numpy=True)
                return embedding
        except Exception as e:
            self.logger.debug(f"Failed to get embedding for {image_path}: {e}")
            return None

    def get_video_frame_embedding(self, video_path: str, frame_position: float = 0.1) -> Optional[np.ndarray]:
        """
        Generate CLIP embedding for a video by extracting a frame

        Args:
            video_path: Path to the video file
            frame_position: Position in video (0-1) to extract frame from

        Returns:
            Embedding vector or None on error
        """
        # Try cv2 first, fall back to ffmpeg for codecs cv2 can't handle (e.g. AV1)
        image = self._extract_frame_cv2(video_path, frame_position)
        if image is None:
            image = self._extract_frame_ffmpeg(video_path, frame_position)
        if image is None:
            return None
        try:
            embedding = self.model.encode(image, convert_to_numpy=True)
            return embedding
        except Exception as e:
            self.logger.debug(f"Failed to encode video frame for {video_path}: {e}")
            return None
        finally:
            # Clean up image to prevent memory leaks
            try:
                image.close()
            except Exception:
                pass

    def _extract_frame_cv2(self, video_path: str, frame_position: float) -> Optional["Image.Image"]:
        """Extract a frame with OpenCV; returns a PIL image or None on failure."""
        try:
            import cv2
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                return None
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                cap.release()
                return None
            target_frame = int(total_frames * frame_position)
            cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
            ret, frame = cap.read()
            cap.release()
            if not ret:
                return None
            # OpenCV decodes to BGR; CLIP/PIL expect RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            return Image.fromarray(frame_rgb)
        except Exception as e:
            self.logger.debug(f"cv2 frame extraction failed for {video_path}: {e}")
            return None

    def _extract_frame_ffmpeg(self, video_path: str, frame_position: float) -> Optional["Image.Image"]:
        """Extract frame using ffmpeg (fallback for codecs cv2 can't handle)"""
        try:
            import subprocess
            import tempfile
            # Get video duration
            probe_cmd = [
                'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1', video_path
            ]
            result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
            if result.returncode != 0:
                return None
            duration = float(result.stdout.strip())
            seek_time = duration * frame_position
            # Extract frame to temp file
            with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
                tmp_path = tmp.name
            try:
                extract_cmd = [
                    'ffmpeg', '-y', '-ss', str(seek_time), '-i', video_path,
                    '-vframes', '1', '-q:v', '2', tmp_path
                ]
                result = subprocess.run(extract_cmd, capture_output=True, timeout=30)
                if result.returncode != 0 or not os.path.exists(tmp_path):
                    return None
                image = Image.open(tmp_path)
                image.load()  # Load into memory before deleting file
                if image.mode != 'RGB':
                    image = image.convert('RGB')
                return image
            finally:
                # Fix: the temp file previously leaked when ffmpeg failed or
                # timed out; always remove it, best-effort.
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass  # Best effort cleanup of temp file
        except Exception as e:
            self.logger.debug(f"ffmpeg frame extraction failed for {video_path}: {e}")
            return None

    def get_text_embedding(self, text: str) -> Optional[np.ndarray]:
        """
        Generate CLIP embedding for text query

        Args:
            text: Text query

        Returns:
            Embedding vector or None on error
        """
        try:
            embedding = self.model.encode(text, convert_to_numpy=True)
            return embedding
        except Exception as e:
            self.logger.error(f"Failed to get text embedding: {e}")
            return None

    def store_embedding(self, file_id: int, embedding: np.ndarray) -> bool:
        """
        Store embedding in database

        Args:
            file_id: File inventory ID
            embedding: Embedding vector

        Returns:
            Success status
        """
        try:
            embedding_bytes = embedding_to_bytes(embedding)
            # Fix: record the model that actually produced this embedding.
            # The old code hard-coded 'clip-ViT-B-32' even though the model is
            # configurable, mislabeling vectors from other models.
            model_name = _clip_model_name or get_configured_model_name()
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO content_embeddings
                    (file_id, embedding, embedding_model, embedding_version, created_date)
                    VALUES (?, ?, ?, 1, CURRENT_TIMESTAMP)
                ''', (file_id, embedding_bytes, model_name))
            return True
        except Exception as e:
            self.logger.error(f"Failed to store embedding for file {file_id}: {e}")
            return False

    def get_embedding(self, file_id: int) -> Optional[np.ndarray]:
        """
        Get stored embedding from database

        Args:
            file_id: File inventory ID

        Returns:
            Embedding vector or None
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT embedding FROM content_embeddings WHERE file_id = ?
                ''', (file_id,))
                row = cursor.fetchone()
                if row and row['embedding']:
                    return bytes_to_embedding(row['embedding'])
                return None
        except Exception as e:
            self.logger.error(f"Failed to get embedding for file {file_id}: {e}")
            return None

    def delete_embedding(self, file_id: int) -> bool:
        """
        Delete embedding for a file

        Args:
            file_id: File inventory ID

        Returns:
            True if deleted, False otherwise
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('DELETE FROM content_embeddings WHERE file_id = ?', (file_id,))
                if cursor.rowcount > 0:
                    self.logger.debug(f"Deleted embedding for file_id {file_id}")
                    return True
                return False
        except Exception as e:
            self.logger.error(f"Failed to delete embedding for file {file_id}: {e}")
            return False

    def delete_embedding_by_path(self, file_path: str) -> bool:
        """
        Delete embedding for a file by its path

        Args:
            file_path: File path

        Returns:
            True if deleted, False otherwise
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # First get the file_id
                cursor.execute('SELECT id FROM file_inventory WHERE file_path = ?', (file_path,))
                row = cursor.fetchone()
                if row:
                    cursor.execute('DELETE FROM content_embeddings WHERE file_id = ?', (row['id'],))
                    if cursor.rowcount > 0:
                        self.logger.debug(f"Deleted embedding for {file_path}")
                        return True
                return False
        except Exception as e:
            self.logger.error(f"Failed to delete embedding for {file_path}: {e}")
            return False

    def generate_embedding_for_file(self, file_id: int, file_path: str, content_type: str = None) -> bool:
        """
        Generate and store embedding for a single file

        Args:
            file_id: File inventory ID
            file_path: Path to the file
            content_type: Optional content type ('image' or 'video')

        Returns:
            True if embedding generated and stored successfully
        """
        try:
            if not os.path.exists(file_path):
                self.logger.debug(f"File not found for embedding: {file_path}")
                return False
            # Shared image/video dispatch (was duplicated with the batch path).
            kind = self._media_kind(file_path, content_type)
            embedding = None
            if kind == 'image':
                embedding = self.get_image_embedding(file_path)
            elif kind == 'video':
                embedding = self.get_video_frame_embedding(file_path)
            if embedding is not None and self.store_embedding(file_id, embedding):
                self.logger.debug(f"Generated embedding for file_id {file_id}: {Path(file_path).name}")
                return True
            return False
        except Exception as e:
            self.logger.error(f"Failed to generate embedding for file {file_id}: {e}")
            return False

    def get_embedding_stats(self) -> Dict:
        """Get statistics about embeddings in the database"""
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Total embeddings for files in 'final' location only
                # (excludes embeddings for files moved to recycle bin or review)
                cursor.execute('''
                    SELECT COUNT(*) FROM content_embeddings ce
                    JOIN file_inventory fi ON ce.file_id = fi.id
                    WHERE fi.location = 'final'
                ''')
                total_embeddings = cursor.fetchone()[0]
                # Total files in final location
                cursor.execute("SELECT COUNT(*) FROM file_inventory WHERE location = 'final'")
                total_files = cursor.fetchone()[0]
                # Files without embeddings
                cursor.execute('''
                    SELECT COUNT(*) FROM file_inventory fi
                    WHERE fi.location = 'final'
                    AND NOT EXISTS (SELECT 1 FROM content_embeddings ce WHERE ce.file_id = fi.id)
                ''')
                missing_embeddings = cursor.fetchone()[0]
                return {
                    'total_embeddings': total_embeddings,
                    'total_files': total_files,
                    'missing_embeddings': missing_embeddings,
                    'coverage_percent': round((total_embeddings / total_files * 100) if total_files > 0 else 0, 2)
                }
        except Exception as e:
            self.logger.error(f"Failed to get embedding stats: {e}")
            return {}

    def generate_embeddings_batch(self, limit: int = 100, platform: str = None,
                                  progress_callback=None) -> Dict:
        """
        Generate embeddings for files that don't have them yet

        Args:
            limit: Maximum files to process
            platform: Filter by platform
            progress_callback: Optional callback(processed, total, current_file)

        Returns:
            Dict with success/error counts
        """
        results = {'processed': 0, 'success': 0, 'errors': 0, 'skipped': 0}
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Get files without embeddings
                query = '''
                    SELECT fi.id, fi.file_path, fi.content_type, fi.filename
                    FROM file_inventory fi
                    WHERE fi.location = 'final'
                    AND NOT EXISTS (SELECT 1 FROM content_embeddings ce WHERE ce.file_id = fi.id)
                '''
                params = []
                if platform:
                    query += ' AND fi.platform = ?'
                    params.append(platform)
                query += ' LIMIT ?'
                params.append(limit)
                cursor.execute(query, params)
                files = cursor.fetchall()
            total = len(files)
            self.logger.info(f"Processing {total} files for embedding generation")
            for i, file_row in enumerate(files):
                file_id = file_row['id']
                file_path = file_row['file_path']
                content_type = file_row['content_type'] or ''
                filename = file_row['filename'] or ''
                results['processed'] += 1
                if progress_callback:
                    progress_callback(i + 1, total, filename)
                # Skip if file doesn't exist
                if not os.path.exists(file_path):
                    results['skipped'] += 1
                    continue
                # Determine file type via the shared classifier
                kind = self._media_kind(file_path, content_type)
                if kind is None:
                    results['skipped'] += 1
                    continue
                if kind == 'image':
                    embedding = self.get_image_embedding(file_path)
                else:
                    embedding = self.get_video_frame_embedding(file_path)
                if embedding is not None and self.store_embedding(file_id, embedding):
                    results['success'] += 1
                else:
                    results['errors'] += 1
            self.logger.info(f"Embedding generation complete: {results}")
            return results
        except Exception as e:
            self.logger.error(f"Failed to generate embeddings batch: {e}")
            return results

    def search_by_text(self, query: str, limit: int = 50, platform: str = None,
                       source: str = None, threshold: float = 0.2) -> List[Dict]:
        """
        Search for images/videos using natural language

        Args:
            query: Natural language search query
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score (0-1)

        Returns:
            List of files with similarity scores
        """
        try:
            # Get text embedding
            query_embedding = self.get_text_embedding(query)
            if query_embedding is None:
                return []
            return self._search_by_embedding(query_embedding, limit, platform, source, threshold)
        except Exception as e:
            self.logger.error(f"Text search failed: {e}")
            return []

    def search_by_image(self, image_path: str, limit: int = 50, platform: str = None,
                        source: str = None, threshold: float = 0.5) -> List[Dict]:
        """
        Find similar images to a given image

        Args:
            image_path: Path to query image
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score (0-1)

        Returns:
            List of similar files with scores
        """
        try:
            # Get image embedding
            query_embedding = self.get_image_embedding(image_path)
            if query_embedding is None:
                return []
            return self._search_by_embedding(query_embedding, limit, platform, source, threshold)
        except Exception as e:
            self.logger.error(f"Image search failed: {e}")
            return []

    def search_by_file_id(self, file_id: int, limit: int = 50, platform: str = None,
                          source: str = None, threshold: float = 0.5) -> List[Dict]:
        """
        Find similar files to a file already in the database

        Args:
            file_id: File inventory ID
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score (0-1)

        Returns:
            List of similar files with scores
        """
        try:
            # Get existing embedding
            query_embedding = self.get_embedding(file_id)
            if query_embedding is None:
                # Not indexed yet -- try to generate an embedding on the fly.
                with self.db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute('SELECT file_path, content_type FROM file_inventory WHERE id = ?', (file_id,))
                    row = cursor.fetchone()
                if row:
                    file_path = row['file_path']
                    # Fix: the old code always used the image path here, which
                    # failed for video files; dispatch on media type instead.
                    kind = self._media_kind(file_path, row['content_type'])
                    if kind == 'video':
                        query_embedding = self.get_video_frame_embedding(file_path)
                    else:
                        query_embedding = self.get_image_embedding(file_path)
            if query_embedding is None:
                return []
            results = self._search_by_embedding(query_embedding, limit + 1, platform, source, threshold)
            # Remove the query file itself from results
            return [r for r in results if r['id'] != file_id][:limit]
        except Exception as e:
            self.logger.error(f"Similar file search failed: {e}")
            return []

    def _search_by_embedding(self, query_embedding: np.ndarray, limit: int,
                             platform: str = None, source: str = None,
                             threshold: float = 0.2) -> List[Dict]:
        """
        Internal search using embedding vector

        Args:
            query_embedding: Query embedding vector
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score

        Returns:
            List of files with similarity scores, sorted by score
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Build query to get all embeddings (with optional filters)
                query = '''
                    SELECT ce.file_id, ce.embedding, fi.file_path, fi.filename,
                           fi.platform, fi.source, fi.content_type, fi.file_size
                    FROM content_embeddings ce
                    JOIN file_inventory fi ON fi.id = ce.file_id
                    WHERE fi.location = 'final'
                '''
                params = []
                if platform:
                    query += ' AND fi.platform = ?'
                    params.append(platform)
                if source:
                    query += ' AND fi.source = ?'
                    params.append(source)
                cursor.execute(query, params)
                results = []
                # Brute-force scan: similarity is computed in Python for every
                # stored embedding, then filtered by threshold.
                for row in cursor.fetchall():
                    embedding = bytes_to_embedding(row['embedding'])
                    similarity = cosine_similarity(query_embedding, embedding)
                    if similarity >= threshold:
                        results.append({
                            'id': row['file_id'],
                            'file_path': row['file_path'],
                            'filename': row['filename'],
                            'platform': row['platform'],
                            'source': row['source'],
                            'content_type': row['content_type'],
                            'file_size': row['file_size'],
                            'similarity': round(similarity, 4)
                        })
                # Sort by similarity descending
                results.sort(key=lambda x: x['similarity'], reverse=True)
                return results[:limit]
        except Exception as e:
            self.logger.error(f"Embedding search failed: {e}")
            return []
# Global instance (lazy initialization): created on first call to
# get_semantic_search() so importing this module stays cheap.
_semantic_search = None
def reset_clip_model():
    """Drop the cached CLIP model so the next access reloads it with fresh config."""
    global _clip_model, _clip_model_name
    with _model_lock:
        _clip_model_name = None
        _clip_model = None
        logger.info("CLIP model cache cleared, will reload on next use")
def get_semantic_search(unified_db=None, force_reload=False):
    """Return the shared SemanticSearch instance, creating it on first use.

    Args:
        unified_db: Database instance to use
        force_reload: If True, recreate the instance (useful when model config changes)
    """
    global _semantic_search
    # Guard clause: reuse the cached instance unless a rebuild was requested.
    if _semantic_search is not None and not force_reload:
        return _semantic_search
    if force_reload:
        # Also reset the CLIP model so it reloads with new config
        reset_clip_model()
    db = unified_db
    if db is None:
        from modules.unified_database import UnifiedDatabase
        db = UnifiedDatabase()
    _semantic_search = SemanticSearch(db)
    return _semantic_search