#!/usr/bin/env python3
"""
Nightly embedding generation script
Run via systemd timer to index new media files
"""

import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, '/opt/media-downloader')

# Bootstrap database backend (must be before any sqlite3 imports)
import modules.db_bootstrap  # noqa: E402,F401

from modules.universal_logger import get_logger
from modules.unified_database import UnifiedDatabase
from modules.semantic_search import SemanticSearch

logger = get_logger('EmbeddingGenerator')

# Nightly work budget: at most _MAX_BATCHES * _BATCH_SIZE files per run.
_BATCH_SIZE = 1000
_MAX_BATCHES = 10
# Emit a progress line whenever the processed count is a multiple of this.
_PROGRESS_LOG_INTERVAL = 100


def _log_progress(processed, total, current_file):
    """Progress callback handed to the batch generator; logs periodically."""
    if processed % _PROGRESS_LOG_INTERVAL == 0:
        logger.info(f" Progress: {processed}/{total} - {current_file}")


def _run_embedding_batches(db):
    """Run the nightly batches and return the number of new embeddings.

    Unlike ``generate_embeddings`` this helper lets exceptions propagate so
    that the caller can decide how a failure should be reported.

    Args:
        db: Database handle passed straight to ``SemanticSearch``.

    Returns:
        Sum of the ``'success'`` counts reported by the processed batches
        (0 when nothing was missing).
    """
    logger.info("=== Embedding Generation ===")

    semantic = SemanticSearch(db)

    # Get current stats
    stats = semantic.get_embedding_stats()
    logger.info(f"Current stats: {stats['total_embeddings']} embeddings, "
                f"{stats['missing_embeddings']} missing, "
                f"{stats['coverage_percent']}% coverage")

    if stats['missing_embeddings'] == 0:
        logger.info("All files already have embeddings, nothing to do")
        return 0

    total_processed = 0

    for batch_num in range(_MAX_BATCHES):
        remaining_before = stats['missing_embeddings']
        if remaining_before == 0:
            break

        logger.info(f"Processing batch {batch_num + 1}/{_MAX_BATCHES} "
                    f"({remaining_before} files remaining)")

        results = semantic.generate_embeddings_batch(
            limit=_BATCH_SIZE,
            progress_callback=_log_progress
        )

        total_processed += results['success']
        logger.info(f"Batch {batch_num + 1} complete: "
                    f"{results['success']} success, "
                    f"{results['errors']} errors, "
                    f"{results['skipped']} skipped")

        # Update stats for next iteration
        stats = semantic.get_embedding_stats()

        # Stall guard: a batch that produced nothing and left the backlog
        # untouched gives no reason to expect the next one to do better, so
        # stop instead of burning the remaining batches on the same work.
        # NOTE(review): assumes the batch call would pick the same files
        # again - confirm against SemanticSearch.generate_embeddings_batch.
        if (results['success'] == 0
                and stats['missing_embeddings'] >= remaining_before):
            logger.error(f"Batch {batch_num + 1} made no progress, "
                         f"stopping early")
            break

    # Final stats
    final_stats = semantic.get_embedding_stats()
    logger.info(f"Embedding generation complete: {total_processed} new embeddings generated")
    logger.info(f"Final coverage: {final_stats['coverage_percent']}% "
                f"({final_stats['total_embeddings']}/{final_stats['total_files']} files)")

    return total_processed


def generate_embeddings(db):
    """Generate embeddings for files that don't have them yet.

    Best-effort wrapper: any exception raised while generating is logged
    and reported as 0 instead of being propagated.

    Args:
        db: Database handle passed straight to ``SemanticSearch``.

    Returns:
        Number of embeddings generated, or 0 if generation failed or there
        was nothing to do.
    """
    try:
        return _run_embedding_batches(db)
    except Exception as e:
        logger.error(f"Embedding generation failed: {e}")
        return 0


def main():
    """Script entry point: open the database and run embedding generation.

    Exits with status 1 when database setup or embedding generation raises,
    so the failure is reflected in the process exit status rather than being
    logged as a completed run with 0 embeddings.
    """
    logger.info("Starting nightly embedding generation")

    try:
        # Initialize database
        db = UnifiedDatabase()

        # Generate embeddings; call the raising helper (not the best-effort
        # wrapper) so a failure reaches the except clause below.
        embeddings_processed = _run_embedding_batches(db)

        logger.info("=== Nightly indexing complete ===")
        logger.info(f" Embeddings generated: {embeddings_processed}")

    except Exception as e:
        logger.error(f"Nightly indexing failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()