103
scripts/generate-embeddings.py
Executable file
103
scripts/generate-embeddings.py
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Nightly embedding generation script
|
||||
Run via systemd timer to index new media files
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, '/opt/media-downloader')
|
||||
|
||||
# Bootstrap database backend (must be before any sqlite3 imports)
|
||||
import modules.db_bootstrap # noqa: E402,F401
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
from modules.unified_database import UnifiedDatabase
|
||||
from modules.semantic_search import SemanticSearch
|
||||
|
||||
logger = get_logger('EmbeddingGenerator')
|
||||
|
||||
|
||||
def generate_embeddings(db):
    """Generate embeddings for media files that don't have them yet.

    Processes missing embeddings in batches so a single nightly run stays
    bounded (at most max_batches * batch_size files per invocation).

    Args:
        db: UnifiedDatabase instance handed to SemanticSearch.

    Returns:
        int: Number of embeddings successfully generated this run
             (0 when nothing was missing or on failure).
    """
    logger.info("=== Embedding Generation ===")

    try:
        semantic = SemanticSearch(db)

        # Get current stats
        stats = semantic.get_embedding_stats()
        logger.info(f"Current stats: {stats['total_embeddings']} embeddings, "
                    f"{stats['missing_embeddings']} missing, "
                    f"{stats['coverage_percent']}% coverage")

        if stats['missing_embeddings'] == 0:
            logger.info("All files already have embeddings, nothing to do")
            return 0

        # Process in batches of 1000 files
        batch_size = 1000
        total_processed = 0
        max_batches = 10  # Process up to 10000 files per night

        # Loop-invariant: define the progress callback once, not per batch.
        def progress_callback(processed, total, current_file):
            # Log every 100th file to keep log volume manageable.
            if processed % 100 == 0:
                logger.info(f"  Progress: {processed}/{total} - {current_file}")

        for batch_num in range(max_batches):
            if stats['missing_embeddings'] == 0:
                break

            logger.info(f"Processing batch {batch_num + 1}/{max_batches} "
                        f"({stats['missing_embeddings']} files remaining)")

            results = semantic.generate_embeddings_batch(
                limit=batch_size,
                progress_callback=progress_callback
            )

            total_processed += results['success']
            logger.info(f"Batch {batch_num + 1} complete: "
                        f"{results['success']} success, "
                        f"{results['errors']} errors, "
                        f"{results['skipped']} skipped")

            # Update stats for next iteration (also serves as the final stats).
            stats = semantic.get_embedding_stats()

        # 'stats' is already fresh from the last iteration (or the pre-loop
        # query if the loop never ran) — no need for a redundant third call.
        logger.info(f"Embedding generation complete: {total_processed} new embeddings generated")
        logger.info(f"Final coverage: {stats['coverage_percent']}% "
                    f"({stats['total_embeddings']}/{stats['total_files']} files)")

        return total_processed

    except Exception as e:
        # Boundary for the nightly job: record the full traceback and report
        # zero work instead of crashing the timer unit.
        logger.exception(f"Embedding generation failed: {e}")
        return 0
|
||||
|
||||
|
||||
def main():
    """Entry point for the nightly embedding generation job.

    Initializes the database, runs embedding generation, and exits non-zero
    on failure so the systemd timer/unit records the error.
    """
    logger.info("Starting nightly embedding generation")

    try:
        # Initialize database
        db = UnifiedDatabase()

        # Generate embeddings
        embeddings_processed = generate_embeddings(db)

        # Plain string: no placeholders, so no f-prefix needed.
        logger.info("=== Nightly indexing complete ===")
        logger.info(f"  Embeddings generated: {embeddings_processed}")

    except Exception as e:
        # logger.exception keeps the traceback in the journal for debugging.
        logger.exception(f"Nightly indexing failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user