#!/usr/bin/env python3
"""
Nightly embedding generation script

Run via systemd timer to index new media files
"""

import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, '/opt/media-downloader')

# Bootstrap database backend (must be before any sqlite3 imports)
import modules.db_bootstrap  # noqa: E402,F401

from modules.universal_logger import get_logger
from modules.unified_database import UnifiedDatabase
from modules.semantic_search import SemanticSearch

logger = get_logger('EmbeddingGenerator')
def generate_embeddings(db):
    """Generate embeddings for files that don't have them yet.

    Works through the backlog in batches so a single nightly run stays
    bounded (at most max_batches * batch_size = 10,000 files per night).

    Args:
        db: UnifiedDatabase instance handed to SemanticSearch.

    Returns:
        int: Number of embeddings successfully generated this run
        (0 when nothing was missing or when generation failed).
    """
    logger.info("=== Embedding Generation ===")

    try:
        semantic = SemanticSearch(db)

        # Get current stats
        stats = semantic.get_embedding_stats()
        logger.info(f"Current stats: {stats['total_embeddings']} embeddings, "
                    f"{stats['missing_embeddings']} missing, "
                    f"{stats['coverage_percent']}% coverage")

        if stats['missing_embeddings'] == 0:
            logger.info("All files already have embeddings, nothing to do")
            return 0

        # Process in batches of 1000 files
        batch_size = 1000
        total_processed = 0
        max_batches = 10  # Process up to 10000 files per night

        for batch_num in range(max_batches):
            # Backlog may drain before we hit the batch cap
            if stats['missing_embeddings'] == 0:
                break

            logger.info(f"Processing batch {batch_num + 1}/{max_batches} "
                        f"({stats['missing_embeddings']} files remaining)")

            def progress_callback(processed, total, current_file):
                # Log every 100th file to keep nightly logs compact
                if processed % 100 == 0:
                    logger.info(f"  Progress: {processed}/{total} - {current_file}")

            results = semantic.generate_embeddings_batch(
                limit=batch_size,
                progress_callback=progress_callback
            )

            total_processed += results['success']
            logger.info(f"Batch {batch_num + 1} complete: "
                        f"{results['success']} success, "
                        f"{results['errors']} errors, "
                        f"{results['skipped']} skipped")

            # Refresh stats for the next iteration (and final reporting)
            stats = semantic.get_embedding_stats()

        # `stats` was refreshed at the end of the last batch, so reuse it
        # here instead of issuing a redundant get_embedding_stats() query.
        logger.info(f"Embedding generation complete: {total_processed} new embeddings generated")
        logger.info(f"Final coverage: {stats['coverage_percent']}% "
                    f"({stats['total_embeddings']}/{stats['total_files']} files)")

        return total_processed

    except Exception as e:
        # logger.exception records the full traceback; return 0 so the
        # caller's summary still prints (main handles its own failures).
        logger.exception(f"Embedding generation failed: {e}")
        return 0


def main():
    """Entry point for the nightly indexing run.

    Initializes the database, generates missing embeddings, and logs a
    summary. Exits with status 1 on any failure so the systemd unit
    records the run as failed.
    """
    logger.info("Starting nightly embedding generation")

    try:
        # Initialize database
        db = UnifiedDatabase()

        # Generate embeddings
        embeddings_processed = generate_embeddings(db)

        # Plain string: no placeholders, so no f-prefix needed (F541)
        logger.info("=== Nightly indexing complete ===")
        logger.info(f"  Embeddings generated: {embeddings_processed}")

    except Exception as e:
        # Full traceback in the log; non-zero exit flags failure to systemd
        logger.exception(f"Nightly indexing failed: {e}")
        sys.exit(1)


# Script entry point: run directly (e.g. by the systemd timer), not on import.
if __name__ == '__main__':
    main()