Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

103
scripts/generate-embeddings.py Executable file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Nightly embedding generation script
Run via systemd timer to index new media files
"""
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, '/opt/media-downloader')
# Bootstrap database backend (must be before any sqlite3 imports)
import modules.db_bootstrap # noqa: E402,F401
from modules.universal_logger import get_logger
from modules.unified_database import UnifiedDatabase
from modules.semantic_search import SemanticSearch
# Module-level logger shared by generate_embeddings() and main().
logger = get_logger('EmbeddingGenerator')
def generate_embeddings(db, batch_size=1000, max_batches=10):
    """Generate embeddings for files that don't have them yet.

    Runs ``SemanticSearch.generate_embeddings_batch`` repeatedly, re-reading
    the embedding stats after every batch, until no embeddings are missing,
    a batch handles no files at all, or ``max_batches`` batches have run.

    Args:
        db: Database handle passed straight to ``SemanticSearch``.
        batch_size: Value passed as ``limit`` to each batch call.
        max_batches: Upper bound on the number of batches per run (the
            defaults cap a run at 10000 files).

    Returns:
        The number of embeddings successfully generated during this run.
        Exceptions are logged rather than raised; the count accumulated
        before the failure is still returned.
    """
    logger.info("=== Embedding Generation ===")

    # Initialised before the try block so the except path can still report
    # embeddings produced by batches that completed before a failure.
    total_processed = 0

    def progress_callback(processed, total, current_file):
        # Log only every 100th file to keep the output manageable.
        if processed % 100 == 0:
            logger.info(f" Progress: {processed}/{total} - {current_file}")

    try:
        semantic = SemanticSearch(db)

        # Get current stats
        stats = semantic.get_embedding_stats()
        logger.info(f"Current stats: {stats['total_embeddings']} embeddings, "
                    f"{stats['missing_embeddings']} missing, "
                    f"{stats['coverage_percent']}% coverage")

        if stats['missing_embeddings'] == 0:
            logger.info("All files already have embeddings, nothing to do")
            return 0

        for batch_num in range(max_batches):
            if stats['missing_embeddings'] == 0:
                break

            logger.info(f"Processing batch {batch_num + 1}/{max_batches} "
                        f"({stats['missing_embeddings']} files remaining)")

            results = semantic.generate_embeddings_batch(
                limit=batch_size,
                progress_callback=progress_callback,
            )
            total_processed += results['success']

            logger.info(f"Batch {batch_num + 1} complete: "
                        f"{results['success']} success, "
                        f"{results['errors']} errors, "
                        f"{results['skipped']} skipped")

            # Refresh stats for the next iteration and the final report
            stats = semantic.get_embedding_stats()

            # A batch that reports no successes, errors or skips handled
            # nothing; repeating it would only burn the remaining batches.
            if not (results['success'] or results['errors'] or results['skipped']):
                logger.info("Batch handled no files, stopping early")
                break

        # ``stats`` was refreshed after the last batch, so it already holds
        # the final numbers.
        logger.info(f"Embedding generation complete: {total_processed} new embeddings generated")
        logger.info(f"Final coverage: {stats['coverage_percent']}% "
                    f"({stats['total_embeddings']}/{stats['total_files']} files)")
    except Exception as e:
        logger.error(f"Embedding generation failed: {e}")

    return total_processed
def main():
    """Entry point for the nightly embedding run.

    Creates a ``UnifiedDatabase``, passes it to ``generate_embeddings`` and
    logs how many embeddings were generated. If either step raises, the
    error is logged and the process exits with status 1.
    """
    logger.info("Starting nightly embedding generation")

    try:
        # Initialize database
        db = UnifiedDatabase()

        # Generate embeddings
        embeddings_processed = generate_embeddings(db)
    except Exception as e:
        logger.error(f"Nightly indexing failed: {e}")
        sys.exit(1)
    else:
        # Success path kept out of the try body so only the real work is
        # covered by the broad exception handler.
        logger.info("=== Nightly indexing complete ===")
        logger.info(f" Embeddings generated: {embeddings_processed}")
# Run the nightly job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()