"""
Blog Data Update Script

Refreshes the blog data vector store when new posts are added. The script
can be run by hand or scheduled to run periodically. On every run it also
writes a small JSON snapshot of the document statistics so changes can be
tracked over time.

Usage:
    python update_blog_data.py [--force-recreate] [--data-dir DATA_DIR]

Options:
    --force-recreate    Force recreation of the vector store even if it exists
    --data-dir DIR      Directory containing the blog posts
                        (default: the value of blog.DATA_DIR)
"""

import os
import sys
import argparse
from datetime import datetime
import json
from pathlib import Path

from lets_talk.config import VECTOR_STORAGE_PATH

# Import the blog utilities module
import lets_talk.utils.blog as blog


def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace`` carrying ``force_recreate``
        (a flag, false unless given) and ``data_dir`` (falls back to
        ``blog.DATA_DIR`` when not supplied).
    """
    default_dir = blog.DATA_DIR

    cli = argparse.ArgumentParser(description="Update blog data vector store")
    cli.add_argument(
        "--force-recreate",
        action="store_true",
        help="Force recreation of the vector store",
    )
    cli.add_argument(
        "--data-dir",
        default=default_dir,
        help=f"Directory containing blog posts (default: {default_dir})",
    )
    return cli.parse_args()


def save_stats(stats, output_dir="./stats"):
    """Write a timestamped JSON snapshot of the summary statistics.

    Only the scalar summary fields are persisted; anything else present in
    ``stats`` (such as a per-document listing) is left out of the file.

    Args:
        stats: Mapping that must contain ``total_documents``,
            ``total_characters``, ``min_length``, ``max_length`` and
            ``avg_length``.
        output_dir: Directory that receives the snapshot. It is created,
            together with any missing parents, when absent.

    Returns:
        The path of the written file, as a string.

    Raises:
        KeyError: If one of the required fields is missing from ``stats``.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Second-resolution timestamp doubles as the unique part of the file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{output_dir}/blog_stats_{timestamp}.json"

    summary_fields = (
        "total_documents",
        "total_characters",
        "min_length",
        "max_length",
        "avg_length",
    )
    basic_stats = {"timestamp": timestamp}
    for field in summary_fields:
        basic_stats[field] = stats[field]

    with open(filename, "w") as f:
        json.dump(basic_stats, f, indent=2)

    print(f"Saved stats to {filename}")
    return filename


def main():
    """Run one update pass and report the outcome as a process exit code.

    Loads the blog posts, prints and saves their statistics, and builds the
    vector store when it is missing on disk or ``--force-recreate`` was given.

    Returns:
        ``0`` on success, ``1`` if any exception was raised during the update
        (the error and its traceback are printed).
    """
    args = parse_args()

    banner = (
        "=== Blog Data Update ===",
        f"Data directory: {args.data_dir}",
        f"Force recreate: {args.force_recreate}",
        "========================",
    )
    for line in banner:
        print(line)

    try:
        # Load the posts and enrich their metadata before anything else.
        posts = blog.load_blog_posts(args.data_dir)
        posts = blog.update_document_metadata(posts)

        # Report the statistics and keep a snapshot for later comparison.
        stats = blog.get_document_stats(posts)
        blog.display_document_stats(stats)
        stats_file = save_stats(stats)

        # Build the store when nothing exists at the storage path yet, or
        # when the caller explicitly asked for a rebuild.
        store_missing = not Path(VECTOR_STORAGE_PATH).exists()
        rebuild_store = store_missing or args.force_recreate

        if rebuild_store:
            print("\nAttempting to save vector store reference file...")
            store = blog.create_vector_store(
                posts,
                storage_path=VECTOR_STORAGE_PATH,
                force_recreate=rebuild_store,
            )
            # Close the underlying client once the store has been created.
            store.client.close()
            print("Vector store reference file saved.")

        print("\n=== Update Summary ===")
        print(f"Processed {stats['total_documents']} documents")
        print(f"Stats saved to: {stats_file}")
        print("=====================")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())