"""
Blog Data Update Script
This script updates the blog data vector store when new posts are added.
It can be scheduled to run periodically or manually executed.
Usage:
python update_blog_data.py [--force-recreate] [--data-dir DATA_DIR]
Options:
--force-recreate Force recreation of the vector store even if it exists
--data-dir DIR Directory containing the blog posts (default: data/)
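
Example (a hypothetical cron entry running the update hourly; the project
path is a placeholder for your own checkout):

    0 * * * * cd /path/to/project && python update_blog_data.py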
"""
import argparse
import json
import sys
import traceback
from datetime import datetime
from pathlib import Path

from lets_talk.config import VECTOR_STORAGE_PATH
# Import the blog utilities module
import lets_talk.utils.blog as blog


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Update blog data vector store")
    parser.add_argument("--force-recreate", action="store_true",
                        help="Force recreation of the vector store")
    parser.add_argument("--data-dir", default=blog.DATA_DIR,
                        help=f"Directory containing blog posts (default: {blog.DATA_DIR})")
    return parser.parse_args()


def save_stats(stats, output_dir="./stats"):
    """Save stats to a JSON file for tracking changes over time."""
    # Create the output directory if it doesn't exist
    Path(output_dir).mkdir(exist_ok=True, parents=True)

    # Build a timestamped filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{output_dir}/blog_stats_{timestamp}.json"

    # Save only the basic stats, not the full document list
    basic_stats = {
        "timestamp": timestamp,
        "total_documents": stats["total_documents"],
        "total_characters": stats["total_characters"],
        "min_length": stats["min_length"],
        "max_length": stats["max_length"],
        "avg_length": stats["avg_length"],
    }

    with open(filename, "w") as f:
        json.dump(basic_stats, f, indent=2)

    print(f"Saved stats to {filename}")
    return filename
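
# For reference, a stats file written by save_stats() looks roughly like this
# (illustrative values, not real output):
#
# {
#   "timestamp": "20250101_120000",
#   "total_documents": 42,
#   "total_characters": 123456,
#   "min_length": 310,
#   "max_length": 9876,
#   "avg_length": 2939.4
# }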


def main():
    """Main function to update blog data."""
    args = parse_args()

    print("=== Blog Data Update ===")
    print(f"Data directory: {args.data_dir}")
    print(f"Force recreate: {args.force_recreate}")
    print("========================")

    try:
        # Load and process documents (no embeddings are created at this stage)
        documents = blog.load_blog_posts(args.data_dir)
        documents = blog.update_document_metadata(documents)

        # Compute and display document stats
        stats = blog.get_document_stats(documents)
        blog.display_document_stats(stats)

        # Save stats for tracking changes over time
        stats_file = save_stats(stats)

        # (Re)create the vector store if it is missing or --force-recreate was given
        should_create = not Path(VECTOR_STORAGE_PATH).exists() or args.force_recreate

        if should_create:
            print("\nCreating vector store...")
            vector_store = blog.create_vector_store(
                documents,
                storage_path=VECTOR_STORAGE_PATH,
                force_recreate=args.force_recreate,
            )
            vector_store.client.close()
            print("Vector store created.")

        print("\n=== Update Summary ===")
        print(f"Processed {stats['total_documents']} documents")
        print(f"Stats saved to: {stats_file}")
        print("=====================")
        return 0
    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())