File size: 4,722 Bytes
9681c5d
 
 
 
 
 
 
f5df877
9681c5d
 
 
 
 
 
 
 
 
 
 
 
d5d262c
9681c5d
 
4dc95d7
9681c5d
 
 
 
 
 
d5d262c
 
9681c5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5d262c
f5df877
 
 
 
 
 
 
 
 
 
 
 
d5d262c
 
 
f5df877
d5d262c
 
 
 
 
 
 
 
 
 
 
f5df877
 
 
 
 
 
 
 
 
 
 
d5d262c
f5df877
d5d262c
f5df877
 
 
9681c5d
 
 
 
 
 
 
 
 
 
d5d262c
4dc95d7
f5df877
d5d262c
f5df877
 
 
 
9681c5d
 
 
 
f5df877
9681c5d
 
f5df877
 
9681c5d
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Blog Data Update Script

This script updates the blog data vector store when new posts are added.
It can be scheduled to run periodically or manually executed.

Usage:
    python pipeline.py [--force-recreate] [--data-dir DATA_DIR]

Options:
    --force-recreate   Force recreation of the vector store even if it exists
    --data-dir DIR     Directory containing the blog posts (default: data/)
"""

import os
import sys
import argparse
from datetime import datetime
import json
from pathlib import Path
from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR

# Import the blog utilities module
import lets_talk.utils.blog as blog

def parse_args():
    """Build the command-line parser for this script and parse the arguments.

    Returns:
        The ``argparse.Namespace`` holding ``force_recreate`` and ``data_dir``.
    """
    cli = argparse.ArgumentParser(description="Update blog data vector store")

    # Boolean switch: True when the flag is given, False otherwise.
    cli.add_argument(
        "--force-recreate",
        action="store_true",
        help="Force recreation of the vector store",
    )

    # Location of the blog posts; falls back to the configured DATA_DIR.
    cli.add_argument(
        "--data-dir",
        default=DATA_DIR,
        help=f"Directory containing blog posts (default: {DATA_DIR})",
    )

    namespace = cli.parse_args()
    return namespace

def save_stats(stats, output_dir="./stats"):
    """Write a timestamped JSON snapshot of the summary statistics.

    Only the aggregate figures are persisted; any other entries in ``stats``
    (such as a full document list) are left out of the file.

    Args:
        stats: Mapping providing ``total_documents``, ``total_characters``,
            ``min_length``, ``max_length`` and ``avg_length``.
        output_dir: Directory to write into; created (with parents) if missing.

    Returns:
        The path of the JSON file that was written, as a string.
    """
    # Make sure the destination directory exists before anything is written.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # The same local, second-resolution stamp goes into the file name and
    # into the payload itself.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{output_dir}/blog_stats_{stamp}.json"

    # Copy just the aggregate fields, in a fixed order, after the timestamp.
    summary_keys = (
        "total_documents",
        "total_characters",
        "min_length",
        "max_length",
        "avg_length",
    )
    snapshot = {"timestamp": stamp}
    for key in summary_keys:
        snapshot[key] = stats[key]

    with open(filename, "w") as out:
        out.write(json.dumps(snapshot, indent=2))

    print(f"Saved stats to {filename}")
    return filename

def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
    """
    Load the blog posts, record their stats, and build the vector store if needed.

    The store is (re)built when nothing exists at ``storage_path`` or when
    ``force_recreate`` is set; otherwise the existing store is left untouched.
    Stats are computed, displayed and saved in either case.

    Args:
        data_dir: Directory containing the blog posts to load
        storage_path: Path where the vector database will be stored
        force_recreate: Whether to force recreation of the vector store

    Returns:
        Tuple of (success status, message, stats, stats file path). The tuple
        always has four elements. On failure the status is False, the message
        describes the exception, and stats / stats file path hold whatever had
        been computed before the failure (None if it happened earlier).
    """
    # Pre-bind so the error path can always return the same 4-tuple shape
    # that the success paths return.
    stats = None
    stats_file = None

    try:
        # Load and process documents
        documents = blog.load_blog_posts(data_dir)
        documents = blog.update_document_metadata(documents)

        # Get stats
        stats = blog.get_document_stats(documents)
        blog.display_document_stats(stats)

        # Save stats for tracking
        stats_file = save_stats(stats)

        store_missing = not Path(storage_path).exists()
        if not (store_missing or force_recreate):
            return (
                True,
                f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)",
                stats,
                stats_file,
            )

        print("\nAttempting to save vector store reference file...")
        vector_store = blog.create_vector_store(
            documents,
            storage_path=storage_path,
            force_recreate=force_recreate
        )
        # Close the store's client once the store has been written.
        vector_store.client.close()
        print("Vector store reference file saved.")
        return True, f"Vector store successfully created at {storage_path}", stats, stats_file
    except Exception as e:
        # Callers unpack four values; a shorter tuple here would raise a
        # ValueError at the call site and hide the real error message.
        return False, f"Error creating vector store: {str(e)}", stats, stats_file

def main():
    """Run the blog data update from the command line.

    Parses the CLI arguments, runs ``create_vector_database`` and prints a
    summary of the outcome.

    Returns:
        Process exit code: 0 when the update succeeded, 1 when it failed or
        an unexpected exception was raised.
    """
    args = parse_args()

    print("=== Blog Data Update ===")
    print(f"Data directory: {args.data_dir}")
    print(f"Force recreate: {args.force_recreate}")
    print("========================")

    try:
        # Create or update vector database
        result = create_vector_database(
            args.data_dir,
            storage_path=VECTOR_STORAGE_PATH,
            force_recreate=args.force_recreate
        )

        # A failed run may report only (success, message); accept both that
        # short form and the full (success, message, stats, stats_file) form
        # so the real error message is never lost to an unpacking error.
        success, message, *extras = result
        stats, stats_file = extras if len(extras) == 2 else (None, None)

        print("\n=== Update Summary ===")
        # Stats may be unavailable when the run failed early.
        if stats is not None:
            print(f"Processed {stats['total_documents']} documents")
        if stats_file is not None:
            print(f"Stats saved to: {stats_file}")
        print(f"Vector DB status: {message}")
        print("=====================")

        return 0 if success else 1
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return 1

# Run main() only when executed as a script, and use its return value
# as the process exit status.
if __name__ == "__main__":
    sys.exit(main())