mafzaal committed on
Commit
4e87dd5
·
1 Parent(s): d5d262c

feat: Add build vector store script and update pipeline for CI integration

Browse files
.github/workflows/build-vector-store.yml ADDED
File without changes
py-src/pipeline.py CHANGED
@@ -5,11 +5,13 @@ This script updates the blog data vector store when new posts are added.
5
  It can be scheduled to run periodically or manually executed.
6
 
7
  Usage:
8
- python pipeline.py [--force-recreate] [--data-dir DATA_DIR]
9
 
10
  Options:
11
  --force-recreate Force recreation of the vector store even if it exists
12
  --data-dir DIR Directory containing the blog posts (default: data/)
 
 
13
  """
14
 
15
  import os
@@ -17,12 +19,21 @@ import sys
17
  import argparse
18
  from datetime import datetime
19
  import json
 
20
  from pathlib import Path
21
  from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
22
 
23
  # Import the blog utilities module
24
  import lets_talk.utils.blog as blog
25
 
 
 
 
 
 
 
 
 
26
  def parse_args():
27
  """Parse command-line arguments"""
28
  parser = argparse.ArgumentParser(description="Update blog data vector store")
@@ -30,16 +41,34 @@ def parse_args():
30
  help="Force recreation of the vector store")
31
  parser.add_argument("--data-dir", default=DATA_DIR,
32
  help=f"Directory containing blog posts (default: {DATA_DIR})")
 
 
 
 
33
  return parser.parse_args()
34
 
35
- def save_stats(stats, output_dir="./stats"):
36
- """Save stats to a JSON file for tracking changes over time"""
 
 
 
 
 
 
 
 
 
37
  # Create directory if it doesn't exist
38
  Path(output_dir).mkdir(exist_ok=True, parents=True)
39
 
40
- # Create filename with timestamp
41
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
42
- filename = f"{output_dir}/blog_stats_{timestamp}.json"
 
 
 
 
 
43
 
44
  # Save only the basic stats, not the full document list
45
  basic_stats = {
@@ -54,27 +83,34 @@ def save_stats(stats, output_dir="./stats"):
54
  with open(filename, "w") as f:
55
  json.dump(basic_stats, f, indent=2)
56
 
57
- print(f"Saved stats to {filename}")
58
- return filename
 
 
 
 
 
 
 
59
 
60
- def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
 
61
  """
62
  Create or update the vector database with blog documents.
63
 
64
  Args:
65
- documents: List of document objects to store in the vector database
66
- data_dir: Directory containing the blog posts (for reporting)
67
  storage_path: Path where the vector database will be stored
68
  force_recreate: Whether to force recreation of the vector store
 
 
69
 
70
  Returns:
71
- Tuple of (success status, message)
72
  """
73
-
74
-
75
-
76
  try:
77
  # Load and process documents
 
78
  documents = blog.load_blog_posts(data_dir)
79
  documents = blog.update_document_metadata(documents)
80
 
@@ -83,57 +119,98 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
83
  blog.display_document_stats(stats)
84
 
85
  # Save stats for tracking
86
- stats_file = save_stats(stats)
87
 
88
  create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
89
 
90
  if create_vector_store:
91
- print("\nAttempting to save vector store reference file...")
92
  vector_store = blog.create_vector_store(
93
  documents,
94
  storage_path=storage_path,
95
  force_recreate=force_recreate
96
  )
97
  vector_store.client.close()
98
- print("Vector store reference file saved.")
99
- return True, f"Vector store successfully created at {storage_path}",stats, stats_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  else:
101
- return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)",stats, stats_file
 
102
  except Exception as e:
103
- return False, f"Error creating vector store: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  def main():
106
  """Main function to update blog data"""
107
  args = parse_args()
108
 
109
- print("=== Blog Data Update ===")
110
- print(f"Data directory: {args.data_dir}")
111
- print(f"Force recreate: {args.force_recreate}")
112
- print("========================")
 
 
113
 
114
  try:
115
-
116
-
117
  # Create or update vector database
118
- success, message,stats,stats_file = create_vector_database(
119
- args.data_dir,
120
  storage_path=VECTOR_STORAGE_PATH,
121
- force_recreate=args.force_recreate
 
 
122
  )
123
 
124
- print("\n=== Update Summary ===")
125
- print(f"Processed {stats['total_documents']} documents")
126
- print(f"Stats saved to: {stats_file}")
127
- print(f"Vector DB status: {message}")
128
- print("=====================")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  if not success:
131
  return 1
132
  return 0
133
  except Exception as e:
134
- print(f"Error: {e}")
135
- import traceback
136
- traceback.print_exc()
137
  return 1
138
 
139
  if __name__ == "__main__":
 
5
  It can be scheduled to run periodically or manually executed.
6
 
7
  Usage:
8
+ python pipeline.py [--force-recreate] [--data-dir DATA_DIR] [--output-dir OUTPUT_DIR] [--ci]
9
 
10
  Options:
11
  --force-recreate Force recreation of the vector store even if it exists
12
  --data-dir DIR Directory containing the blog posts (default: data/)
13
+ --output-dir DIR Directory to save stats and artifacts (default: ./stats)
14
+ --ci Run in CI mode (no interactive prompts, exit codes for CI)
15
  """
16
 
17
  import os
 
19
  import argparse
20
  from datetime import datetime
21
  import json
22
+ import logging
23
  from pathlib import Path
24
  from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
25
 
26
  # Import the blog utilities module
27
  import lets_talk.utils.blog as blog
28
 
29
+ # Set up logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
33
+ handlers=[logging.StreamHandler()]
34
+ )
35
+ logger = logging.getLogger("blog-pipeline")
36
+
37
  def parse_args():
38
  """Parse command-line arguments"""
39
  parser = argparse.ArgumentParser(description="Update blog data vector store")
 
41
  help="Force recreation of the vector store")
42
  parser.add_argument("--data-dir", default=DATA_DIR,
43
  help=f"Directory containing blog posts (default: {DATA_DIR})")
44
+ parser.add_argument("--output-dir", default="./stats",
45
+ help="Directory to save stats and artifacts (default: ./stats)")
46
+ parser.add_argument("--ci", action="store_true",
47
+ help="Run in CI mode (no interactive prompts, exit codes for CI)")
48
  return parser.parse_args()
49
 
50
+ def save_stats(stats, output_dir="./stats", ci_mode=False):
51
+ """Save stats to a JSON file for tracking changes over time
52
+
53
+ Args:
54
+ stats: Dictionary containing statistics about the blog posts
55
+ output_dir: Directory to save the stats file
56
+ ci_mode: Whether to run in CI mode (use fixed filename)
57
+
58
+ Returns:
59
+ Tuple of (filename, stats_dict)
60
+ """
61
  # Create directory if it doesn't exist
62
  Path(output_dir).mkdir(exist_ok=True, parents=True)
63
 
64
+ # Create filename with timestamp or use fixed name for CI
65
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
66
+ if ci_mode:
67
+ filename = f"{output_dir}/blog_stats_latest.json"
68
+ # Also create a timestamped version for historical tracking
69
+ history_filename = f"{output_dir}/blog_stats_{timestamp}.json"
70
+ else:
71
+ filename = f"{output_dir}/blog_stats_{timestamp}.json"
72
 
73
  # Save only the basic stats, not the full document list
74
  basic_stats = {
 
83
  with open(filename, "w") as f:
84
  json.dump(basic_stats, f, indent=2)
85
 
86
+ # In CI mode, also save a timestamped version
87
+ if ci_mode:
88
+ with open(history_filename, "w") as f:
89
+ json.dump(basic_stats, f, indent=2)
90
+ logger.info(f"Saved stats to {filename} and {history_filename}")
91
+ else:
92
+ logger.info(f"Saved stats to {filename}")
93
+
94
+ return filename, basic_stats
95
 
96
+ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
97
+ force_recreate=False, output_dir="./stats", ci_mode=False):
98
  """
99
  Create or update the vector database with blog documents.
100
 
101
  Args:
102
+ data_dir: Directory containing the blog posts
 
103
  storage_path: Path where the vector database will be stored
104
  force_recreate: Whether to force recreation of the vector store
105
+ output_dir: Directory to save stats and artifacts
106
+ ci_mode: Whether to run in CI mode
107
 
108
  Returns:
109
+ Tuple of (success status, message, stats, stats_file, stats_file_content)
110
  """
 
 
 
111
  try:
112
  # Load and process documents
113
+ logger.info(f"Loading blog posts from {data_dir}")
114
  documents = blog.load_blog_posts(data_dir)
115
  documents = blog.update_document_metadata(documents)
116
 
 
119
  blog.display_document_stats(stats)
120
 
121
  # Save stats for tracking
122
+ stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
123
 
124
  create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
125
 
126
  if create_vector_store:
127
+ logger.info("Creating vector store...")
128
  vector_store = blog.create_vector_store(
129
  documents,
130
  storage_path=storage_path,
131
  force_recreate=force_recreate
132
  )
133
  vector_store.client.close()
134
+ logger.info(f"Vector store successfully created at {storage_path}")
135
+
136
+ # In CI mode, create a metadata file with the build info
137
+ if ci_mode:
138
+ build_info = {
139
+ "build_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
140
+ "document_count": stats["total_documents"],
141
+ "storage_path": str(storage_path),
142
+ "vector_store_size_bytes": get_directory_size(storage_path),
143
+ }
144
+ build_info_path = Path(output_dir) / "vector_store_build_info.json"
145
+ with open(build_info_path, "w") as f:
146
+ json.dump(build_info, f, indent=2)
147
+ logger.info(f"Build info saved to {build_info_path}")
148
+
149
+ return True, f"Vector store successfully created at {storage_path}", stats, stats_file, stats_content
150
  else:
151
+ logger.info(f"Vector store already exists at {storage_path}")
152
+ return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)", stats, stats_file, stats_content
153
  except Exception as e:
154
+ logger.error(f"Error creating vector store: {str(e)}", exc_info=True)
155
+ return False, f"Error creating vector store: {str(e)}", None, None, None
156
+
157
def get_directory_size(path):
    """Get the size of a directory in bytes.

    Walks the tree rooted at ``path`` with ``os.walk`` and adds up
    ``os.path.getsize`` for every file entry that is not a symbolic link.

    Args:
        path: Root directory to measure.

    Returns:
        Total size in bytes as an int. The result is 0 when the walk yields
        no files (for example an empty directory).
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            # Symbolic links are not counted.
            if os.path.islink(filepath):
                continue
            try:
                total_size += os.path.getsize(filepath)
            except OSError:
                # The file vanished or became unreadable between the walk and
                # the stat call. The size is informational only, so skip the
                # entry instead of aborting the whole measurement.
                continue
    return total_size
166
 
167
def main():
    """Main function to update blog data.

    Parses the command-line arguments, logs the run configuration, calls
    ``create_vector_database`` and logs a summary of the outcome. In CI mode
    (``--ci``) a ``ci_summary.json`` file describing the outcome is written
    to the output directory, for both successful and failed runs.

    Returns:
        0 when ``create_vector_database`` reported success, 1 when it
        reported failure or when an exception was raised in this function.
    """
    args = parse_args()

    logger.info("=== Blog Data Update ===")
    logger.info(f"Data directory: {args.data_dir}")
    logger.info(f"Force recreate: {args.force_recreate}")
    logger.info(f"Output directory: {args.output_dir}")
    logger.info(f"CI mode: {args.ci}")
    logger.info("========================")

    try:
        # Create or update vector database. The fifth element of the returned
        # tuple (the saved stats content) is not needed here.
        success, message, stats, stats_file, _stats_content = create_vector_database(
            data_dir=args.data_dir,
            storage_path=VECTOR_STORAGE_PATH,
            force_recreate=args.force_recreate,
            output_dir=args.output_dir,
            ci_mode=args.ci
        )

        logger.info("\n=== Update Summary ===")
        if stats:
            logger.info(f"Processed {stats['total_documents']} documents")
            logger.info(f"Stats saved to: {stats_file}")
        logger.info(f"Vector DB status: {message}")
        logger.info("=====================")

        # In CI mode, create a summary file that GitHub Actions can use to set outputs.
        # It is written on failure as well: create_vector_database returns
        # stats=None when it fails, so requiring stats here would mean the
        # "failure" status is never recorded and a stale summary from an
        # earlier successful run could be left in place.
        if args.ci:
            output_dir = Path(args.output_dir)
            # The directory may not exist yet if the run failed before any
            # stats were saved.
            output_dir.mkdir(exist_ok=True, parents=True)
            ci_summary_path = output_dir / "ci_summary.json"
            ci_summary = {
                "status": "success" if success else "failure",
                "message": message,
                "stats_file": stats_file,
                # None (JSON null) when no stats are available.
                "document_count": stats["total_documents"] if stats else None,
                "vector_store_path": str(VECTOR_STORAGE_PATH)
            }
            with open(ci_summary_path, "w") as f:
                json.dump(ci_summary, f, indent=2)
            logger.info(f"CI summary saved to {ci_summary_path}")

        if not success:
            return 1
        return 0
    except Exception as e:
        logger.error(f"Error: {e}", exc_info=True)
        return 1
215
 
216
  if __name__ == "__main__":
scripts/build-vector-store.sh ADDED
File without changes