mafzaal committed on
Commit
d5d262c
·
1 Parent(s): f5df877

feat: Update blog data processing to create vector database and enhance stats tracking

Browse files
py-src/app.py CHANGED
@@ -10,7 +10,7 @@ load_dotenv()
10
  import pipeline
11
  #build vector store
12
  print("=== Blog Data Update ===")
13
- pipeline.main()
14
  print("========================")
15
 
16
  import chainlit as cl
 
10
  import pipeline
11
  #build vector store
12
  print("=== Blog Data Update ===")
13
+ pipeline.create_vector_database(force_recreate=True)
14
  print("========================")
15
 
16
  import chainlit as cl
py-src/pipeline.py CHANGED
@@ -18,7 +18,7 @@ import argparse
18
  from datetime import datetime
19
  import json
20
  from pathlib import Path
21
- from lets_talk.config import VECTOR_STORAGE_PATH
22
 
23
  # Import the blog utilities module
24
  import lets_talk.utils.blog as blog
@@ -28,8 +28,8 @@ def parse_args():
28
  parser = argparse.ArgumentParser(description="Update blog data vector store")
29
  parser.add_argument("--force-recreate", action="store_true",
30
  help="Force recreation of the vector store")
31
- parser.add_argument("--data-dir", default=blog.DATA_DIR,
32
- help=f"Directory containing blog posts (default: {blog.DATA_DIR})")
33
  return parser.parse_args()
34
 
35
  def save_stats(stats, output_dir="./stats"):
@@ -57,7 +57,7 @@ def save_stats(stats, output_dir="./stats"):
57
  print(f"Saved stats to {filename}")
58
  return filename
59
 
60
- def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
61
  """
62
  Create or update the vector database with blog documents.
63
 
@@ -70,7 +70,21 @@ def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH
70
  Returns:
71
  Tuple of (success status, message)
72
  """
 
 
 
73
  try:
 
 
 
 
 
 
 
 
 
 
 
74
  create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
75
 
76
  if create_vector_store:
@@ -82,9 +96,9 @@ def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH
82
  )
83
  vector_store.client.close()
84
  print("Vector store reference file saved.")
85
- return True, f"Vector store successfully created at {storage_path}"
86
  else:
87
- return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)"
88
  except Exception as e:
89
  return False, f"Error creating vector store: {str(e)}"
90
 
@@ -98,20 +112,10 @@ def main():
98
  print("========================")
99
 
100
  try:
101
- # Load and process documents
102
- documents = blog.load_blog_posts(args.data_dir)
103
- documents = blog.update_document_metadata(documents)
104
-
105
- # Get stats
106
- stats = blog.get_document_stats(documents)
107
- blog.display_document_stats(stats)
108
-
109
- # Save stats for tracking
110
- stats_file = save_stats(stats)
111
 
112
  # Create or update vector database
113
- success, message = create_vector_database(
114
- documents,
115
  args.data_dir,
116
  storage_path=VECTOR_STORAGE_PATH,
117
  force_recreate=args.force_recreate
 
18
  from datetime import datetime
19
  import json
20
  from pathlib import Path
21
+ from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
22
 
23
  # Import the blog utilities module
24
  import lets_talk.utils.blog as blog
 
28
  parser = argparse.ArgumentParser(description="Update blog data vector store")
29
  parser.add_argument("--force-recreate", action="store_true",
30
  help="Force recreation of the vector store")
31
+ parser.add_argument("--data-dir", default=DATA_DIR,
32
+ help=f"Directory containing blog posts (default: {DATA_DIR})")
33
  return parser.parse_args()
34
 
35
  def save_stats(stats, output_dir="./stats"):
 
57
  print(f"Saved stats to {filename}")
58
  return filename
59
 
60
+ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
61
  """
62
  Create or update the vector database with blog documents.
63
 
 
70
  Returns:
71
  Tuple of (success status, message, stats, stats_file) on success; (False, error message) on failure
72
  """
73
+
74
+
75
+
76
  try:
77
+ # Load and process documents
78
+ documents = blog.load_blog_posts(data_dir)
79
+ documents = blog.update_document_metadata(documents)
80
+
81
+ # Get stats
82
+ stats = blog.get_document_stats(documents)
83
+ blog.display_document_stats(stats)
84
+
85
+ # Save stats for tracking
86
+ stats_file = save_stats(stats)
87
+
88
  create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
89
 
90
  if create_vector_store:
 
96
  )
97
  vector_store.client.close()
98
  print("Vector store reference file saved.")
99
+ return True, f"Vector store successfully created at {storage_path}",stats, stats_file
100
  else:
101
+ return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)",stats, stats_file
102
  except Exception as e:
103
  return False, f"Error creating vector store: {str(e)}"
104
 
 
112
  print("========================")
113
 
114
  try:
115
+
 
 
 
 
 
 
 
 
 
116
 
117
  # Create or update vector database
118
+ success, message,stats,stats_file = create_vector_database(
 
119
  args.data_dir,
120
  storage_path=VECTOR_STORAGE_PATH,
121
  force_recreate=args.force_recreate
pyproject.toml CHANGED
@@ -18,6 +18,7 @@ dependencies = [
18
  "langchain-qdrant>=0.2.0",
19
  "langchain-text-splitters>=0.3.8",
20
  "langgraph>=0.4.3",
 
21
  "pandas>=2.2.3",
22
  "python-dotenv>=1.1.0",
23
  "qdrant-client>=1.14.2",
 
18
  "langchain-qdrant>=0.2.0",
19
  "langchain-text-splitters>=0.3.8",
20
  "langgraph>=0.4.3",
21
+ "libmagic>=1.0",
22
  "pandas>=2.2.3",
23
  "python-dotenv>=1.1.0",
24
  "qdrant-client>=1.14.2",
stats/blog_stats_20250511_100823.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "20250511_100823",
3
+ "total_documents": 14,
4
+ "total_characters": 106275,
5
+ "min_length": 1900,
6
+ "max_length": 13468,
7
+ "avg_length": 7591.071428571428
8
+ }
uv.lock CHANGED
@@ -1143,6 +1143,7 @@ dependencies = [
1143
  { name = "langchain-qdrant" },
1144
  { name = "langchain-text-splitters" },
1145
  { name = "langgraph" },
 
1146
  { name = "pandas" },
1147
  { name = "python-dotenv" },
1148
  { name = "qdrant-client" },
@@ -1165,6 +1166,7 @@ requires-dist = [
1165
  { name = "langchain-qdrant", specifier = ">=0.2.0" },
1166
  { name = "langchain-text-splitters", specifier = ">=0.3.8" },
1167
  { name = "langgraph", specifier = ">=0.4.3" },
 
1168
  { name = "pandas", specifier = ">=2.2.3" },
1169
  { name = "python-dotenv", specifier = ">=1.1.0" },
1170
  { name = "qdrant-client", specifier = ">=1.14.2" },
@@ -1172,6 +1174,12 @@ requires-dist = [
1172
  { name = "websockets", specifier = ">=15.0.1" },
1173
  ]
1174
 
 
 
 
 
 
 
1175
  [[package]]
1176
  name = "literalai"
1177
  version = "0.1.201"
 
1143
  { name = "langchain-qdrant" },
1144
  { name = "langchain-text-splitters" },
1145
  { name = "langgraph" },
1146
+ { name = "libmagic" },
1147
  { name = "pandas" },
1148
  { name = "python-dotenv" },
1149
  { name = "qdrant-client" },
 
1166
  { name = "langchain-qdrant", specifier = ">=0.2.0" },
1167
  { name = "langchain-text-splitters", specifier = ">=0.3.8" },
1168
  { name = "langgraph", specifier = ">=0.4.3" },
1169
+ { name = "libmagic", specifier = ">=1.0" },
1170
  { name = "pandas", specifier = ">=2.2.3" },
1171
  { name = "python-dotenv", specifier = ">=1.1.0" },
1172
  { name = "qdrant-client", specifier = ">=1.14.2" },
 
1174
  { name = "websockets", specifier = ">=15.0.1" },
1175
  ]
1176
 
1177
+ [[package]]
1178
+ name = "libmagic"
1179
+ version = "1.0"
1180
+ source = { registry = "https://pypi.org/simple" }
1181
+ sdist = { url = "https://files.pythonhosted.org/packages/83/86/419ddfc3879b4565a60e0c75b6d19baec48428cbc2f15aca5320b3d136f6/libmagic-1.0.tar.gz", hash = "sha256:649f1ce7fb7c92796badbb812555e4a926351da4f5cdf82e810b5cd371aedf8d", size = 3665 }
1182
+
1183
  [[package]]
1184
  name = "literalai"
1185
  version = "0.1.201"