feat: Update blog data processing to create vector database and enhance stats tracking
Browse files
- py-src/app.py +1 -1
- py-src/pipeline.py +22 -18
- pyproject.toml +1 -0
- stats/blog_stats_20250511_100823.json +8 -0
- uv.lock +8 -0
py-src/app.py
CHANGED
@@ -10,7 +10,7 @@ load_dotenv()
|
|
10 |
import pipeline
|
11 |
#build vector store
|
12 |
print("=== Blog Data Update ===")
|
13 |
-
pipeline.
|
14 |
print("========================")
|
15 |
|
16 |
import chainlit as cl
|
|
|
10 |
import pipeline
|
11 |
#build vector store
|
12 |
print("=== Blog Data Update ===")
|
13 |
+
pipeline.create_vector_database(force_recreate=True)
|
14 |
print("========================")
|
15 |
|
16 |
import chainlit as cl
|
py-src/pipeline.py
CHANGED
@@ -18,7 +18,7 @@ import argparse
|
|
18 |
from datetime import datetime
|
19 |
import json
|
20 |
from pathlib import Path
|
21 |
-
from lets_talk.config import VECTOR_STORAGE_PATH
|
22 |
|
23 |
# Import the blog utilities module
|
24 |
import lets_talk.utils.blog as blog
|
@@ -28,8 +28,8 @@ def parse_args():
|
|
28 |
parser = argparse.ArgumentParser(description="Update blog data vector store")
|
29 |
parser.add_argument("--force-recreate", action="store_true",
|
30 |
help="Force recreation of the vector store")
|
31 |
-
parser.add_argument("--data-dir", default=
|
32 |
-
help=f"Directory containing blog posts (default: {
|
33 |
return parser.parse_args()
|
34 |
|
35 |
def save_stats(stats, output_dir="./stats"):
|
@@ -57,7 +57,7 @@ def save_stats(stats, output_dir="./stats"):
|
|
57 |
print(f"Saved stats to {filename}")
|
58 |
return filename
|
59 |
|
60 |
-
def create_vector_database(
|
61 |
"""
|
62 |
Create or update the vector database with blog documents.
|
63 |
|
@@ -70,7 +70,21 @@ def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH
|
|
70 |
Returns:
|
71 |
Tuple of (success status, message)
|
72 |
"""
|
|
|
|
|
|
|
73 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
|
75 |
|
76 |
if create_vector_store:
|
@@ -82,9 +96,9 @@ def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH
|
|
82 |
)
|
83 |
vector_store.client.close()
|
84 |
print("Vector store reference file saved.")
|
85 |
-
return True, f"Vector store successfully created at {storage_path}"
|
86 |
else:
|
87 |
-
return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)"
|
88 |
except Exception as e:
|
89 |
return False, f"Error creating vector store: {str(e)}"
|
90 |
|
@@ -98,20 +112,10 @@ def main():
|
|
98 |
print("========================")
|
99 |
|
100 |
try:
|
101 |
-
|
102 |
-
documents = blog.load_blog_posts(args.data_dir)
|
103 |
-
documents = blog.update_document_metadata(documents)
|
104 |
-
|
105 |
-
# Get stats
|
106 |
-
stats = blog.get_document_stats(documents)
|
107 |
-
blog.display_document_stats(stats)
|
108 |
-
|
109 |
-
# Save stats for tracking
|
110 |
-
stats_file = save_stats(stats)
|
111 |
|
112 |
# Create or update vector database
|
113 |
-
success, message = create_vector_database(
|
114 |
-
documents,
|
115 |
args.data_dir,
|
116 |
storage_path=VECTOR_STORAGE_PATH,
|
117 |
force_recreate=args.force_recreate
|
|
|
18 |
from datetime import datetime
|
19 |
import json
|
20 |
from pathlib import Path
|
21 |
+
from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
|
22 |
|
23 |
# Import the blog utilities module
|
24 |
import lets_talk.utils.blog as blog
|
|
|
28 |
parser = argparse.ArgumentParser(description="Update blog data vector store")
|
29 |
parser.add_argument("--force-recreate", action="store_true",
|
30 |
help="Force recreation of the vector store")
|
31 |
+
parser.add_argument("--data-dir", default=DATA_DIR,
|
32 |
+
help=f"Directory containing blog posts (default: {DATA_DIR})")
|
33 |
return parser.parse_args()
|
34 |
|
35 |
def save_stats(stats, output_dir="./stats"):
|
|
|
57 |
print(f"Saved stats to {filename}")
|
58 |
return filename
|
59 |
|
60 |
+
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
|
61 |
"""
|
62 |
Create or update the vector database with blog documents.
|
63 |
|
|
|
70 |
Returns:
|
71 |
Tuple of (success status, message)
|
72 |
"""
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
try:
|
77 |
+
# Load and process documents
|
78 |
+
documents = blog.load_blog_posts(data_dir)
|
79 |
+
documents = blog.update_document_metadata(documents)
|
80 |
+
|
81 |
+
# Get stats
|
82 |
+
stats = blog.get_document_stats(documents)
|
83 |
+
blog.display_document_stats(stats)
|
84 |
+
|
85 |
+
# Save stats for tracking
|
86 |
+
stats_file = save_stats(stats)
|
87 |
+
|
88 |
create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
|
89 |
|
90 |
if create_vector_store:
|
|
|
96 |
)
|
97 |
vector_store.client.close()
|
98 |
print("Vector store reference file saved.")
|
99 |
+
return True, f"Vector store successfully created at {storage_path}",stats, stats_file
|
100 |
else:
|
101 |
+
return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)",stats, stats_file
|
102 |
except Exception as e:
|
103 |
return False, f"Error creating vector store: {str(e)}"
|
104 |
|
|
|
112 |
print("========================")
|
113 |
|
114 |
try:
|
115 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
# Create or update vector database
|
118 |
+
success, message,stats,stats_file = create_vector_database(
|
|
|
119 |
args.data_dir,
|
120 |
storage_path=VECTOR_STORAGE_PATH,
|
121 |
force_recreate=args.force_recreate
|
pyproject.toml
CHANGED
@@ -18,6 +18,7 @@ dependencies = [
|
|
18 |
"langchain-qdrant>=0.2.0",
|
19 |
"langchain-text-splitters>=0.3.8",
|
20 |
"langgraph>=0.4.3",
|
|
|
21 |
"pandas>=2.2.3",
|
22 |
"python-dotenv>=1.1.0",
|
23 |
"qdrant-client>=1.14.2",
|
|
|
18 |
"langchain-qdrant>=0.2.0",
|
19 |
"langchain-text-splitters>=0.3.8",
|
20 |
"langgraph>=0.4.3",
|
21 |
+
"libmagic>=1.0",
|
22 |
"pandas>=2.2.3",
|
23 |
"python-dotenv>=1.1.0",
|
24 |
"qdrant-client>=1.14.2",
|
stats/blog_stats_20250511_100823.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"timestamp": "20250511_100823",
|
3 |
+
"total_documents": 14,
|
4 |
+
"total_characters": 106275,
|
5 |
+
"min_length": 1900,
|
6 |
+
"max_length": 13468,
|
7 |
+
"avg_length": 7591.071428571428
|
8 |
+
}
|
uv.lock
CHANGED
@@ -1143,6 +1143,7 @@ dependencies = [
|
|
1143 |
{ name = "langchain-qdrant" },
|
1144 |
{ name = "langchain-text-splitters" },
|
1145 |
{ name = "langgraph" },
|
|
|
1146 |
{ name = "pandas" },
|
1147 |
{ name = "python-dotenv" },
|
1148 |
{ name = "qdrant-client" },
|
@@ -1165,6 +1166,7 @@ requires-dist = [
|
|
1165 |
{ name = "langchain-qdrant", specifier = ">=0.2.0" },
|
1166 |
{ name = "langchain-text-splitters", specifier = ">=0.3.8" },
|
1167 |
{ name = "langgraph", specifier = ">=0.4.3" },
|
|
|
1168 |
{ name = "pandas", specifier = ">=2.2.3" },
|
1169 |
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
1170 |
{ name = "qdrant-client", specifier = ">=1.14.2" },
|
@@ -1172,6 +1174,12 @@ requires-dist = [
|
|
1172 |
{ name = "websockets", specifier = ">=15.0.1" },
|
1173 |
]
|
1174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1175 |
[[package]]
|
1176 |
name = "literalai"
|
1177 |
version = "0.1.201"
|
|
|
1143 |
{ name = "langchain-qdrant" },
|
1144 |
{ name = "langchain-text-splitters" },
|
1145 |
{ name = "langgraph" },
|
1146 |
+
{ name = "libmagic" },
|
1147 |
{ name = "pandas" },
|
1148 |
{ name = "python-dotenv" },
|
1149 |
{ name = "qdrant-client" },
|
|
|
1166 |
{ name = "langchain-qdrant", specifier = ">=0.2.0" },
|
1167 |
{ name = "langchain-text-splitters", specifier = ">=0.3.8" },
|
1168 |
{ name = "langgraph", specifier = ">=0.4.3" },
|
1169 |
+
{ name = "libmagic", specifier = ">=1.0" },
|
1170 |
{ name = "pandas", specifier = ">=2.2.3" },
|
1171 |
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
1172 |
{ name = "qdrant-client", specifier = ">=1.14.2" },
|
|
|
1174 |
{ name = "websockets", specifier = ">=15.0.1" },
|
1175 |
]
|
1176 |
|
1177 |
+
[[package]]
|
1178 |
+
name = "libmagic"
|
1179 |
+
version = "1.0"
|
1180 |
+
source = { registry = "https://pypi.org/simple" }
|
1181 |
+
sdist = { url = "https://files.pythonhosted.org/packages/83/86/419ddfc3879b4565a60e0c75b6d19baec48428cbc2f15aca5320b3d136f6/libmagic-1.0.tar.gz", hash = "sha256:649f1ce7fb7c92796badbb812555e4a926351da4f5cdf82e810b5cd371aedf8d", size = 3665 }
|
1182 |
+
|
1183 |
[[package]]
|
1184 |
name = "literalai"
|
1185 |
version = "0.1.201"
|