Add vector database creation configuration and update related scripts
Browse files
- .env.example +6 -0
- CONTRIBUTING.md +6 -0
- README.md +10 -0
- py-src/app.py +2 -1
- py-src/lets_talk/config.py +6 -0
- py-src/pipeline.py +13 -9
- scripts/build-vector-store.sh +9 -2
.env.example
CHANGED
@@ -27,3 +27,9 @@ MAX_SEARCH_RESULTS=5
|
|
27 |
# Document Chunking Configuration
|
28 |
CHUNK_SIZE=1000
|
29 |
CHUNK_OVERLAP=200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
# Document Chunking Configuration
|
28 |
CHUNK_SIZE=1000
|
29 |
CHUNK_OVERLAP=200
|
30 |
+
|
31 |
+
# Vector Database Creation Configuration
|
32 |
+
FORCE_RECREATE=False
|
33 |
+
OUTPUT_DIR=./stats
|
34 |
+
USE_CHUNKING=True
|
35 |
+
SHOULD_SAVE_STATS=True
|
CONTRIBUTING.md
CHANGED
@@ -29,6 +29,12 @@ TheDataGuy Chat is a Q&A chatbot powered by the content from [TheDataGuy blog](h
|
|
29 |
VECTOR_STORAGE_PATH=./db/vector_store_tdg
|
30 |
LLM_MODEL=gpt-4o-mini
|
31 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
```
|
33 |
|
34 |
3. Install dependencies:
|
|
|
29 |
VECTOR_STORAGE_PATH=./db/vector_store_tdg
|
30 |
LLM_MODEL=gpt-4o-mini
|
31 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
|
32 |
+
|
33 |
+
# Vector Database Creation Configuration (optional)
|
34 |
+
FORCE_RECREATE=False # Whether to force recreation of the vector store
|
35 |
+
OUTPUT_DIR=./stats # Directory to save stats and artifacts
|
36 |
+
USE_CHUNKING=True # Whether to split documents into chunks
|
37 |
+
SHOULD_SAVE_STATS=True # Whether to save statistics about the documents
|
38 |
```
|
39 |
|
40 |
3. Install dependencies:
|
README.md
CHANGED
@@ -94,6 +94,16 @@ CHUNK_SIZE=1000
|
|
94 |
CHUNK_OVERLAP=200
|
95 |
```
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
## Running Locally
|
98 |
|
99 |
### Using Docker
|
|
|
94 |
CHUNK_OVERLAP=200
|
95 |
```
|
96 |
|
97 |
+
Additional configuration options for vector database creation:
|
98 |
+
|
99 |
+
```
|
100 |
+
# Vector Database Creation Configuration
|
101 |
+
FORCE_RECREATE=False # Whether to force recreation of the vector store
|
102 |
+
OUTPUT_DIR=./stats # Directory to save stats and artifacts
|
103 |
+
USE_CHUNKING=True # Whether to split documents into chunks
|
104 |
+
SHOULD_SAVE_STATS=True # Whether to save statistics about the documents
|
105 |
+
```
|
106 |
+
|
107 |
## Running Locally
|
108 |
|
109 |
### Using Docker
|
py-src/app.py
CHANGED
@@ -10,7 +10,8 @@ load_dotenv()
|
|
10 |
import pipeline
|
11 |
#build vector store
|
12 |
print("=== create vector db ===")
|
13 |
-
|
|
|
14 |
print("========================")
|
15 |
|
16 |
import chainlit as cl
|
|
|
10 |
import pipeline
|
11 |
#build vector store
|
12 |
print("=== create vector db ===")
|
13 |
+
# Use configuration from config rather than hardcoded values
|
14 |
+
pipeline.create_vector_database()
|
15 |
print("========================")
|
16 |
|
17 |
import chainlit as cl
|
py-src/lets_talk/config.py
CHANGED
@@ -20,5 +20,11 @@ MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
|
|
20 |
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
|
21 |
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
|
|
|
20 |
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
|
21 |
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
|
22 |
|
23 |
+
# Vector database creation configuration
|
24 |
+
FORCE_RECREATE = os.environ.get("FORCE_RECREATE", "False").lower() == "true"
|
25 |
+
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./stats")
|
26 |
+
USE_CHUNKING = os.environ.get("USE_CHUNKING", "True").lower() == "true"
|
27 |
+
SHOULD_SAVE_STATS = os.environ.get("SHOULD_SAVE_STATS", "True").lower() == "true"
|
28 |
+
|
29 |
|
30 |
|
py-src/pipeline.py
CHANGED
@@ -21,7 +21,10 @@ from datetime import datetime
|
|
21 |
import json
|
22 |
import logging
|
23 |
from pathlib import Path
|
24 |
-
from lets_talk.config import
|
|
|
|
|
|
|
25 |
|
26 |
# Import the blog utilities module
|
27 |
import lets_talk.utils.blog as blog
|
@@ -100,19 +103,20 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
|
|
100 |
return filename, basic_stats
|
101 |
|
102 |
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
103 |
-
force_recreate=
|
104 |
-
use_chunking=
|
|
|
105 |
"""
|
106 |
Create or update the vector database with blog documents.
|
107 |
|
108 |
Args:
|
109 |
-
data_dir: Directory containing the blog posts
|
110 |
-
storage_path: Path where the vector database will be stored
|
111 |
-
force_recreate: Whether to force recreation of the vector store
|
112 |
-
output_dir: Directory to save stats and artifacts
|
113 |
ci_mode: Whether to run in CI mode
|
114 |
-
use_chunking: Whether to split documents into chunks
|
115 |
-
should_save_stats: Whether to save statistics about the documents
|
116 |
chunk_size: Size of each chunk in characters (default from config)
|
117 |
chunk_overlap: Overlap between chunks in characters (default from config)
|
118 |
|
|
|
21 |
import json
|
22 |
import logging
|
23 |
from pathlib import Path
|
24 |
+
from lets_talk.config import (
|
25 |
+
CHUNK_OVERLAP, CHUNK_SIZE, VECTOR_STORAGE_PATH, DATA_DIR,
|
26 |
+
FORCE_RECREATE, OUTPUT_DIR, USE_CHUNKING, SHOULD_SAVE_STATS
|
27 |
+
)
|
28 |
|
29 |
# Import the blog utilities module
|
30 |
import lets_talk.utils.blog as blog
|
|
|
103 |
return filename, basic_stats
|
104 |
|
105 |
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
106 |
+
force_recreate=FORCE_RECREATE, output_dir=OUTPUT_DIR, ci_mode=False,
|
107 |
+
use_chunking=USE_CHUNKING, should_save_stats=SHOULD_SAVE_STATS,
|
108 |
+
chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
|
109 |
"""
|
110 |
Create or update the vector database with blog documents.
|
111 |
|
112 |
Args:
|
113 |
+
data_dir: Directory containing the blog posts (default from config)
|
114 |
+
storage_path: Path where the vector database will be stored (default from config)
|
115 |
+
force_recreate: Whether to force recreation of the vector store (default from config)
|
116 |
+
output_dir: Directory to save stats and artifacts (default from config)
|
117 |
ci_mode: Whether to run in CI mode
|
118 |
+
use_chunking: Whether to split documents into chunks (default from config)
|
119 |
+
should_save_stats: Whether to save statistics about the documents (default from config)
|
120 |
chunk_size: Size of each chunk in characters (default from config)
|
121 |
chunk_overlap: Overlap between chunks in characters (default from config)
|
122 |
|
scripts/build-vector-store.sh
CHANGED
@@ -1,14 +1,21 @@
|
|
1 |
#!/bin/bash
|
2 |
# Script to build vector store locally
|
3 |
# Usage: ./scripts/build-vector-store.sh [--force-recreate]
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
|
|
5 |
FORCE_RECREATE=""
|
6 |
if [[ "$1" == "--force-recreate" ]]; then
|
7 |
FORCE_RECREATE="--force-recreate"
|
8 |
fi
|
9 |
|
10 |
-
# Set output directory for artifacts
|
11 |
-
OUTPUT_DIR
|
12 |
mkdir -p $OUTPUT_DIR
|
13 |
|
14 |
echo "Building vector store with output to $OUTPUT_DIR"
|
|
|
1 |
#!/bin/bash
|
2 |
# Script to build vector store locally
|
3 |
# Usage: ./scripts/build-vector-store.sh [--force-recreate]
|
4 |
+
#
|
5 |
+
# Environment variables that can be set:
|
6 |
+
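# FORCE_RECREATE - Set to "true" to force recreation of the vector store (note: this script reassigns FORCE_RECREATE while parsing arguments below, so pass --force-recreate when invoking it)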
|
7 |
+
# OUTPUT_DIR - Directory to save stats and artifacts (default: ./artifacts)
|
8 |
+
# USE_CHUNKING - Set to "false" to disable document chunking
|
9 |
+
# SHOULD_SAVE_STATS - Set to "false" to disable saving document statistics
|
10 |
|
11 |
+
# Parse command line arguments
|
12 |
FORCE_RECREATE=""
|
13 |
if [[ "$1" == "--force-recreate" ]]; then
|
14 |
FORCE_RECREATE="--force-recreate"
|
15 |
fi
|
16 |
|
17 |
+
# Set output directory for artifacts (use environment variable if set)
|
18 |
+
OUTPUT_DIR=${OUTPUT_DIR:-"./artifacts"}
|
19 |
mkdir -p $OUTPUT_DIR
|
20 |
|
21 |
echo "Building vector store with output to $OUTPUT_DIR"
|