mafzaal committed on
Commit
a092eef
·
1 Parent(s): 5b07cdb

Add vector database creation configuration and update related scripts

Browse files
.env.example CHANGED
@@ -27,3 +27,9 @@ MAX_SEARCH_RESULTS=5
27
  # Document Chunking Configuration
28
  CHUNK_SIZE=1000
29
  CHUNK_OVERLAP=200
 
 
 
 
 
 
 
27
  # Document Chunking Configuration
28
  CHUNK_SIZE=1000
29
  CHUNK_OVERLAP=200
30
+
31
+ # Vector Database Creation Configuration
32
+ FORCE_RECREATE=False
33
+ OUTPUT_DIR=./stats
34
+ USE_CHUNKING=True
35
+ SHOULD_SAVE_STATS=True
CONTRIBUTING.md CHANGED
@@ -29,6 +29,12 @@ TheDataGuy Chat is a Q&A chatbot powered by the content from [TheDataGuy blog](h
29
  VECTOR_STORAGE_PATH=./db/vector_store_tdg
30
  LLM_MODEL=gpt-4o-mini
31
  EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
 
 
 
 
 
 
32
  ```
33
 
34
  3. Install dependencies:
 
29
  VECTOR_STORAGE_PATH=./db/vector_store_tdg
30
  LLM_MODEL=gpt-4o-mini
31
  EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
32
+
33
+ # Vector Database Creation Configuration (optional)
34
+ FORCE_RECREATE=False # Whether to force recreation of the vector store
35
+ OUTPUT_DIR=./stats # Directory to save stats and artifacts
36
+ USE_CHUNKING=True # Whether to split documents into chunks
37
+ SHOULD_SAVE_STATS=True # Whether to save statistics about the documents
38
  ```
39
 
40
  3. Install dependencies:
README.md CHANGED
@@ -94,6 +94,16 @@ CHUNK_SIZE=1000
94
  CHUNK_OVERLAP=200
95
  ```
96
 
 
 
 
 
 
 
 
 
 
 
97
  ## Running Locally
98
 
99
  ### Using Docker
 
94
  CHUNK_OVERLAP=200
95
  ```
96
 
97
+ Additional configuration options for vector database creation:
98
+
99
+ ```
100
+ # Vector Database Creation Configuration
101
+ FORCE_RECREATE=False # Whether to force recreation of the vector store
102
+ OUTPUT_DIR=./stats # Directory to save stats and artifacts
103
+ USE_CHUNKING=True # Whether to split documents into chunks
104
+ SHOULD_SAVE_STATS=True # Whether to save statistics about the documents
105
+ ```
106
+
107
  ## Running Locally
108
 
109
  ### Using Docker
py-src/app.py CHANGED
@@ -10,7 +10,8 @@ load_dotenv()
10
  import pipeline
11
  #build vector store
12
  print("=== create vector db ===")
13
- pipeline.create_vector_database(force_recreate=True,save_stats=False,use_chunking=True)
 
14
  print("========================")
15
 
16
  import chainlit as cl
 
10
  import pipeline
11
  #build vector store
12
  print("=== create vector db ===")
13
+ # Use configuration from config rather than hardcoded values
14
+ pipeline.create_vector_database()
15
  print("========================")
16
 
17
  import chainlit as cl
py-src/lets_talk/config.py CHANGED
@@ -20,5 +20,11 @@ MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
20
  CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
21
  CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
22
 
 
 
 
 
 
 
23
 
24
 
 
20
  CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
21
  CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
22
 
23
+ # Vector database creation configuration
24
+ FORCE_RECREATE = os.environ.get("FORCE_RECREATE", "False").lower() == "true"
25
+ OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./stats")
26
+ USE_CHUNKING = os.environ.get("USE_CHUNKING", "True").lower() == "true"
27
+ SHOULD_SAVE_STATS = os.environ.get("SHOULD_SAVE_STATS", "True").lower() == "true"
28
+
29
 
30
 
py-src/pipeline.py CHANGED
@@ -21,7 +21,10 @@ from datetime import datetime
21
  import json
22
  import logging
23
  from pathlib import Path
24
- from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
 
 
 
25
 
26
  # Import the blog utilities module
27
  import lets_talk.utils.blog as blog
@@ -100,19 +103,20 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
100
  return filename, basic_stats
101
 
102
  def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
103
- force_recreate=False, output_dir="./stats", ci_mode=False,
104
- use_chunking=True, should_save_stats=True, chunk_size=None, chunk_overlap=None):
 
105
  """
106
  Create or update the vector database with blog documents.
107
 
108
  Args:
109
- data_dir: Directory containing the blog posts
110
- storage_path: Path where the vector database will be stored
111
- force_recreate: Whether to force recreation of the vector store
112
- output_dir: Directory to save stats and artifacts
113
  ci_mode: Whether to run in CI mode
114
- use_chunking: Whether to split documents into chunks
115
- should_save_stats: Whether to save statistics about the documents
116
  chunk_size: Size of each chunk in characters (default from config)
117
  chunk_overlap: Overlap between chunks in characters (default from config)
118
 
 
21
  import json
22
  import logging
23
  from pathlib import Path
24
+ from lets_talk.config import (
25
+ CHUNK_OVERLAP, CHUNK_SIZE, VECTOR_STORAGE_PATH, DATA_DIR,
26
+ FORCE_RECREATE, OUTPUT_DIR, USE_CHUNKING, SHOULD_SAVE_STATS
27
+ )
28
 
29
  # Import the blog utilities module
30
  import lets_talk.utils.blog as blog
 
103
  return filename, basic_stats
104
 
105
  def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
106
+ force_recreate=FORCE_RECREATE, output_dir=OUTPUT_DIR, ci_mode=False,
107
+ use_chunking=USE_CHUNKING, should_save_stats=SHOULD_SAVE_STATS,
108
+ chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
109
  """
110
  Create or update the vector database with blog documents.
111
 
112
  Args:
113
+ data_dir: Directory containing the blog posts (default from config)
114
+ storage_path: Path where the vector database will be stored (default from config)
115
+ force_recreate: Whether to force recreation of the vector store (default from config)
116
+ output_dir: Directory to save stats and artifacts (default from config)
117
  ci_mode: Whether to run in CI mode
118
+ use_chunking: Whether to split documents into chunks (default from config)
119
+ should_save_stats: Whether to save statistics about the documents (default from config)
120
  chunk_size: Size of each chunk in characters (default from config)
121
  chunk_overlap: Overlap between chunks in characters (default from config)
122
 
scripts/build-vector-store.sh CHANGED
@@ -1,14 +1,21 @@
1
  #!/bin/bash
2
  # Script to build vector store locally
3
  # Usage: ./scripts/build-vector-store.sh [--force-recreate]
 
 
 
 
 
 
4
 
 
5
  FORCE_RECREATE=""
6
  if [[ "$1" == "--force-recreate" ]]; then
7
  FORCE_RECREATE="--force-recreate"
8
  fi
9
 
10
- # Set output directory for artifacts
11
- OUTPUT_DIR="./artifacts"
12
  mkdir -p $OUTPUT_DIR
13
 
14
  echo "Building vector store with output to $OUTPUT_DIR"
 
1
  #!/bin/bash
2
  # Script to build vector store locally
3
  # Usage: ./scripts/build-vector-store.sh [--force-recreate]
4
+ #
5
+ # Environment variables that can be set:
6
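+ # FORCE_RECREATE - Set to "true" to force recreation of the vector store (NOTE: this script resets the variable during argument parsing below, so pass --force-recreate instead)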
7
+ # OUTPUT_DIR - Directory to save stats and artifacts (default: ./artifacts)
8
+ # USE_CHUNKING - Set to "false" to disable document chunking
9
+ # SHOULD_SAVE_STATS - Set to "false" to disable saving document statistics
10
 
11
+ # Parse command line arguments
12
  FORCE_RECREATE=""
13
  if [[ "$1" == "--force-recreate" ]]; then
14
  FORCE_RECREATE="--force-recreate"
15
  fi
16
 
17
+ # Set output directory for artifacts (use environment variable if set)
18
+ OUTPUT_DIR=${OUTPUT_DIR:-"./artifacts"}
19
  mkdir -p $OUTPUT_DIR
20
 
21
  echo "Building vector store with output to $OUTPUT_DIR"