Add document chunking configuration and update related utilities
- .env.example +4 -0
- BLOG_DATA_UTILS.md +12 -0
- README.md +2 -0
- py-src/lets_talk/config.py +4 -0
- py-src/lets_talk/utils/blog.py +5 -3
- py-src/pipeline.py +30 -4
.env.example
CHANGED

@@ -23,3 +23,7 @@ BLOG_BASE_URL=https://thedataguy.pro/blog/
 
 # Search Configuration
 MAX_SEARCH_RESULTS=5
+
+# Document Chunking Configuration
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
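A note for anyone tuning these values: the overlap must stay smaller than the chunk size, or splitting cannot make forward progress. A minimal sanity check, sketched here purely as an illustration (it is not part of this commit), mirroring the parsing that config.py uses below:

```python
import os

# Parse the same way py-src/lets_talk/config.py does.
chunk_size = int(os.environ.get("CHUNK_SIZE", "1000"))
chunk_overlap = int(os.environ.get("CHUNK_OVERLAP", "200"))

# Guard against a degenerate configuration.
if chunk_overlap >= chunk_size:
    raise ValueError(
        f"CHUNK_OVERLAP ({chunk_overlap}) must be smaller than CHUNK_SIZE ({chunk_size})"
    )
```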
BLOG_DATA_UTILS.md
CHANGED

@@ -46,6 +46,16 @@ When new blog posts are published, follow these steps:
 uv run python update_blog_data.py --force-recreate
 ```
 
+Or customize the chunking behavior:
+```bash
+uv run python update_blog_data.py --chunk-size 1500 --chunk-overlap 300
+```
+
+Or use whole documents without chunking:
+```bash
+uv run python update_blog_data.py --no-chunking
+```
+
 This will:
 - Load all blog posts (including new ones)
 - Update the vector embeddings
@@ -61,6 +71,8 @@ VECTOR_STORAGE_PATH=./db/vectorstore_v3 # Path to vector store
 EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l # Embedding model
 QDRANT_COLLECTION=thedataguy_documents # Collection name
 BLOG_BASE_URL=https://thedataguy.pro/blog/ # Base URL for blog
+CHUNK_SIZE=1000 # Size of each document chunk
+CHUNK_OVERLAP=200 # Overlap between chunks
 ```
 
 ### In the Chainlit App
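The flags above map one-to-one onto parameters of create_vector_database in py-src/pipeline.py, so the same behavior is reachable programmatically. A hedged sketch: the parameter names and the five-element return tuple come from this commit, while the import path is an assumption that depends on how py-src lands on sys.path:

```python
# Assumed import path; adjust to your sys.path layout.
from pipeline import create_vector_database

# Leaving chunk_size/chunk_overlap as None falls back to
# CHUNK_SIZE/CHUNK_OVERLAP from lets_talk.config.
success, message, stats, stats_file, stats_content = create_vector_database(
    force_recreate=True,
    use_chunking=True,
    chunk_size=1500,
    chunk_overlap=300,
)
print(success, message)
```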
README.md
CHANGED

@@ -90,6 +90,8 @@ OPENAI_API_KEY=your_openai_api_key
 VECTOR_STORAGE_PATH=./db/vector_store_tdg
 LLM_MODEL=gpt-4o-mini
 EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
 ```
 
 ## Running Locally
py-src/lets_talk/config.py
CHANGED

@@ -16,5 +16,9 @@ SDG_LLM_MODLEL = os.environ.get("SDG_LLM_MODEL", "gpt-4.1")
 EVAL_LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "gpt-4.1")
 MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
 
+# Document chunking configuration
+CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
+CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
+
 
 
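Since these assignments run at module import, environment overrides must be in place before lets_talk.config is first imported. A small sketch of the implication:

```python
import os

# Must happen before the first import of lets_talk.config,
# because CHUNK_SIZE is computed when the module loads.
os.environ["CHUNK_SIZE"] = "1500"

from lets_talk.config import CHUNK_SIZE, CHUNK_OVERLAP

print(CHUNK_SIZE, CHUNK_OVERLAP)  # 1500 200
```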
py-src/lets_talk/utils/blog.py
CHANGED

@@ -26,7 +26,9 @@ from lets_talk.config import (
     VECTOR_STORAGE_PATH,
     EMBEDDING_MODEL,
     QDRANT_COLLECTION,
-    BLOG_BASE_URL
+    BLOG_BASE_URL,
+    CHUNK_SIZE,
+    CHUNK_OVERLAP
 )
 
 def load_blog_posts(data_dir: str = DATA_DIR,
@@ -161,8 +163,8 @@ def display_document_stats(stats: Dict[str, Any]):
 
 
 def split_documents(documents: List[Document],
-                    chunk_size: int =
-                    chunk_overlap: int =
+                    chunk_size: int = CHUNK_SIZE,
+                    chunk_overlap: int = CHUNK_OVERLAP) -> List[Document]:
     """
     Split documents into chunks for better embedding and retrieval.
 
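The diff shows only the signature change; the body of split_documents is not visible here. For orientation, a plausible implementation, assuming LangChain's RecursiveCharacterTextSplitter (a common choice for character-based chunking; the splitter actually used in blog.py is not shown in this commit):

```python
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from lets_talk.config import CHUNK_SIZE, CHUNK_OVERLAP


def split_documents(documents: list[Document],
                    chunk_size: int = CHUNK_SIZE,
                    chunk_overlap: int = CHUNK_OVERLAP) -> list[Document]:
    """Split documents into chunks for better embedding and retrieval."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
```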
py-src/pipeline.py
CHANGED

@@ -45,6 +45,12 @@ def parse_args():
                         help="Directory to save stats and artifacts (default: ./stats)")
     parser.add_argument("--ci", action="store_true",
                         help="Run in CI mode (no interactive prompts, exit codes for CI)")
+    parser.add_argument("--chunk-size", type=int,
+                        help=f"Size of each chunk in characters (default from config)")
+    parser.add_argument("--chunk-overlap", type=int,
+                        help=f"Overlap between chunks in characters (default from config)")
+    parser.add_argument("--no-chunking", action="store_true",
+                        help="Don't split documents into chunks (use whole documents)")
     return parser.parse_args()
 
 def save_stats(stats, output_dir="./stats", ci_mode=False):
@@ -94,7 +100,8 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
     return filename, basic_stats
 
 def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
-                           force_recreate=False, output_dir="./stats", ci_mode=False):
+                           force_recreate=False, output_dir="./stats", ci_mode=False,
+                           use_chunking=True, should_save_stats=True, chunk_size=None, chunk_overlap=None):
     """
     Create or update the vector database with blog documents.
 
@@ -104,6 +111,10 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
         force_recreate: Whether to force recreation of the vector store
         output_dir: Directory to save stats and artifacts
         ci_mode: Whether to run in CI mode
+        use_chunking: Whether to split documents into chunks
+        should_save_stats: Whether to save statistics about the documents
+        chunk_size: Size of each chunk in characters (default from config)
+        chunk_overlap: Overlap between chunks in characters (default from config)
 
     Returns:
         Tuple of (success status, message, stats, stats_file, stats_file_content)
@@ -122,12 +133,20 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
     # Save stats for tracking
     stats_file = None
     stats_content = None
-    if
+    if should_save_stats:
         stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
 
     if use_chunking:
         logger.info("Chunking documents...")
-        documents = blog.split_documents(documents)
+        # Use provided chunk_size and chunk_overlap or default from config
+        chunking_params = {}
+        if chunk_size is not None:
+            chunking_params['chunk_size'] = chunk_size
+        if chunk_overlap is not None:
+            chunking_params['chunk_overlap'] = chunk_overlap
+
+        logger.info(f"Using chunk size: {chunking_params.get('chunk_size', 'default')} and overlap: {chunking_params.get('chunk_overlap', 'default')}")
+        documents = blog.split_documents(documents, **chunking_params)
 
 
 
@@ -183,6 +202,10 @@ def main():
     logger.info(f"Force recreate: {args.force_recreate}")
     logger.info(f"Output directory: {args.output_dir}")
     logger.info(f"CI mode: {args.ci}")
+    logger.info(f"Chunking: {not args.no_chunking}")
+    if not args.no_chunking:
+        logger.info(f"Chunk size: {args.chunk_size if args.chunk_size else 'default from config'}")
+        logger.info(f"Chunk overlap: {args.chunk_overlap if args.chunk_overlap else 'default from config'}")
     logger.info("========================")
 
     try:
@@ -192,7 +215,10 @@ def main():
             storage_path=VECTOR_STORAGE_PATH,
             force_recreate=args.force_recreate,
             output_dir=args.output_dir,
-            ci_mode=args.ci
+            ci_mode=args.ci,
+            use_chunking=not args.no_chunking,
+            chunk_size=args.chunk_size,
+            chunk_overlap=args.chunk_overlap
         )
 
         logger.info("\n=== Update Summary ===")
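The chunking_params dict in create_vector_database implements a deliberate pattern: argparse leaves --chunk-size and --chunk-overlap as None when unset, and only non-None values are forwarded, so the config-driven defaults on split_documents still win. A self-contained illustration of the same pattern (stand-in names, not code from this repo):

```python
def split(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """Stand-in for blog.split_documents with config-driven defaults."""
    step = chunk_size - chunk_overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]


def run(chunk_size=None, chunk_overlap=None):
    # Forward only the options the caller actually set; anything
    # left as None falls through to split()'s own defaults.
    params = {}
    if chunk_size is not None:
        params["chunk_size"] = chunk_size
    if chunk_overlap is not None:
        params["chunk_overlap"] = chunk_overlap
    return split("x" * 2500, **params)


print(len(run()))                 # defaults 1000/200 -> 4 chunks
print(len(run(chunk_size=1500)))  # override size only -> 2 chunks
```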