mafzaal committed on
Commit
3379e0a
·
1 Parent(s): 2754790

Add document chunking configuration and update related utilities

Browse files
.env.example CHANGED
@@ -23,3 +23,7 @@ BLOG_BASE_URL=https://thedataguy.pro/blog/
23
 
24
  # Search Configuration
25
  MAX_SEARCH_RESULTS=5
 
 
 
 
 
23
 
24
  # Search Configuration
25
  MAX_SEARCH_RESULTS=5
26
+
27
+ # Document Chunking Configuration
28
+ CHUNK_SIZE=1000
29
+ CHUNK_OVERLAP=200
BLOG_DATA_UTILS.md CHANGED
@@ -46,6 +46,16 @@ When new blog posts are published, follow these steps:
46
  uv run python update_blog_data.py --force-recreate
47
  ```
48
 
 
 
 
 
 
 
 
 
 
 
49
  This will:
50
  - Load all blog posts (including new ones)
51
  - Update the vector embeddings
@@ -61,6 +71,8 @@ VECTOR_STORAGE_PATH=./db/vectorstore_v3 # Path to vector store
61
  EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l # Embedding model
62
  QDRANT_COLLECTION=thedataguy_documents # Collection name
63
  BLOG_BASE_URL=https://thedataguy.pro/blog/ # Base URL for blog
 
 
64
  ```
65
 
66
  ### In the Chainlit App
 
46
  uv run python update_blog_data.py --force-recreate
47
  ```
48
 
49
+ Or customize the chunking behavior:
50
+ ```bash
51
+ uv run python update_blog_data.py --chunk-size 1500 --chunk-overlap 300
52
+ ```
53
+
54
+ Or use whole documents without chunking:
55
+ ```bash
56
+ uv run python update_blog_data.py --no-chunking
57
+ ```
58
+
59
  This will:
60
  - Load all blog posts (including new ones)
61
  - Update the vector embeddings
 
71
  EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l # Embedding model
72
  QDRANT_COLLECTION=thedataguy_documents # Collection name
73
  BLOG_BASE_URL=https://thedataguy.pro/blog/ # Base URL for blog
74
+ CHUNK_SIZE=1000 # Size of each document chunk
75
+ CHUNK_OVERLAP=200 # Overlap between chunks
76
  ```
77
 
78
  ### In the Chainlit App
README.md CHANGED
@@ -90,6 +90,8 @@ OPENAI_API_KEY=your_openai_api_key
90
  VECTOR_STORAGE_PATH=./db/vector_store_tdg
91
  LLM_MODEL=gpt-4o-mini
92
  EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
 
 
93
  ```
94
 
95
  ## Running Locally
 
90
  VECTOR_STORAGE_PATH=./db/vector_store_tdg
91
  LLM_MODEL=gpt-4o-mini
92
  EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
93
+ CHUNK_SIZE=1000
94
+ CHUNK_OVERLAP=200
95
  ```
96
 
97
  ## Running Locally
py-src/lets_talk/config.py CHANGED
@@ -16,5 +16,9 @@ SDG_LLM_MODLEL = os.environ.get("SDG_LLM_MODEL", "gpt-4.1")
16
  EVAL_LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "gpt-4.1")
17
  MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
18
 
 
 
 
 
19
 
20
 
 
16
  EVAL_LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "gpt-4.1")
17
  MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
18
 
19
+ # Document chunking configuration
20
+ CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
21
+ CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
22
+
23
 
24
 
py-src/lets_talk/utils/blog.py CHANGED
@@ -26,7 +26,9 @@ from lets_talk.config import (
26
  VECTOR_STORAGE_PATH,
27
  EMBEDDING_MODEL,
28
  QDRANT_COLLECTION,
29
- BLOG_BASE_URL
 
 
30
  )
31
 
32
  def load_blog_posts(data_dir: str = DATA_DIR,
@@ -161,8 +163,8 @@ def display_document_stats(stats: Dict[str, Any]):
161
 
162
 
163
  def split_documents(documents: List[Document],
164
- chunk_size: int = 1000,
165
- chunk_overlap: int = 200) -> List[Document]:
166
  """
167
  Split documents into chunks for better embedding and retrieval.
168
 
 
26
  VECTOR_STORAGE_PATH,
27
  EMBEDDING_MODEL,
28
  QDRANT_COLLECTION,
29
+ BLOG_BASE_URL,
30
+ CHUNK_SIZE,
31
+ CHUNK_OVERLAP
32
  )
33
 
34
  def load_blog_posts(data_dir: str = DATA_DIR,
 
163
 
164
 
165
  def split_documents(documents: List[Document],
166
+ chunk_size: int = CHUNK_SIZE,
167
+ chunk_overlap: int = CHUNK_OVERLAP) -> List[Document]:
168
  """
169
  Split documents into chunks for better embedding and retrieval.
170
 
py-src/pipeline.py CHANGED
@@ -45,6 +45,12 @@ def parse_args():
45
  help="Directory to save stats and artifacts (default: ./stats)")
46
  parser.add_argument("--ci", action="store_true",
47
  help="Run in CI mode (no interactive prompts, exit codes for CI)")
 
 
 
 
 
 
48
  return parser.parse_args()
49
 
50
  def save_stats(stats, output_dir="./stats", ci_mode=False):
@@ -94,7 +100,8 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
94
  return filename, basic_stats
95
 
96
  def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
97
- force_recreate=False, output_dir="./stats", ci_mode=False, use_chunking=True, save_stats=True):
 
98
  """
99
  Create or update the vector database with blog documents.
100
 
@@ -104,6 +111,10 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
104
  force_recreate: Whether to force recreation of the vector store
105
  output_dir: Directory to save stats and artifacts
106
  ci_mode: Whether to run in CI mode
 
 
 
 
107
 
108
  Returns:
109
  Tuple of (success status, message, stats, stats_file, stats_file_content)
@@ -122,12 +133,20 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
122
  # Save stats for tracking
123
  stats_file = None
124
  stats_content = None
125
- if save_stats:
126
  stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
127
 
128
  if use_chunking:
129
  logger.info("Chunking documents...")
130
- documents = blog.split_documents(documents)
 
 
 
 
 
 
 
 
131
 
132
 
133
 
@@ -183,6 +202,10 @@ def main():
183
  logger.info(f"Force recreate: {args.force_recreate}")
184
  logger.info(f"Output directory: {args.output_dir}")
185
  logger.info(f"CI mode: {args.ci}")
 
 
 
 
186
  logger.info("========================")
187
 
188
  try:
@@ -192,7 +215,10 @@ def main():
192
  storage_path=VECTOR_STORAGE_PATH,
193
  force_recreate=args.force_recreate,
194
  output_dir=args.output_dir,
195
- ci_mode=args.ci
 
 
 
196
  )
197
 
198
  logger.info("\n=== Update Summary ===")
 
45
  help="Directory to save stats and artifacts (default: ./stats)")
46
  parser.add_argument("--ci", action="store_true",
47
  help="Run in CI mode (no interactive prompts, exit codes for CI)")
48
+ parser.add_argument("--chunk-size", type=int,
49
+ help=f"Size of each chunk in characters (default from config)")
50
+ parser.add_argument("--chunk-overlap", type=int,
51
+ help=f"Overlap between chunks in characters (default from config)")
52
+ parser.add_argument("--no-chunking", action="store_true",
53
+ help="Don't split documents into chunks (use whole documents)")
54
  return parser.parse_args()
55
 
56
  def save_stats(stats, output_dir="./stats", ci_mode=False):
 
100
  return filename, basic_stats
101
 
102
  def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
103
+ force_recreate=False, output_dir="./stats", ci_mode=False,
104
+ use_chunking=True, should_save_stats=True, chunk_size=None, chunk_overlap=None):
105
  """
106
  Create or update the vector database with blog documents.
107
 
 
111
  force_recreate: Whether to force recreation of the vector store
112
  output_dir: Directory to save stats and artifacts
113
  ci_mode: Whether to run in CI mode
114
+ use_chunking: Whether to split documents into chunks
115
+ should_save_stats: Whether to save statistics about the documents
116
+ chunk_size: Size of each chunk in characters (default from config)
117
+ chunk_overlap: Overlap between chunks in characters (default from config)
118
 
119
  Returns:
120
  Tuple of (success status, message, stats, stats_file, stats_file_content)
 
133
  # Save stats for tracking
134
  stats_file = None
135
  stats_content = None
136
+ if should_save_stats:
137
  stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
138
 
139
  if use_chunking:
140
  logger.info("Chunking documents...")
141
+ # Use provided chunk_size and chunk_overlap or default from config
142
+ chunking_params = {}
143
+ if chunk_size is not None:
144
+ chunking_params['chunk_size'] = chunk_size
145
+ if chunk_overlap is not None:
146
+ chunking_params['chunk_overlap'] = chunk_overlap
147
+
148
+ logger.info(f"Using chunk size: {chunking_params.get('chunk_size', 'default')} and overlap: {chunking_params.get('chunk_overlap', 'default')}")
149
+ documents = blog.split_documents(documents, **chunking_params)
150
 
151
 
152
 
 
202
  logger.info(f"Force recreate: {args.force_recreate}")
203
  logger.info(f"Output directory: {args.output_dir}")
204
  logger.info(f"CI mode: {args.ci}")
205
+ logger.info(f"Chunking: {not args.no_chunking}")
206
+ if not args.no_chunking:
207
+ logger.info(f"Chunk size: {args.chunk_size if args.chunk_size else 'default from config'}")
208
+ logger.info(f"Chunk overlap: {args.chunk_overlap if args.chunk_overlap else 'default from config'}")
209
  logger.info("========================")
210
 
211
  try:
 
215
  storage_path=VECTOR_STORAGE_PATH,
216
  force_recreate=args.force_recreate,
217
  output_dir=args.output_dir,
218
+ ci_mode=args.ci,
219
+ use_chunking=not args.no_chunking,
220
+ chunk_size=args.chunk_size,
221
+ chunk_overlap=args.chunk_overlap
222
  )
223
 
224
  logger.info("\n=== Update Summary ===")