Add document chunking configuration and update related utilities
- .env.example +4 -0
- BLOG_DATA_UTILS.md +12 -0
- README.md +2 -0
- py-src/lets_talk/config.py +4 -0
- py-src/lets_talk/utils/blog.py +5 -3
- py-src/pipeline.py +30 -4
.env.example
CHANGED

@@ -23,3 +23,7 @@ BLOG_BASE_URL=https://thedataguy.pro/blog/
 
 # Search Configuration
 MAX_SEARCH_RESULTS=5
+
+# Document Chunking Configuration
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
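A note for anyone tuning these values: the overlap must stay smaller than the chunk size, or splitting cannot make forward progress. A minimal sanity check, sketched here purely as an illustration (it is not part of this commit), mirroring the parsing that config.py uses below:

```python
import os

# Parse the same way py-src/lets_talk/config.py does.
chunk_size = int(os.environ.get("CHUNK_SIZE", "1000"))
chunk_overlap = int(os.environ.get("CHUNK_OVERLAP", "200"))

# Guard against a degenerate configuration.
if chunk_overlap >= chunk_size:
    raise ValueError(
        f"CHUNK_OVERLAP ({chunk_overlap}) must be smaller than CHUNK_SIZE ({chunk_size})"
    )
```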
BLOG_DATA_UTILS.md
CHANGED

@@ -46,6 +46,16 @@ When new blog posts are published, follow these steps:
 uv run python update_blog_data.py --force-recreate
 ```
 
+Or customize the chunking behavior:
+```bash
+uv run python update_blog_data.py --chunk-size 1500 --chunk-overlap 300
+```
+
+Or use whole documents without chunking:
+```bash
+uv run python update_blog_data.py --no-chunking
+```
+
 This will:
 - Load all blog posts (including new ones)
 - Update the vector embeddings
@@ -61,6 +71,8 @@ VECTOR_STORAGE_PATH=./db/vectorstore_v3 # Path to vector store
 EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l # Embedding model
 QDRANT_COLLECTION=thedataguy_documents # Collection name
 BLOG_BASE_URL=https://thedataguy.pro/blog/ # Base URL for blog
+CHUNK_SIZE=1000 # Size of each document chunk
+CHUNK_OVERLAP=200 # Overlap between chunks
 ```
 
 ### In the Chainlit App
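The flags above map one-to-one onto parameters of create_vector_database in py-src/pipeline.py, so the same behavior is reachable programmatically. A hedged sketch: the parameter names and the five-element return tuple come from this commit, while the import path is an assumption that depends on how py-src lands on sys.path:

```python
# Assumed import path; adjust to your sys.path layout.
from pipeline import create_vector_database

# Leaving chunk_size/chunk_overlap as None falls back to
# CHUNK_SIZE/CHUNK_OVERLAP from lets_talk.config.
success, message, stats, stats_file, stats_content = create_vector_database(
    force_recreate=True,
    use_chunking=True,
    chunk_size=1500,
    chunk_overlap=300,
)
print(success, message)
```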
README.md
CHANGED

@@ -90,6 +90,8 @@ OPENAI_API_KEY=your_openai_api_key
 VECTOR_STORAGE_PATH=./db/vector_store_tdg
 LLM_MODEL=gpt-4o-mini
 EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
 ```
 
 ## Running Locally
py-src/lets_talk/config.py
CHANGED

@@ -16,5 +16,9 @@ SDG_LLM_MODLEL = os.environ.get("SDG_LLM_MODEL", "gpt-4.1")
 EVAL_LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "gpt-4.1")
 MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
 
+# Document chunking configuration
+CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
+CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
+
 
 
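Since these assignments run at module import, environment overrides must be in place before lets_talk.config is first imported. A small sketch of the implication:

```python
import os

# Must happen before the first import of lets_talk.config,
# because CHUNK_SIZE is computed when the module loads.
os.environ["CHUNK_SIZE"] = "1500"

from lets_talk.config import CHUNK_SIZE, CHUNK_OVERLAP

print(CHUNK_SIZE, CHUNK_OVERLAP)  # 1500 200
```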
py-src/lets_talk/utils/blog.py
CHANGED

@@ -26,7 +26,9 @@ from lets_talk.config import (
     VECTOR_STORAGE_PATH,
     EMBEDDING_MODEL,
     QDRANT_COLLECTION,
-    BLOG_BASE_URL
+    BLOG_BASE_URL,
+    CHUNK_SIZE,
+    CHUNK_OVERLAP
 )
 
 def load_blog_posts(data_dir: str = DATA_DIR,
@@ -161,8 +163,8 @@ def display_document_stats(stats: Dict[str, Any]):
 
 
 def split_documents(documents: List[Document],
-                    chunk_size: int =
-                    chunk_overlap: int =
+                    chunk_size: int = CHUNK_SIZE,
+                    chunk_overlap: int = CHUNK_OVERLAP) -> List[Document]:
     """
     Split documents into chunks for better embedding and retrieval.
 
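The diff shows only the signature change; the body of split_documents is not visible here. For orientation, a plausible implementation, assuming LangChain's RecursiveCharacterTextSplitter (a common choice for character-based chunking; the splitter actually used in blog.py is not shown in this commit):

```python
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from lets_talk.config import CHUNK_SIZE, CHUNK_OVERLAP


def split_documents(documents: list[Document],
                    chunk_size: int = CHUNK_SIZE,
                    chunk_overlap: int = CHUNK_OVERLAP) -> list[Document]:
    """Split documents into chunks for better embedding and retrieval."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
```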
py-src/pipeline.py
CHANGED

@@ -45,6 +45,12 @@ def parse_args():
                         help="Directory to save stats and artifacts (default: ./stats)")
     parser.add_argument("--ci", action="store_true",
                         help="Run in CI mode (no interactive prompts, exit codes for CI)")
+    parser.add_argument("--chunk-size", type=int,
+                        help=f"Size of each chunk in characters (default from config)")
+    parser.add_argument("--chunk-overlap", type=int,
+                        help=f"Overlap between chunks in characters (default from config)")
+    parser.add_argument("--no-chunking", action="store_true",
+                        help="Don't split documents into chunks (use whole documents)")
     return parser.parse_args()
 
 def save_stats(stats, output_dir="./stats", ci_mode=False):
@@ -94,7 +100,8 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
     return filename, basic_stats
 
 def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
-                           force_recreate=False, output_dir="./stats", ci_mode=False):
+                           force_recreate=False, output_dir="./stats", ci_mode=False,
+                           use_chunking=True, should_save_stats=True, chunk_size=None, chunk_overlap=None):
     """
     Create or update the vector database with blog documents.
 
@@ -104,6 +111,10 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
         force_recreate: Whether to force recreation of the vector store
         output_dir: Directory to save stats and artifacts
         ci_mode: Whether to run in CI mode
+        use_chunking: Whether to split documents into chunks
+        should_save_stats: Whether to save statistics about the documents
+        chunk_size: Size of each chunk in characters (default from config)
+        chunk_overlap: Overlap between chunks in characters (default from config)
 
     Returns:
         Tuple of (success status, message, stats, stats_file, stats_file_content)
@@ -122,12 +133,20 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
     # Save stats for tracking
     stats_file = None
     stats_content = None
-    if
+    if should_save_stats:
         stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
 
     if use_chunking:
         logger.info("Chunking documents...")
-        documents = blog.split_documents(documents)
+        # Use provided chunk_size and chunk_overlap or default from config
+        chunking_params = {}
+        if chunk_size is not None:
+            chunking_params['chunk_size'] = chunk_size
+        if chunk_overlap is not None:
+            chunking_params['chunk_overlap'] = chunk_overlap
+
+        logger.info(f"Using chunk size: {chunking_params.get('chunk_size', 'default')} and overlap: {chunking_params.get('chunk_overlap', 'default')}")
+        documents = blog.split_documents(documents, **chunking_params)
 
 
 
@@ -183,6 +202,10 @@ def main():
     logger.info(f"Force recreate: {args.force_recreate}")
     logger.info(f"Output directory: {args.output_dir}")
     logger.info(f"CI mode: {args.ci}")
+    logger.info(f"Chunking: {not args.no_chunking}")
+    if not args.no_chunking:
+        logger.info(f"Chunk size: {args.chunk_size if args.chunk_size else 'default from config'}")
+        logger.info(f"Chunk overlap: {args.chunk_overlap if args.chunk_overlap else 'default from config'}")
     logger.info("========================")
 
     try:
@@ -192,7 +215,10 @@ def main():
             storage_path=VECTOR_STORAGE_PATH,
             force_recreate=args.force_recreate,
             output_dir=args.output_dir,
-            ci_mode=args.ci
+            ci_mode=args.ci,
+            use_chunking=not args.no_chunking,
+            chunk_size=args.chunk_size,
+            chunk_overlap=args.chunk_overlap
         )
 
         logger.info("\n=== Update Summary ===")
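The chunking_params dict in create_vector_database implements a deliberate pattern: argparse leaves --chunk-size and --chunk-overlap as None when unset, and only non-None values are forwarded, so the config-driven defaults on split_documents still win. A self-contained illustration of the same pattern (stand-in names, not code from this repo):

```python
def split(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """Stand-in for blog.split_documents with config-driven defaults."""
    step = chunk_size - chunk_overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]


def run(chunk_size=None, chunk_overlap=None):
    # Forward only the options the caller actually set; anything
    # left as None falls through to split()'s own defaults.
    params = {}
    if chunk_size is not None:
        params["chunk_size"] = chunk_size
    if chunk_overlap is not None:
        params["chunk_overlap"] = chunk_overlap
    return split("x" * 2500, **params)


print(len(run()))                 # defaults 1000/200 -> 4 chunks
print(len(run(chunk_size=1500)))  # override size only -> 2 chunks
```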