mafzaal committed
Commit af85e91
1 Parent(s): 04abf37

feat: Enhance blog data processing with utility functions


- Added utility functions for loading and processing blog posts in `utils_data_loading.ipynb`.
- Implemented a new notebook `update_blog_data.ipynb` for updating blog data and vector store.
- Updated `app.py` to utilize utility functions for loading vector stores, with fallback to direct initialization.
- Improved error handling and logging during vector store loading.
- Added new dependencies in `pyproject.toml` for notebook processing and utilities.
- Documented the new utilities and usage instructions in `BLOG_DATA_UTILS.md`.

Files changed (5)
  1. BLOG_DATA_UTILS.md +72 -0
  2. app.py +82 -20
  3. pyproject.toml +2 -0
  4. update_blog_data.ipynb +256 -0
  5. utils_data_loading.ipynb +454 -0
BLOG_DATA_UTILS.md ADDED
@@ -0,0 +1,72 @@
+ # Blog Data Utilities
+ 
+ This directory contains utilities for loading, processing, and maintaining blog post data for the RAG system.
+ 
+ ## Available Tools
+ 
+ ### `utils_data_loading.ipynb`
+ 
+ This notebook contains utility functions for:
+ - Loading blog posts from the data directory
+ - Processing and enriching metadata (adding URLs, titles, etc.)
+ - Getting statistics about the documents
+ - Creating and updating vector embeddings
+ - Loading existing vector stores
+ 
+ ### `update_blog_data.ipynb`
+ 
+ This notebook demonstrates how to:
+ - Use the utility functions to update the blog data
+ - Process new blog posts
+ - Update the vector store
+ - Test the updated system with sample queries
+ - Track changes over time
+ 
+ ## How to Use
+ 
+ ### Updating Blog Data
+ 
+ When new blog posts are published, follow these steps:
+ 
+ 1. Add the markdown files to the `data/` directory
+ 2. Run the update notebook:
+ ```bash
+ cd /home/mafzaal/source/lets-talk
+ uv run jupyter nbconvert --to notebook --execute update_blog_data.ipynb --output executed_update_$(date +%Y%m%d).ipynb
+ ```
+ 
+ This will:
+ - Load all blog posts (including new ones)
+ - Update the vector embeddings
+ - Save statistics for tracking
+ 
+ ### Customizing the Process
+ 
+ You can customize the process by editing the `.env` file:
+ 
+ ```
+ DATA_DIR=data/                                      # Directory containing blog posts
+ VECTOR_STORAGE_PATH=./db/vectorstore_v3             # Path to vector store
+ EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l  # Embedding model
+ QDRANT_COLLECTION=thedataguy_documents              # Collection name
+ BLOG_BASE_URL=https://thedataguy.pro/blog/          # Base URL for blog
+ FORCE_RECREATE_EMBEDDINGS=false                     # Whether to force recreation
+ ```
+ 
+ ### In the Chainlit App
+ 
+ The Chainlit app (`app.py`) has been updated to use these utility functions if available. It falls back to direct initialization if they can't be loaded.
+ 
+ ## Adding Custom Processing
+ 
+ To add custom processing for blog posts:
+ 
+ 1. Edit the `update_document_metadata` function in `utils_data_loading.ipynb`
+ 2. Add any additional enrichment or processing steps
+ 3. Update the vector store using the `update_blog_data.ipynb` notebook
+ 
+ ## Future Improvements
+ 
+ - Add support for incremental updates (only process new posts)
+ - Add webhook support to automatically update when new posts are published
+ - Add tracking of embedding models and versions
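As a sketch of the custom-processing steps described in this file: a hypothetical enrichment added inside `update_document_metadata` could attach an estimated reading time to each post. The `reading_time_minutes` field and the 200-words-per-minute rate below are illustrative assumptions, not part of this commit:

```python
# Hypothetical extra enrichment for update_document_metadata in
# utils_data_loading.ipynb; the field names and the assumed reading
# speed (~200 words per minute) are illustrative, not from this commit.
for doc in documents:
    word_count = len(doc.page_content.split())
    doc.metadata["word_count"] = word_count
    # Round up so even very short posts report at least one minute.
    doc.metadata["reading_time_minutes"] = max(1, round(word_count / 200))
```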
app.py CHANGED
@@ -1,5 +1,7 @@
  import os
  import getpass
+ import sys
+ import importlib.util
  from pathlib import Path
  from operator import itemgetter
  from dotenv import load_dotenv
@@ -16,23 +18,68 @@ from langchain_qdrant import QdrantVectorStore
  from qdrant_client import QdrantClient
  from qdrant_client.http.models import Distance, VectorParams
  
- # Get vector storage path from .env file with fallback
- storage_path = Path(os.environ.get("VECTOR_STORAGE_PATH", "./db/vectorstore_v3"))
- #qclient = QdrantClient(storage_path)
- 
- # Load embedding model from environment variable with fallback
- embedding_model = os.environ.get("EMBEDDING_MODEL", "Snowflake/snowflake-arctic-embed-l")
- huggingface_embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
- 
- # Set up Qdrant vectorstore from existing collection
- collection_name = os.environ.get("QDRANT_COLLECTION", "thedataguy_documents")
+ # Import utility functions from the notebook
+ def import_notebook_functions(notebook_path):
+     """Import functions from a Jupyter notebook"""
+     import nbformat
+     from importlib.util import spec_from_loader, module_from_spec
+     from IPython.core.interactiveshell import InteractiveShell
+ 
+     # Create a module
+     module_name = Path(notebook_path).stem
+     spec = spec_from_loader(module_name, loader=None)
+     module = module_from_spec(spec)
+     sys.modules[module_name] = module
+ 
+     # Read the notebook
+     with open(notebook_path) as f:
+         nb = nbformat.read(f, as_version=4)
+ 
+     # Execute code cells
+     shell = InteractiveShell.instance()
+     for cell in nb.cells:
+         if cell.cell_type == 'code':
+             # Skip example code
+             if 'if __name__ == "__main__":' in cell.source:
+                 continue
+ 
+             code = shell.input_transformer_manager.transform_cell(cell.source)
+             exec(code, module.__dict__)
+ 
+     return module
  
- vector_store = QdrantVectorStore.from_existing_collection(
-     #client=qclient,
-     path=storage_path,
-     collection_name=collection_name,
-     embedding=huggingface_embeddings,
- )
+ # Try to import utility functions if available
+ try:
+     utils = import_notebook_functions('utils_data_loading.ipynb')
+ 
+     # Load vector store using the utility function
+     vector_store = utils.load_vector_store(
+         storage_path=os.environ.get("VECTOR_STORAGE_PATH", "./db/vectorstore_v3"),
+         collection_name=os.environ.get("QDRANT_COLLECTION", "thedataguy_documents"),
+         embedding_model=os.environ.get("EMBEDDING_MODEL", "Snowflake/snowflake-arctic-embed-l")
+     )
+ 
+     print("Successfully loaded vector store using utility functions")
+ 
+ except Exception as e:
+     print(f"Could not load utility functions: {e}")
+     print("Falling back to direct initialization")
+ 
+     # Get vector storage path from .env file with fallback
+     storage_path = Path(os.environ.get("VECTOR_STORAGE_PATH", "./db/vectorstore_v3"))
+ 
+     # Load embedding model from environment variable with fallback
+     embedding_model = os.environ.get("EMBEDDING_MODEL", "Snowflake/snowflake-arctic-embed-l")
+     huggingface_embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+ 
+     # Set up Qdrant vectorstore from existing collection
+     collection_name = os.environ.get("QDRANT_COLLECTION", "thedataguy_documents")
+ 
+     vector_store = QdrantVectorStore.from_existing_collection(
+         path=storage_path,
+         collection_name=collection_name,
+         embedding=huggingface_embeddings,
+     )
  
  
  # Create a retriever
@@ -97,14 +144,29 @@ async def on_message(message: cl.Message):
      # Get chain from user session
      chain = cl.user_session.get("chain")
  
-     print( message.content)
+     print(message.content)
      # Call the chain with the user message
-     response = chain.invoke({"question": message.content})
-
+     response = chain.invoke({"question": message.content})
+ 
+     # Get the sources to display them
+     sources = []
+     for doc in response["context"]:
+         if "url" in doc.metadata:
+             # Get title from post_title metadata if available, otherwise derive from URL
+             title = doc.metadata.get("post_title", "")
+             if not title:
+                 title = doc.metadata["url"].split("/")[-2].replace("-", " ").title()
+ 
+             sources.append(
+                 cl.Source(
+                     url=doc.metadata["url"],
+                     title=title
+                 )
+             )
  
      # Send the response with sources
      await cl.Message(
          content=response["response"].content,
-
+         sources=sources
      ).send()
  
pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.13"
  dependencies = [
      "chainlit>=2.5.5",
      "ipykernel>=6.29.5",
+     "ipython>=9.2.0",
      "langchain>=0.3.25",
      "langchain-community>=0.3.23",
      "langchain-core>=0.3.59",
@@ -14,6 +15,7 @@ dependencies = [
      "langchain-openai>=0.3.16",
      "langchain-qdrant>=0.2.0",
      "langchain-text-splitters>=0.3.8",
+     "nbformat>=5.10.4",
      "pandas>=2.2.3",
      "python-dotenv>=1.1.0",
      "qdrant-client>=1.14.2",
update_blog_data.ipynb ADDED
@@ -0,0 +1,256 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "b1a955e7",
+    "metadata": {},
+    "source": [
+     "# Update Blog Data\n",
+     "\n",
+     "This notebook demonstrates how to update the blog data and vector store when new blog posts are published. It uses the utility functions from `utils_data_loading.ipynb`."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "6ec048b4",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import os\n",
+     "from pathlib import Path\n",
+     "from dotenv import load_dotenv\n",
+     "import importlib.util\n",
+     "\n",
+     "# Load environment variables\n",
+     "load_dotenv()\n",
+     "\n",
+     "# Import utility functions from utils_data_loading.ipynb\n",
+     "# We'll do this by first converting the notebook to a Python module"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "7f01d61f",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Function to import the utility module\n",
+     "def import_notebook_as_module(notebook_path, module_name=\"utils_module\"):\n",
+     "    \"\"\"\n",
+     "    Import a Jupyter notebook as a Python module.\n",
+     "    \n",
+     "    Args:\n",
+     "        notebook_path: Path to the notebook\n",
+     "        module_name: Name to give the module\n",
+     "    \n",
+     "    Returns:\n",
+     "        The imported module\n",
+     "    \"\"\"\n",
+     "    import nbformat\n",
+     "    from importlib.util import spec_from_loader, module_from_spec\n",
+     "    from IPython.core.interactiveshell import InteractiveShell\n",
+     "    \n",
+     "    shell = InteractiveShell.instance()\n",
+     "    \n",
+     "    with open(notebook_path) as f:\n",
+     "        nb = nbformat.read(f, as_version=4)\n",
+     "    \n",
+     "    # Create a module\n",
+     "    spec = spec_from_loader(module_name, loader=None)\n",
+     "    module = module_from_spec(spec)\n",
+     "    sys.modules[module_name] = module\n",
+     "    \n",
+     "    # Execute only the code cells in the notebook\n",
+     "    for cell in nb.cells:\n",
+     "        if cell.cell_type == 'code':\n",
+     "            # Skip cells that start with certain keywords like \"if __name__ == \"__main__\":\"\n",
+     "            if 'if __name__ == \"__main__\":' in cell.source:\n",
+     "                continue\n",
+     "            \n",
+     "            # Execute the cell and store its content in the module\n",
+     "            code = shell.input_transformer_manager.transform_cell(cell.source)\n",
+     "            exec(code, module.__dict__)\n",
+     "    \n",
+     "    return module"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "774c1373",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Import the utility functions\n",
+     "utils = import_notebook_as_module('utils_data_loading.ipynb')\n",
+     "\n",
+     "# Now you can access all the functions from the utils module\n",
+     "print(\"Successfully imported utility functions.\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "85ae6617",
+    "metadata": {},
+    "source": [
+     "## Configuration\n",
+     "\n",
+     "Set up the configuration for data processing."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "54e9ca48",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Configuration (can be overridden from .env file)\n",
+     "DATA_DIR = os.environ.get(\"DATA_DIR\", \"data/\")\n",
+     "VECTOR_STORAGE_PATH = os.environ.get(\"VECTOR_STORAGE_PATH\", \"./db/vectorstore_v3\")\n",
+     "BLOG_BASE_URL = os.environ.get(\"BLOG_BASE_URL\", \"https://thedataguy.pro/blog/\")\n",
+     "FORCE_RECREATE_EMBEDDINGS = os.environ.get(\"FORCE_RECREATE_EMBEDDINGS\", \"false\").lower() == \"true\"\n",
+     "\n",
+     "print(f\"Data Directory: {DATA_DIR}\")\n",
+     "print(f\"Vector Storage Path: {VECTOR_STORAGE_PATH}\")\n",
+     "print(f\"Blog Base URL: {BLOG_BASE_URL}\")\n",
+     "print(f\"Force Recreate Embeddings: {FORCE_RECREATE_EMBEDDINGS}\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "cc19ab4c",
+    "metadata": {},
+    "source": [
+     "## Update Blog Data Process\n",
+     "\n",
+     "This process will:\n",
+     "1. Load existing blog posts\n",
+     "2. Process and update metadata\n",
+     "3. Create or update vector embeddings"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "3d56f688",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Process blog posts and create/update embeddings\n",
+     "result = utils.process_blog_posts(\n",
+     "    data_dir=DATA_DIR,\n",
+     "    create_embeddings=True,\n",
+     "    force_recreate_embeddings=FORCE_RECREATE_EMBEDDINGS\n",
+     ")\n",
+     "\n",
+     "# Access the documents and vector store\n",
+     "documents = result[\"documents\"]\n",
+     "stats = result[\"stats\"]\n",
+     "vector_store = result[\"vector_store\"]\n",
+     "\n",
+     "print(f\"\\nProcessed {len(documents)} blog posts\")\n",
+     "print(f\"Vector store created/updated at: {VECTOR_STORAGE_PATH}\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "ad3b2dca",
+    "metadata": {},
+    "source": [
+     "## Testing the Vector Store\n",
+     "\n",
+     "Let's test the vector store with a few queries to make sure it's working correctly."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "8b552e6b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Create a retriever from the vector store\n",
+     "retriever = vector_store.as_retriever(search_kwargs={\"k\": 2})\n",
+     "\n",
+     "# Test queries\n",
+     "test_queries = [\n",
+     "    \"What is RAGAS?\",\n",
+     "    \"How to build research agents?\",\n",
+     "    \"What is metric driven development?\",\n",
+     "    \"Who is TheDataGuy?\"\n",
+     "]\n",
+     "\n",
+     "for query in test_queries:\n",
+     "    print(f\"\\nQuery: {query}\")\n",
+     "    docs = retriever.invoke(query)\n",
+     "    print(f\"Retrieved {len(docs)} documents:\")\n",
+     "    for i, doc in enumerate(docs):\n",
+     "        title = doc.metadata.get(\"post_title\", \"Unknown\")\n",
+     "        url = doc.metadata.get(\"url\", \"No URL\")\n",
+     "        print(f\"{i+1}. {title} ({url})\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "ddbe9282",
+    "metadata": {},
+    "source": [
+     "## Schedule This Notebook\n",
+     "\n",
+     "To keep the blog data up-to-date, you can schedule this notebook to run periodically. \n",
+     "Here are some options:\n",
+     "\n",
+     "1. Use a cron job to run this notebook with papermill\n",
+     "2. Set up a GitHub Action to run this notebook on a schedule\n",
+     "3. Use Airflow or another workflow management system\n",
+     "\n",
+     "Example of running with papermill:\n",
+     "```bash\n",
+     "papermill update_blog_data.ipynb output_$(date +%Y%m%d).ipynb\n",
+     "```"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "3634e064",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Save stats to a file for tracking changes over time\n",
+     "import json\n",
+     "from datetime import datetime\n",
+     "\n",
+     "stats_dir = Path(\"stats\")\n",
+     "stats_dir.mkdir(exist_ok=True)\n",
+     "\n",
+     "# Add timestamp to stats\n",
+     "stats[\"timestamp\"] = datetime.now().isoformat()\n",
+     "\n",
+     "# Save stats\n",
+     "stats_path = stats_dir / f\"blog_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json\"\n",
+     "with open(stats_path, \"w\") as f:\n",
+     "    json.dump(stats, f, indent=2)\n",
+     "\n",
+     "print(f\"Saved stats to {stats_path}\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "name": "python",
+    "version": "3.13.2"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
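The scheduling cell above invokes papermill from the command line; papermill also exposes a Python API, so a cron-driven wrapper script might look like the sketch below. Note that papermill is not among the dependencies added in this commit's `pyproject.toml`, so treat it as an extra dependency to install first:

```python
# Sketch of a scheduled runner for update_blog_data.ipynb via papermill's
# Python API (papermill.execute_notebook). Assumes papermill is installed;
# the output filename pattern mirrors the bash example in the notebook.
from datetime import datetime
import papermill as pm

output = f"executed_update_{datetime.now():%Y%m%d}.ipynb"
pm.execute_notebook("update_blog_data.ipynb", output)
print(f"Wrote {output}")
```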
utils_data_loading.ipynb ADDED
@@ -0,0 +1,454 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "b31c2849",
+    "metadata": {},
+    "source": [
+     "# Utility Functions for Blog Post Loading and Processing\n",
+     "\n",
+     "This notebook contains utility functions for loading blog posts from the data directory, processing their metadata, and creating vector embeddings for use in the RAG system."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "848b0a86",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "import json\n",
+     "from pathlib import Path\n",
+     "from typing import List, Dict, Any, Optional\n",
+     "\n",
+     "from langchain_community.document_loaders import DirectoryLoader\n",
+     "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+     "from langchain.schema.document import Document\n",
+     "from langchain_huggingface import HuggingFaceEmbeddings\n",
+     "from langchain_community.vectorstores import Qdrant\n",
+     "\n",
+     "from IPython.display import Markdown, display\n",
+     "from dotenv import load_dotenv\n",
+     "\n",
+     "# Load environment variables from .env file\n",
+     "load_dotenv()"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "39e32435",
+    "metadata": {},
+    "source": [
+     "## Configuration\n",
+     "\n",
+     "Load configuration from environment variables or use defaults."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "5a6a5d6d",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Configuration with defaults\n",
+     "DATA_DIR = os.environ.get(\"DATA_DIR\", \"data/\")\n",
+     "VECTOR_STORAGE_PATH = os.environ.get(\"VECTOR_STORAGE_PATH\", \"./db/vectorstore_v3\")\n",
+     "EMBEDDING_MODEL = os.environ.get(\"EMBEDDING_MODEL\", \"Snowflake/snowflake-arctic-embed-l\")\n",
+     "QDRANT_COLLECTION = os.environ.get(\"QDRANT_COLLECTION\", \"thedataguy_documents\")\n",
+     "BLOG_BASE_URL = os.environ.get(\"BLOG_BASE_URL\", \"https://thedataguy.pro/blog/\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "01454147",
+    "metadata": {},
+    "source": [
+     "## Utility Functions\n",
+     "\n",
+     "These functions handle the loading, processing, and storing of blog posts."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "25792cd5",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def load_blog_posts(data_dir: str = DATA_DIR,\n",
+     "                    glob_pattern: str = \"*.md\",\n",
+     "                    recursive: bool = True,\n",
+     "                    show_progress: bool = True) -> List[Document]:\n",
+     "    \"\"\"\n",
+     "    Load blog posts from the specified directory.\n",
+     "    \n",
+     "    Args:\n",
+     "        data_dir: Directory containing the blog posts\n",
+     "        glob_pattern: Pattern to match files\n",
+     "        recursive: Whether to search subdirectories\n",
+     "        show_progress: Whether to show a progress bar\n",
+     "    \n",
+     "    Returns:\n",
+     "        List of Document objects containing the blog posts\n",
+     "    \"\"\"\n",
+     "    text_loader = DirectoryLoader(\n",
+     "        data_dir,\n",
+     "        glob=glob_pattern,\n",
+     "        show_progress=show_progress,\n",
+     "        recursive=recursive\n",
+     "    )\n",
+     "    \n",
+     "    documents = text_loader.load()\n",
+     "    print(f\"Loaded {len(documents)} documents from {data_dir}\")\n",
+     "    return documents"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e7ddba72",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def update_document_metadata(documents: List[Document],\n",
+     "                             data_dir_prefix: str = DATA_DIR,\n",
+     "                             blog_base_url: str = BLOG_BASE_URL,\n",
+     "                             remove_suffix: str = \"index.md\") -> List[Document]:\n",
+     "    \"\"\"\n",
+     "    Update the metadata of documents to include URL and other information.\n",
+     "    \n",
+     "    Args:\n",
+     "        documents: List of Document objects to update\n",
+     "        data_dir_prefix: Prefix to replace in source paths\n",
+     "        blog_base_url: Base URL for the blog posts\n",
+     "        remove_suffix: Suffix to remove from paths (like index.md)\n",
+     "    \n",
+     "    Returns:\n",
+     "        Updated list of Document objects\n",
+     "    \"\"\"\n",
+     "    for doc in documents:\n",
+     "        # Create URL from source path\n",
+     "        doc.metadata[\"url\"] = doc.metadata[\"source\"].replace(data_dir_prefix, blog_base_url)\n",
+     "        \n",
+     "        # Remove index.md or other suffix if present\n",
+     "        if remove_suffix and doc.metadata[\"url\"].endswith(remove_suffix):\n",
+     "            doc.metadata[\"url\"] = doc.metadata[\"url\"][:-len(remove_suffix)]\n",
+     "        \n",
+     "        # Extract post title from the directory structure\n",
+     "        path_parts = Path(doc.metadata[\"source\"]).parts\n",
+     "        if len(path_parts) > 1:\n",
+     "            # Use the directory name as post_slug\n",
+     "            doc.metadata[\"post_slug\"] = path_parts[-2]\n",
+     "            doc.metadata[\"post_title\"] = path_parts[-2].replace(\"-\", \" \").title()\n",
+     "        \n",
+     "        # Add document length as metadata\n",
+     "        doc.metadata[\"content_length\"] = len(doc.page_content)\n",
+     "    \n",
+     "    return documents"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e0dfe498",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_document_stats(documents: List[Document]) -> Dict[str, Any]:\n",
+     "    \"\"\"\n",
+     "    Get statistics about the documents.\n",
+     "    \n",
+     "    Args:\n",
+     "        documents: List of Document objects\n",
+     "    \n",
+     "    Returns:\n",
+     "        Dictionary with statistics\n",
+     "    \"\"\"\n",
+     "    stats = {\n",
+     "        \"total_documents\": len(documents),\n",
+     "        \"total_characters\": sum(len(doc.page_content) for doc in documents),\n",
+     "        \"min_length\": min(len(doc.page_content) for doc in documents),\n",
+     "        \"max_length\": max(len(doc.page_content) for doc in documents),\n",
+     "        \"avg_length\": sum(len(doc.page_content) for doc in documents) / len(documents) if documents else 0,\n",
+     "    }\n",
+     "    \n",
+     "    # Create a list of document info for analysis\n",
+     "    doc_info = []\n",
+     "    for doc in documents:\n",
+     "        doc_info.append({\n",
+     "            \"url\": doc.metadata.get(\"url\", \"\"),\n",
+     "            \"source\": doc.metadata.get(\"source\", \"\"),\n",
+     "            \"title\": doc.metadata.get(\"post_title\", \"\"),\n",
+     "            \"text_length\": doc.metadata.get(\"content_length\", 0),\n",
+     "        })\n",
+     "    \n",
+     "    stats[\"documents\"] = doc_info\n",
+     "    return stats"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "0ae139c0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def display_document_stats(stats: Dict[str, Any]):\n",
+     "    \"\"\"\n",
+     "    Display document statistics in a readable format.\n",
+     "    \n",
+     "    Args:\n",
+     "        stats: Dictionary with statistics from get_document_stats\n",
+     "    \"\"\"\n",
+     "    print(f\"Total Documents: {stats['total_documents']}\")\n",
+     "    print(f\"Total Characters: {stats['total_characters']}\")\n",
+     "    print(f\"Min Length: {stats['min_length']} characters\")\n",
+     "    print(f\"Max Length: {stats['max_length']} characters\")\n",
+     "    print(f\"Average Length: {stats['avg_length']:.2f} characters\")\n",
+     "    \n",
+     "    # Display documents as a table\n",
+     "    import pandas as pd\n",
+     "    if stats[\"documents\"]:\n",
+     "        df = pd.DataFrame(stats[\"documents\"])\n",
+     "        display(df)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "2dcf66b4",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def split_documents(documents: List[Document],\n",
+     "                    chunk_size: int = 1000,\n",
+     "                    chunk_overlap: int = 200) -> List[Document]:\n",
+     "    \"\"\"\n",
+     "    Split documents into chunks for better embedding and retrieval.\n",
+     "    \n",
+     "    Args:\n",
+     "        documents: List of Document objects to split\n",
+     "        chunk_size: Size of each chunk in characters\n",
+     "        chunk_overlap: Overlap between chunks in characters\n",
+     "    \n",
+     "    Returns:\n",
+     "        List of split Document objects\n",
+     "    \"\"\"\n",
+     "    text_splitter = RecursiveCharacterTextSplitter(\n",
+     "        chunk_size=chunk_size,\n",
+     "        chunk_overlap=chunk_overlap,\n",
+     "        length_function=len,\n",
+     "    )\n",
+     "    \n",
+     "    split_docs = text_splitter.split_documents(documents)\n",
+     "    print(f\"Split {len(documents)} documents into {len(split_docs)} chunks\")\n",
+     "    return split_docs"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "527ad848",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def create_vector_store(documents: List[Document],\n",
+     "                        storage_path: str = VECTOR_STORAGE_PATH,\n",
+     "                        collection_name: str = QDRANT_COLLECTION,\n",
+     "                        embedding_model: str = EMBEDDING_MODEL,\n",
+     "                        force_recreate: bool = False) -> Qdrant:\n",
+     "    \"\"\"\n",
+     "    Create a vector store from documents.\n",
+     "    \n",
+     "    Args:\n",
+     "        documents: List of Document objects to store\n",
+     "        storage_path: Path to the vector store\n",
+     "        collection_name: Name of the collection\n",
+     "        embedding_model: Name of the embedding model\n",
+     "        force_recreate: Whether to force recreation of the vector store\n",
+     "    \n",
+     "    Returns:\n",
+     "        Qdrant vector store\n",
+     "    \"\"\"\n",
+     "    # Initialize the embedding model\n",
+     "    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)\n",
+     "    \n",
+     "    # Create the directory if it doesn't exist\n",
+     "    storage_dir = Path(storage_path).parent\n",
+     "    os.makedirs(storage_dir, exist_ok=True)\n",
+     "    \n",
+     "    # Check if vector store exists\n",
+     "    vector_store_exists = Path(storage_path).exists() and not force_recreate\n",
+     "    \n",
+     "    if vector_store_exists:\n",
+     "        print(f\"Loading existing vector store from {storage_path}\")\n",
+     "        try:\n",
+     "            vector_store = Qdrant(\n",
+     "                path=storage_path,\n",
+     "                embedding_function=embeddings,\n",
+     "                collection_name=collection_name\n",
+     "            )\n",
+     "            return vector_store\n",
+     "        except Exception as e:\n",
+     "            print(f\"Error loading existing vector store: {e}\")\n",
+     "            print(\"Creating new vector store...\")\n",
+     "            force_recreate = True\n",
+     "    \n",
+     "    # Create new vector store\n",
+     "    print(f\"Creating new vector store at {storage_path}\")\n",
+     "    vector_store = Qdrant.from_documents(\n",
+     "        documents=documents,\n",
+     "        embedding=embeddings,\n",
+     "        path=storage_path,\n",
+     "        collection_name=collection_name,\n",
+     "    )\n",
+     "    \n",
+     "    return vector_store"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "c78f99fc",
+    "metadata": {},
+    "source": [
+     "## Example Usage\n",
+     "\n",
+     "Here's how to use these utility functions for processing blog posts."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "132d32c6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def process_blog_posts(data_dir: str = DATA_DIR,\n",
+     "                       create_embeddings: bool = True,\n",
+     "                       force_recreate_embeddings: bool = False):\n",
+     "    \"\"\"\n",
+     "    Complete pipeline to process blog posts and optionally create vector embeddings.\n",
+     "    \n",
+     "    Args:\n",
+     "        data_dir: Directory containing the blog posts\n",
+     "        create_embeddings: Whether to create vector embeddings\n",
+     "        force_recreate_embeddings: Whether to force recreation of embeddings\n",
+     "    \n",
+     "    Returns:\n",
+     "        Dictionary with data and vector store (if created)\n",
+     "    \"\"\"\n",
+     "    # Load documents\n",
+     "    documents = load_blog_posts(data_dir)\n",
+     "    \n",
+     "    # Update metadata\n",
+     "    documents = update_document_metadata(documents)\n",
+     "    \n",
+     "    # Get and display stats\n",
+     "    stats = get_document_stats(documents)\n",
+     "    display_document_stats(stats)\n",
+     "    \n",
+     "    result = {\n",
+     "        \"documents\": documents,\n",
+     "        \"stats\": stats,\n",
+     "        \"vector_store\": None\n",
+     "    }\n",
+     "    \n",
+     "    # Create vector store if requested\n",
+     "    if create_embeddings:\n",
+     "        vector_store = create_vector_store(\n",
+     "            documents,\n",
+     "            force_recreate=force_recreate_embeddings\n",
+     "        )\n",
+     "        result[\"vector_store\"] = vector_store\n",
+     "    \n",
+     "    return result"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "266d4fb3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Example usage\n",
+     "if __name__ == \"__main__\":\n",
+     "    # Process blog posts without creating embeddings\n",
+     "    result = process_blog_posts(create_embeddings=False)\n",
+     "    \n",
+     "    # Example: Access the documents\n",
+     "    print(f\"\\nDocument example: {result['documents'][0].metadata}\")\n",
+     "    \n",
+     "    # Create embeddings if needed\n",
+     "    # result = process_blog_posts(create_embeddings=True)\n",
+     "    \n",
+     "    # Retriever example\n",
+     "    # retriever = result[\"vector_store\"].as_retriever()\n",
+     "    # query = \"What is RAGAS?\"\n",
+     "    # docs = retriever.invoke(query, k=2)\n",
+     "    # print(f\"\\nRetrieved {len(docs)} documents for query: {query}\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "22132649",
+    "metadata": {},
+    "source": [
+     "## Function for Loading Existing Vector Store\n",
+     "\n",
+     "This function can be used to load an existing vector store without reprocessing all blog posts."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c24e0c02",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def load_vector_store(storage_path: str = VECTOR_STORAGE_PATH,\n",
+     "                      collection_name: str = QDRANT_COLLECTION,\n",
+     "                      embedding_model: str = EMBEDDING_MODEL) -> Optional[Qdrant]:\n",
+     "    \"\"\"\n",
+     "    Load an existing vector store.\n",
+     "    \n",
+     "    Args:\n",
+     "        storage_path: Path to the vector store\n",
+     "        collection_name: Name of the collection\n",
+     "        embedding_model: Name of the embedding model\n",
+     "    \n",
+     "    Returns:\n",
+     "        Qdrant vector store or None if it doesn't exist\n",
+     "    \"\"\"\n",
+     "    # Initialize the embedding model\n",
+     "    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)\n",
+     "    \n",
+     "    # Check if vector store exists\n",
+     "    if not Path(storage_path).exists():\n",
+     "        print(f\"Vector store not found at {storage_path}\")\n",
+     "        return None\n",
+     "    \n",
+     "    try:\n",
+     "        vector_store = Qdrant(\n",
+     "            path=storage_path,\n",
+     "            embedding_function=embeddings,\n",
+     "            collection_name=collection_name\n",
+     "        )\n",
+     "        print(f\"Loaded vector store from {storage_path}\")\n",
+     "        return vector_store\n",
+     "    except Exception as e:\n",
+     "        print(f\"Error loading vector store: {e}\")\n",
+     "        return None"
+    ]
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
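Putting the pieces together, a minimal consumer of these utilities (assuming the notebook's functions are in scope, for example via the `import_notebook_functions` helper added to `app.py`) might look like this sketch:

```python
# Sketch: load the persisted store if present, otherwise rebuild it from
# the blog posts. load_vector_store and process_blog_posts are defined in
# utils_data_loading.ipynb; the query string is just an example.
vector_store = load_vector_store()  # returns None if the store is missing
if vector_store is None:
    vector_store = process_blog_posts(create_embeddings=True)["vector_store"]

retriever = vector_store.as_retriever(search_kwargs={"k": 2})
for doc in retriever.invoke("What is RAGAS?"):
    print(doc.metadata.get("post_title"), doc.metadata.get("url"))
```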