File size: 7,732 Bytes
af85e91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b1a955e7",
   "metadata": {},
   "source": [
    "# Update Blog Data\n",
    "\n",
    "This notebook demonstrates how to update the blog data and vector store when new blog posts are published. It uses the utility functions from `utils_data_loading.ipynb`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ec048b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "from pathlib import Path\n",
    "from dotenv import load_dotenv\n",
    "import importlib.util\n",
    "\n",
    "# Load environment variables from a .env file into os.environ (no-op if the file is absent)\n",
    "load_dotenv()\n",
    "\n",
    "# Import utility functions from utils_data_loading.ipynb.\n",
    "# The next cell defines a helper that executes that notebook's code cells\n",
    "# inside a fresh module object, making its top-level names importable here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f01d61f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to import the utility module\n",
    "def import_notebook_as_module(notebook_path, module_name=\"utils_module\"):\n",
    "    \"\"\"\n",
    "    Import a Jupyter notebook as a Python module.\n",
    "\n",
    "    Executes every code cell of the notebook inside a fresh module object\n",
    "    and registers it in sys.modules, so the notebook's top-level names\n",
    "    become attributes of the returned module.\n",
    "\n",
    "    Args:\n",
    "        notebook_path: Path to the notebook file (.ipynb)\n",
    "        module_name: Name to register the module under in sys.modules\n",
    "\n",
    "    Returns:\n",
    "        The imported module\n",
    "    \"\"\"\n",
    "    import nbformat\n",
    "    from importlib.util import spec_from_loader, module_from_spec\n",
    "    from IPython.core.interactiveshell import InteractiveShell\n",
    "\n",
    "    # Shell instance is used only to transform IPython-specific syntax\n",
    "    # (magics, shell escapes) into plain executable Python\n",
    "    shell = InteractiveShell.instance()\n",
    "\n",
    "    # Notebooks are UTF-8 by specification; be explicit so reading also\n",
    "    # works on platforms whose locale default encoding is not UTF-8\n",
    "    with open(notebook_path, encoding=\"utf-8\") as f:\n",
    "        nb = nbformat.read(f, as_version=4)\n",
    "\n",
    "    # Create an empty module (no loader) and register it\n",
    "    spec = spec_from_loader(module_name, loader=None)\n",
    "    module = module_from_spec(spec)\n",
    "    sys.modules[module_name] = module\n",
    "\n",
    "    # Execute only the code cells in the notebook\n",
    "    for cell in nb.cells:\n",
    "        if cell.cell_type == 'code':\n",
    "            # Skip script-entry cells. NOTE(review): this skips the WHOLE cell,\n",
    "            # so any definitions sharing a cell with a __main__ guard are lost;\n",
    "            # keep guards in their own cells in utils_data_loading.ipynb.\n",
    "            if 'if __name__ == \"__main__\":' in cell.source:\n",
    "                continue\n",
    "\n",
    "            # Transform, then execute the cell inside the module's namespace\n",
    "            code = shell.input_transformer_manager.transform_cell(cell.source)\n",
    "            exec(code, module.__dict__)\n",
    "\n",
    "    return module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "774c1373",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the utility functions by executing utils_data_loading.ipynb\n",
    "# into a module object (see import_notebook_as_module above)\n",
    "utils = import_notebook_as_module('utils_data_loading.ipynb')\n",
    "\n",
    "# All top-level names from that notebook are now attributes of `utils`\n",
    "print(\"Successfully imported utility functions.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "85ae6617",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Set up the configuration for data processing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54e9ca48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration (can be overridden from .env file)\n",
    "# Directory containing the raw blog post files\n",
    "DATA_DIR = os.environ.get(\"DATA_DIR\", \"data/\")\n",
    "# Where the vector store is persisted on disk\n",
    "VECTOR_STORAGE_PATH = os.environ.get(\"VECTOR_STORAGE_PATH\", \"./db/vectorstore_v3\")\n",
    "# Base URL used to build links back to published posts\n",
    "BLOG_BASE_URL = os.environ.get(\"BLOG_BASE_URL\", \"https://thedataguy.pro/blog/\")\n",
    "# Only the literal string \"true\" (case-insensitive) enables recreation\n",
    "FORCE_RECREATE_EMBEDDINGS = os.environ.get(\"FORCE_RECREATE_EMBEDDINGS\", \"false\").lower() == \"true\"\n",
    "\n",
    "# Echo the effective configuration into the run log\n",
    "print(f\"Data Directory: {DATA_DIR}\")\n",
    "print(f\"Vector Storage Path: {VECTOR_STORAGE_PATH}\")\n",
    "print(f\"Blog Base URL: {BLOG_BASE_URL}\")\n",
    "print(f\"Force Recreate Embeddings: {FORCE_RECREATE_EMBEDDINGS}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc19ab4c",
   "metadata": {},
   "source": [
    "## Update Blog Data Process\n",
    "\n",
    "This process will:\n",
    "1. Load existing blog posts\n",
    "2. Process and update metadata\n",
    "3. Create or update vector embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d56f688",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process blog posts and create/update embeddings.\n",
    "# NOTE(review): process_blog_posts is defined in utils_data_loading.ipynb;\n",
    "# assumed to return a dict with \"documents\", \"stats\" and \"vector_store\"\n",
    "# keys — confirm against that notebook.\n",
    "result = utils.process_blog_posts(\n",
    "    data_dir=DATA_DIR,\n",
    "    create_embeddings=True,\n",
    "    force_recreate_embeddings=FORCE_RECREATE_EMBEDDINGS\n",
    ")\n",
    "\n",
    "# Unpack the pieces used by the cells below\n",
    "documents = result[\"documents\"]\n",
    "stats = result[\"stats\"]\n",
    "vector_store = result[\"vector_store\"]\n",
    "\n",
    "print(f\"\\nProcessed {len(documents)} blog posts\")\n",
    "print(f\"Vector store created/updated at: {VECTOR_STORAGE_PATH}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad3b2dca",
   "metadata": {},
   "source": [
    "## Testing the Vector Store\n",
    "\n",
    "Let's test the vector store with a few queries to make sure it's working correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b552e6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a retriever from the vector store\n",
    "retriever = vector_store.as_retriever(search_kwargs={\"k\": 2})\n",
    "\n",
    "# Test queries\n",
    "test_queries = [\n",
    "    \"What is RAGAS?\",\n",
    "    \"How to build research agents?\",\n",
    "    \"What is metric driven development?\",\n",
    "    \"Who is TheDataGuy?\"\n",
    "]\n",
    "\n",
    "for query in test_queries:\n",
    "    print(f\"\\nQuery: {query}\")\n",
    "    docs = retriever.invoke(query)\n",
    "    print(f\"Retrieved {len(docs)} documents:\")\n",
    "    for i, doc in enumerate(docs):\n",
    "        title = doc.metadata.get(\"post_title\", \"Unknown\")\n",
    "        url = doc.metadata.get(\"url\", \"No URL\")\n",
    "        print(f\"{i+1}. {title} ({url})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddbe9282",
   "metadata": {},
   "source": [
    "## Schedule This Notebook\n",
    "\n",
    "To keep the blog data up-to-date, you can schedule this notebook to run periodically. \n",
    "Here are some options:\n",
    "\n",
    "1. Use a cron job to run this notebook with papermill\n",
    "2. Set up a GitHub Action to run this notebook on a schedule\n",
    "3. Use Airflow or another workflow management system\n",
    "\n",
    "Example of running with papermill:\n",
    "```bash\n",
    "papermill update_blog_data.ipynb output_$(date +%Y%m%d).ipynb\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3634e064",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save stats to a file for tracking changes over time\n",
    "import json\n",
    "from datetime import datetime\n",
    "\n",
    "stats_dir = Path(\"stats\")\n",
    "stats_dir.mkdir(exist_ok=True)\n",
    "\n",
    "# Capture a single timestamp so the value stored inside the stats and the\n",
    "# one embedded in the filename always agree (two datetime.now() calls could\n",
    "# straddle a second boundary and disagree)\n",
    "run_time = datetime.now()\n",
    "stats[\"timestamp\"] = run_time.isoformat()\n",
    "\n",
    "# Write pretty-printed JSON; explicit UTF-8 keeps the file portable across\n",
    "# platforms regardless of the locale's default encoding\n",
    "stats_path = stats_dir / f\"blog_stats_{run_time.strftime('%Y%m%d_%H%M%S')}.json\"\n",
    "with open(stats_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(stats, f, indent=2)\n",
    "\n",
    "print(f\"Saved stats to {stats_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}