File size: 7,732 Bytes
af85e91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b1a955e7",
   "metadata": {},
   "source": [
    "# Update Blog Data\n",
    "\n",
    "This notebook demonstrates how to update the blog data and vector store when new blog posts are published. It uses the utility functions from `utils_data_loading.ipynb`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ec048b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "from pathlib import Path\n",
    "from dotenv import load_dotenv\n",
    "import importlib.util\n",
    "\n",
    "# Load environment variables from a .env file into os.environ (no-op if the file is absent)\n",
    "load_dotenv()\n",
    "\n",
    "# Import utility functions from utils_data_loading.ipynb.\n",
    "# The next cell defines a helper that executes that notebook's code cells\n",
    "# inside a fresh module object, making its top-level names importable here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f01d61f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to import the utility module\n",
    "def import_notebook_as_module(notebook_path, module_name=\"utils_module\"):\n",
    "    \"\"\"\n",
    "    Import a Jupyter notebook as a Python module.\n",
    "\n",
    "    Executes every code cell of the notebook inside a fresh module object\n",
    "    and registers it in sys.modules, so the notebook's top-level names\n",
    "    become attributes of the returned module.\n",
    "\n",
    "    Args:\n",
    "        notebook_path: Path to the notebook file (.ipynb)\n",
    "        module_name: Name to register the module under in sys.modules\n",
    "\n",
    "    Returns:\n",
    "        The imported module\n",
    "    \"\"\"\n",
    "    import nbformat\n",
    "    from importlib.util import spec_from_loader, module_from_spec\n",
    "    from IPython.core.interactiveshell import InteractiveShell\n",
    "\n",
    "    # Shell instance is used only to transform IPython-specific syntax\n",
    "    # (magics, shell escapes) into plain executable Python\n",
    "    shell = InteractiveShell.instance()\n",
    "\n",
    "    # Notebooks are UTF-8 by specification; be explicit so reading also\n",
    "    # works on platforms whose locale default encoding is not UTF-8\n",
    "    with open(notebook_path, encoding=\"utf-8\") as f:\n",
    "        nb = nbformat.read(f, as_version=4)\n",
    "\n",
    "    # Create an empty module (no loader) and register it\n",
    "    spec = spec_from_loader(module_name, loader=None)\n",
    "    module = module_from_spec(spec)\n",
    "    sys.modules[module_name] = module\n",
    "\n",
    "    # Execute only the code cells in the notebook\n",
    "    for cell in nb.cells:\n",
    "        if cell.cell_type == 'code':\n",
    "            # Skip script-entry cells. NOTE(review): this skips the WHOLE cell,\n",
    "            # so any definitions sharing a cell with a __main__ guard are lost;\n",
    "            # keep guards in their own cells in utils_data_loading.ipynb.\n",
    "            if 'if __name__ == \"__main__\":' in cell.source:\n",
    "                continue\n",
    "\n",
    "            # Transform, then execute the cell inside the module's namespace\n",
    "            code = shell.input_transformer_manager.transform_cell(cell.source)\n",
    "            exec(code, module.__dict__)\n",
    "\n",
    "    return module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "774c1373",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the utility functions by executing utils_data_loading.ipynb\n",
    "# into a module object (see import_notebook_as_module above)\n",
    "utils = import_notebook_as_module('utils_data_loading.ipynb')\n",
    "\n",
    "# All top-level names from that notebook are now attributes of `utils`\n",
    "print(\"Successfully imported utility functions.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "85ae6617",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Set up the configuration for data processing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54e9ca48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration (can be overridden from .env file)\n",
    "# Directory containing the raw blog post files\n",
    "DATA_DIR = os.environ.get(\"DATA_DIR\", \"data/\")\n",
    "# Where the vector store is persisted on disk\n",
    "VECTOR_STORAGE_PATH = os.environ.get(\"VECTOR_STORAGE_PATH\", \"./db/vectorstore_v3\")\n",
    "# Base URL used to build links back to published posts\n",
    "BLOG_BASE_URL = os.environ.get(\"BLOG_BASE_URL\", \"https://thedataguy.pro/blog/\")\n",
    "# Only the literal string \"true\" (case-insensitive) enables recreation\n",
    "FORCE_RECREATE_EMBEDDINGS = os.environ.get(\"FORCE_RECREATE_EMBEDDINGS\", \"false\").lower() == \"true\"\n",
    "\n",
    "# Echo the effective configuration into the run log\n",
    "print(f\"Data Directory: {DATA_DIR}\")\n",
    "print(f\"Vector Storage Path: {VECTOR_STORAGE_PATH}\")\n",
    "print(f\"Blog Base URL: {BLOG_BASE_URL}\")\n",
    "print(f\"Force Recreate Embeddings: {FORCE_RECREATE_EMBEDDINGS}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc19ab4c",
   "metadata": {},
   "source": [
    "## Update Blog Data Process\n",
    "\n",
    "This process will:\n",
    "1. Load existing blog posts\n",
    "2. Process and update metadata\n",
    "3. Create or update vector embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d56f688",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process blog posts and create/update embeddings.\n",
    "# NOTE(review): process_blog_posts is defined in utils_data_loading.ipynb;\n",
    "# assumed to return a dict with \"documents\", \"stats\" and \"vector_store\"\n",
    "# keys — confirm against that notebook.\n",
    "result = utils.process_blog_posts(\n",
    "    data_dir=DATA_DIR,\n",
    "    create_embeddings=True,\n",
    "    force_recreate_embeddings=FORCE_RECREATE_EMBEDDINGS\n",
    ")\n",
    "\n",
    "# Unpack the pieces used by the cells below\n",
    "documents = result[\"documents\"]\n",
    "stats = result[\"stats\"]\n",
    "vector_store = result[\"vector_store\"]\n",
    "\n",
    "print(f\"\\nProcessed {len(documents)} blog posts\")\n",
    "print(f\"Vector store created/updated at: {VECTOR_STORAGE_PATH}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad3b2dca",
   "metadata": {},
   "source": [
    "## Testing the Vector Store\n",
    "\n",
    "Let's test the vector store with a few queries to make sure it's working correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b552e6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a retriever from the vector store\n",
    "retriever = vector_store.as_retriever(search_kwargs={\"k\": 2})\n",
    "\n",
    "# Test queries\n",
    "test_queries = [\n",
    "    \"What is RAGAS?\",\n",
    "    \"How to build research agents?\",\n",
    "    \"What is metric driven development?\",\n",
    "    \"Who is TheDataGuy?\"\n",
    "]\n",
    "\n",
    "for query in test_queries:\n",
    "    print(f\"\\nQuery: {query}\")\n",
    "    docs = retriever.invoke(query)\n",
    "    print(f\"Retrieved {len(docs)} documents:\")\n",
    "    for i, doc in enumerate(docs):\n",
    "        title = doc.metadata.get(\"post_title\", \"Unknown\")\n",
    "        url = doc.metadata.get(\"url\", \"No URL\")\n",
    "        print(f\"{i+1}. {title} ({url})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddbe9282",
   "metadata": {},
   "source": [
    "## Schedule This Notebook\n",
    "\n",
    "To keep the blog data up-to-date, you can schedule this notebook to run periodically. \n",
    "Here are some options:\n",
    "\n",
    "1. Use a cron job to run this notebook with papermill\n",
    "2. Set up a GitHub Action to run this notebook on a schedule\n",
    "3. Use Airflow or another workflow management system\n",
    "\n",
    "Example of running with papermill:\n",
    "```bash\n",
    "papermill update_blog_data.ipynb output_$(date +%Y%m%d).ipynb\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3634e064",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save stats to a file for tracking changes over time\n",
    "import json\n",
    "from datetime import datetime\n",
    "\n",
    "stats_dir = Path(\"stats\")\n",
    "stats_dir.mkdir(exist_ok=True)\n",
    "\n",
    "# Capture a single timestamp so the value stored inside the stats and the\n",
    "# one embedded in the filename always agree (two datetime.now() calls could\n",
    "# straddle a second boundary and disagree)\n",
    "run_time = datetime.now()\n",
    "stats[\"timestamp\"] = run_time.isoformat()\n",
    "\n",
    "# Write pretty-printed JSON; explicit UTF-8 keeps the file portable across\n",
    "# platforms regardless of the locale's default encoding\n",
    "stats_path = stats_dir / f\"blog_stats_{run_time.strftime('%Y%m%d_%H%M%S')}.json\"\n",
    "with open(stats_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(stats, f, indent=2)\n",
    "\n",
    "print(f\"Saved stats to {stats_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}