{ "cells": [ { "cell_type": "markdown", "id": "3b368f39", "metadata": {}, "source": [ "# Dealing with the Data\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "95ebfe0e", "metadata": {}, "outputs": [], "source": [ "import os\n", "import getpass\n", "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API Key:\")" ] }, { "cell_type": "markdown", "id": "c869f7a5", "metadata": {}, "source": [ "TheDataGuy's blog posts are written in Markdown in a git repo, which is why we copied the docs here for the initial version\n", "\n", "- 14 posts currently\n", "\n", "TODO:\n", "\n", "[ ] - Develop a pipeline to ingest data as new posts are published\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a523f17c", "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import DirectoryLoader\n", "\n", "path = \"data/\"\n", "text_loader = DirectoryLoader(path, glob=\"*.md\", show_progress=True, recursive=True)\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "6c0cc8c8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 14/14 [00:00<00:00, 41.74it/s]\n" ] } ], "source": [ "raw_docs = text_loader.load()" ] }, { "cell_type": "code", "execution_count": 15, "id": "774a9a99", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "14" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(raw_docs)" ] }, { "cell_type": "code", "execution_count": null, "id": "4c7a6cdb", "metadata": {}, "outputs": [], "source": [ "# Derive each post's public URL from its file path and store it in metadata[\"url\"]:\n", "# drop the leading data directory (`path`, i.e. \"data/\") and the trailing \"index.md\",\n", "# then prepend the blog base URL.\n", "# Only the prefix and suffix are stripped (not every occurrence), so a slug that\n", "# itself contains \"data/\" or \"index.md\" (e.g. \"big-data/index.md\") is not corrupted.\n", "BLOG_BASE_URL = \"https://thedataguy.pro/blog/\"\n", "INDEX_FILE = \"index.md\"\n", "\n", "for doc in raw_docs:\n", "    relative_path = doc.metadata[\"source\"]\n", "    if relative_path.startswith(path):\n", "        relative_path = relative_path[len(path):]\n", "    if relative_path.endswith(INDEX_FILE):\n", "        relative_path = relative_path[: -len(INDEX_FILE)]\n", "    doc.metadata[\"url\"] = BLOG_BASE_URL + relative_path" ] }, { "cell_type": "code", "execution_count": 129, "id": "1a3f7432", "metadata": {}, "outputs": [], "source":
[ "# Summarise each loaded post: public URL, source file path and character count\n", "doc_info = [\n", "    {\n", "        \"url\": post.metadata[\"url\"],\n", "        \"source\": post.metadata[\"source\"],\n", "        \"text_length\": len(post.page_content),\n", "    }\n", "    for post in raw_docs\n", "]\n" ] }, { "cell_type": "code", "execution_count": 130, "id": "9f99da77", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "13468" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max(len(doc.page_content) for doc in raw_docs)" ] }, { "cell_type": "markdown", "id": "a6aa672d", "metadata": {}, "source": [ "The longest blog post contains 13,468 characters. Ideally, we would like to retrieve full post content based on the query. \n", "\n", "> That's why chunking based on the blog post might be the best way." ] }, { "cell_type": "code", "execution_count": 131, "id": "6106468f", "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "url", "rawType": "object", "type": "string" }, { "name": "source", "rawType": "object", "type": "string" }, { "name": "text_length", "rawType": "int64", "type": "integer" } ], "conversionMethod": "pd.DataFrame", "ref": "bbd3b888-a367-492d-9809-454ac5ea11ba", "rows": [ [ "0", "https://thedataguy.pro/introduction-to-ragas/", "data/introduction-to-ragas/index.md", "6071" ], [ "1", "https://thedataguy.pro/generating-test-data-with-ragas/", "data/generating-test-data-with-ragas/index.md", "13468" ], [ "2", "https://thedataguy.pro/advanced-metrics-and-customization-with-ragas/", "data/advanced-metrics-and-customization-with-ragas/index.md", "10455" ], [ "3", "https://thedataguy.pro/building-research-agent/", "data/building-research-agent/index.md", "6877" ], [ "4", "https://thedataguy.pro/rss-feed-announcement/", "data/rss-feed-announcement/index.md", "1900" ], [ "5",
"https://thedataguy.pro/metric-driven-development/", "data/metric-driven-development/index.md", "11879" ], [ "6", "https://thedataguy.pro/basic-evaluation-workflow-with-ragas/", "data/basic-evaluation-workflow-with-ragas/index.md", "9164" ], [ "7", "https://thedataguy.pro/langchain-experience-csharp-perspective/", "data/langchain-experience-csharp-perspective/index.md", "3070" ], [ "8", "https://thedataguy.pro/evaluating-ai-agents-with-ragas/", "data/evaluating-ai-agents-with-ragas/index.md", "8907" ], [ "9", "https://thedataguy.pro/integrations-and-observability-with-ragas/", "data/integrations-and-observability-with-ragas/index.md", "8221" ], [ "10", "https://thedataguy.pro/building-feedback-loops-with-ragas/", "data/building-feedback-loops-with-ragas/index.md", "6891" ], [ "11", "https://thedataguy.pro/coming-back-to-ai-roots/", "data/coming-back-to-ai-roots/index.md", "5711" ], [ "12", "https://thedataguy.pro/data-is-king/", "data/data-is-king/index.md", "5987" ], [ "13", "https://thedataguy.pro/evaluating-rag-systems-with-ragas/", "data/evaluating-rag-systems-with-ragas/index.md", "7674" ] ], "shape": { "columns": 3, "rows": 14 } }, "text/html": [ "
\n", " | url | \n", "source | \n", "text_length | \n", "
---|---|---|---|
0 | \n", "https://thedataguy.pro/introduction-to-ragas/ | \n", "data/introduction-to-ragas/index.md | \n", "6071 | \n", "
1 | \n", "https://thedataguy.pro/generating-test-data-wi... | \n", "data/generating-test-data-with-ragas/index.md | \n", "13468 | \n", "
2 | \n", "https://thedataguy.pro/advanced-metrics-and-cu... | \n", "data/advanced-metrics-and-customization-with-r... | \n", "10455 | \n", "
3 | \n", "https://thedataguy.pro/building-research-agent/ | \n", "data/building-research-agent/index.md | \n", "6877 | \n", "
4 | \n", "https://thedataguy.pro/rss-feed-announcement/ | \n", "data/rss-feed-announcement/index.md | \n", "1900 | \n", "
5 | \n", "https://thedataguy.pro/metric-driven-development/ | \n", "data/metric-driven-development/index.md | \n", "11879 | \n", "
6 | \n", "https://thedataguy.pro/basic-evaluation-workfl... | \n", "data/basic-evaluation-workflow-with-ragas/inde... | \n", "9164 | \n", "
7 | \n", "https://thedataguy.pro/langchain-experience-cs... | \n", "data/langchain-experience-csharp-perspective/i... | \n", "3070 | \n", "
8 | \n", "https://thedataguy.pro/evaluating-ai-agents-wi... | \n", "data/evaluating-ai-agents-with-ragas/index.md | \n", "8907 | \n", "
9 | \n", "https://thedataguy.pro/integrations-and-observ... | \n", "data/integrations-and-observability-with-ragas... | \n", "8221 | \n", "
10 | \n", "https://thedataguy.pro/building-feedback-loops... | \n", "data/building-feedback-loops-with-ragas/index.md | \n", "6891 | \n", "
11 | \n", "https://thedataguy.pro/coming-back-to-ai-roots/ | \n", "data/coming-back-to-ai-roots/index.md | \n", "5711 | \n", "
12 | \n", "https://thedataguy.pro/data-is-king/ | \n", "data/data-is-king/index.md | \n", "5987 | \n", "
13 | \n", "https://thedataguy.pro/evaluating-rag-systems-... | \n", "data/evaluating-rag-systems-with-ragas/index.md | \n", "7674 | \n", "