# Utility Functions for Blog Post Loading and Processing

This notebook contains utility functions for loading blog posts from the data directory, processing their metadata, and creating vector embeddings for use in the RAG system.

In [None]:
import os
import json
from pathlib import Path
from typing import List, Dict, Any, Optional

from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant

from IPython.display import Markdown, display
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

## Configuration

Load configuration from environment variables or use defaults.

In [None]:
# Runtime settings. Each value is taken from the process environment and
# falls back to the literal default shown here when the variable is unset.

# Directory that holds the blog post files
DATA_DIR = os.getenv("DATA_DIR", "data/")

# On-disk location of the vector store
VECTOR_STORAGE_PATH = os.getenv("VECTOR_STORAGE_PATH", "./db/vectorstore_v3")

# Name of the embedding model handed to HuggingFaceEmbeddings
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "Snowflake/snowflake-arctic-embed-l")

# Collection name used inside the vector store
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "thedataguy_documents")

# Base URL that document source paths are mapped onto
BLOG_BASE_URL = os.getenv("BLOG_BASE_URL", "https://thedataguy.pro/blog/")

## Utility Functions

These functions handle the loading, processing, and storing of blog posts.

In [None]:
def load_blog_posts(data_dir: str = DATA_DIR,
                    glob_pattern: str = "*.md",
                    recursive: bool = True,
                    show_progress: bool = True) -> List[Document]:
    """
    Read every matching file under ``data_dir`` into Document objects.

    A one-line summary with the number of loaded documents is printed.

    Args:
        data_dir: Root directory that holds the blog posts
        glob_pattern: Glob used to select files inside ``data_dir``
        recursive: Passed to the loader; whether subdirectories are searched
        show_progress: Passed to the loader; whether a progress bar is shown

    Returns:
        The list of documents returned by ``DirectoryLoader.load()``
    """
    loader_options = {
        "glob": glob_pattern,
        "show_progress": show_progress,
        "recursive": recursive,
    }
    posts = DirectoryLoader(data_dir, **loader_options).load()

    print(f"Loaded {len(posts)} documents from {data_dir}")
    return posts

In [None]:
def update_document_metadata(documents: List[Document],
                             data_dir_prefix: str = DATA_DIR,
                             blog_base_url: str = BLOG_BASE_URL,
                             remove_suffix: str = "index.md") -> List[Document]:
    """
    Update the metadata of documents to include URL and other information.

    Each document is modified in place; the following metadata keys are set:
    ``url``, ``content_length`` and, when the source path has at least two
    components, ``post_slug`` and ``post_title``.

    Args:
        documents: List of Document objects to update
        data_dir_prefix: Leading part of the source path to swap for the base URL
        blog_base_url: Base URL for the blog posts
        remove_suffix: Suffix to remove from the URL (like index.md); a falsy
            value disables the removal

    Returns:
        The same list of Document objects, with updated metadata

    Raises:
        KeyError: If a document has no ``source`` entry in its metadata
    """
    for doc in documents:
        source = doc.metadata["source"]

        # Swap the data-directory prefix for the base URL exactly once.
        # A plain str.replace() rewrites EVERY occurrence, which corrupts
        # paths such as "data/big-data/index.md" (the "data/" inside the slug
        # would be replaced too) and mangles the string when the prefix is "".
        if source.startswith(data_dir_prefix):
            url = blog_base_url + source[len(data_dir_prefix):]
        else:
            # Source is not rooted at the prefix: fall back to replacing the
            # first occurrence only, leaving the rest of the path intact.
            url = source.replace(data_dir_prefix, blog_base_url, 1)

        # Remove index.md or other suffix if present
        if remove_suffix and url.endswith(remove_suffix):
            url = url[:-len(remove_suffix)]
        doc.metadata["url"] = url

        # The parent directory name doubles as slug and (prettified) title
        path_parts = Path(source).parts
        if len(path_parts) > 1:
            slug = path_parts[-2]
            doc.metadata["post_slug"] = slug
            doc.metadata["post_title"] = slug.replace("-", " ").title()

        # Add document length as metadata
        doc.metadata["content_length"] = len(doc.page_content)

    return documents

In [None]:
def get_document_stats(documents: List[Document]) -> Dict[str, Any]:
    """
    Get statistics about the documents.

    Args:
        documents: List of Document objects

    Returns:
        Dictionary with the keys ``total_documents``, ``total_characters``,
        ``min_length``, ``max_length``, ``avg_length`` and ``documents``.
        The numeric values are all 0 when ``documents`` is empty.
        ``documents`` holds one dict per input document with the keys
        ``url``, ``source``, ``title`` and ``text_length``, read from the
        document metadata (empty string / 0 when the metadata key is absent).
    """
    # Measure every document once instead of re-scanning for each statistic
    lengths = [len(doc.page_content) for doc in documents]
    total_characters = sum(lengths)

    stats = {
        "total_documents": len(lengths),
        "total_characters": total_characters,
        # default=0 keeps an empty input from raising ValueError, matching
        # the empty-input guard on avg_length below
        "min_length": min(lengths, default=0),
        "max_length": max(lengths, default=0),
        "avg_length": total_characters / len(lengths) if lengths else 0,
    }

    # Per-document details for analysis
    stats["documents"] = [
        {
            "url": doc.metadata.get("url", ""),
            "source": doc.metadata.get("source", ""),
            "title": doc.metadata.get("post_title", ""),
            "text_length": doc.metadata.get("content_length", 0),
        }
        for doc in documents
    ]
    return stats

In [None]:
def display_document_stats(stats: Dict[str, Any]):
    """
    Print the summary figures and render the per-document table.

    Args:
        stats: Dictionary with statistics from get_document_stats
    """
    # (label, key in stats, format spec, trailing text) for each summary line
    summary_layout = (
        ("Total Documents", "total_documents", "", ""),
        ("Total Characters", "total_characters", "", ""),
        ("Min Length", "min_length", "", " characters"),
        ("Max Length", "max_length", "", " characters"),
        ("Average Length", "avg_length", ".2f", " characters"),
    )
    for label, key, spec, trailer in summary_layout:
        print(f"{label}: {stats[key]:{spec}}{trailer}")

    # The table is only rendered when there is at least one document entry
    import pandas as pd
    rows = stats["documents"]
    if not rows:
        return
    display(pd.DataFrame(rows))

In [None]:
def split_documents(documents: List[Document],
                    chunk_size: int = 1000,
                    chunk_overlap: int = 200) -> List[Document]:
    """
    Break documents into smaller chunks for embedding and retrieval.

    A one-line summary with the document and chunk counts is printed.

    Args:
        documents: List of Document objects to split
        chunk_size: Chunk size handed to the splitter (measured with ``len``)
        chunk_overlap: Overlap between chunks handed to the splitter

    Returns:
        List of split Document objects
    """
    splitter_config = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = RecursiveCharacterTextSplitter(**splitter_config).split_documents(documents)

    print(f"Split {len(documents)} documents into {len(chunks)} chunks")
    return chunks

In [None]:
def create_vector_store(documents: List[Document],
                        storage_path: str = VECTOR_STORAGE_PATH,
                        collection_name: str = QDRANT_COLLECTION,
                        embedding_model: str = EMBEDDING_MODEL,
                        force_recreate: bool = False) -> Qdrant:
    """
    Create a vector store from documents, or reuse the one already on disk.

    When ``storage_path`` exists and ``force_recreate`` is False, the existing
    store is opened and ``documents`` is not used. Otherwise, or when opening
    the existing store fails, a store is built from ``documents``.

    Args:
        documents: List of Document objects to store
        storage_path: Path to the vector store
        collection_name: Name of the collection
        embedding_model: Name of the embedding model
        force_recreate: Whether to force recreation of the vector store

    Returns:
        Qdrant vector store
    """
    # Initialize the embedding model
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    # Make sure the parent directory of the store exists
    os.makedirs(Path(storage_path).parent, exist_ok=True)

    if Path(storage_path).exists() and not force_recreate:
        print(f"Loading existing vector store from {storage_path}")
        try:
            # The Qdrant constructor takes a client object, not a `path`;
            # from_existing_collection builds the client from the path.
            return Qdrant.from_existing_collection(
                embedding=embeddings,
                path=storage_path,
                collection_name=collection_name,
            )
        except Exception as e:
            # Best effort: report the problem and rebuild from `documents`
            print(f"Error loading existing vector store: {e}")
            print("Creating new vector store...")
            force_recreate = True

    # Create new vector store
    print(f"Creating new vector store at {storage_path}")
    return Qdrant.from_documents(
        documents=documents,
        embedding=embeddings,
        path=storage_path,
        collection_name=collection_name,
        # Forwarded so that a requested (or fallback) rebuild replaces the
        # collection instead of adding to whatever is already stored.
        force_recreate=force_recreate,
    )

## Example Usage

Here's how to use these utility functions for processing blog posts.

In [None]:
def process_blog_posts(data_dir: str = DATA_DIR,
                       create_embeddings: bool = True,
                       force_recreate_embeddings: bool = False):
    """
    Complete pipeline to process blog posts and optionally create vector embeddings.

    Steps: load the posts, update their metadata, compute and display
    statistics, then (optionally) build the vector store.

    NOTE(review): documents are handed to create_vector_store as loaded;
    split_documents is not called here - confirm that is intended.

    Args:
        data_dir: Directory containing the blog posts
        create_embeddings: Whether to create vector embeddings
        force_recreate_embeddings: Whether to force recreation of embeddings

    Returns:
        Dictionary with the keys ``documents``, ``stats`` and ``vector_store``
        (``vector_store`` is None unless ``create_embeddings`` is True)
    """
    # Load documents
    documents = load_blog_posts(data_dir)

    # Derive URLs relative to the directory that was actually loaded rather
    # than the module-level default, so a custom data_dir maps correctly.
    # The path is normalized and given a trailing separator ("./data" ->
    # "data/"); this assumes the loader reports sources in the same
    # normalized form - TODO confirm.
    source_prefix = os.path.join(str(Path(data_dir)), "")
    documents = update_document_metadata(documents, data_dir_prefix=source_prefix)

    # Get and display stats
    stats = get_document_stats(documents)
    display_document_stats(stats)

    result = {
        "documents": documents,
        "stats": stats,
        "vector_store": None
    }

    # Create vector store if requested
    if create_embeddings:
        result["vector_store"] = create_vector_store(
            documents,
            force_recreate=force_recreate_embeddings
        )

    return result

In [None]:
# Example usage
if __name__ == "__main__":
    # Process blog posts without creating embeddings
    result = process_blog_posts(create_embeddings=False)

    # Example: Access the documents (guarded, since nothing may have been loaded)
    if result["documents"]:
        print(f"\nDocument example: {result['documents'][0].metadata}")
    else:
        print("\nNo documents were loaded; nothing to show.")

    # Create embeddings if needed
    # result = process_blog_posts(create_embeddings=True)

    # Retriever example
    # retriever = result["vector_store"].as_retriever()
    # query = "What is RAGAS?"
    # docs = retriever.invoke(query, k=2)
    # print(f"\nRetrieved {len(docs)} documents for query: {query}")

## Function for Loading Existing Vector Store

This function can be used to load an existing vector store without reprocessing all blog posts.

In [None]:
def load_vector_store(storage_path: str = VECTOR_STORAGE_PATH,
                      collection_name: str = QDRANT_COLLECTION,
                      embedding_model: str = EMBEDDING_MODEL) -> Optional[Qdrant]:
    """
    Load an existing vector store.

    Args:
        storage_path: Path to the vector store
        collection_name: Name of the collection
        embedding_model: Name of the embedding model

    Returns:
        Qdrant vector store, or None if the path doesn't exist or opening
        the store fails (the error is printed)
    """
    # Cheap existence check first, so the embedding model is only
    # instantiated when there is a store to open.
    if not Path(storage_path).exists():
        print(f"Vector store not found at {storage_path}")
        return None

    # Initialize the embedding model
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    try:
        # The Qdrant constructor takes a client object, not a `path`;
        # from_existing_collection builds the client from the path.
        vector_store = Qdrant.from_existing_collection(
            embedding=embeddings,
            path=storage_path,
            collection_name=collection_name,
        )
        print(f"Loaded vector store from {storage_path}")
        return vector_store
    except Exception as e:
        print(f"Error loading vector store: {e}")
        return None