In [2]:
import sys
import os

# Resolve every path from the notebook's own directory. The directory is
# captured only on the first run of this cell: the os.chdir() at the bottom
# changes os.getcwd(), so recomputing the paths from it on a re-run would walk
# further up the tree and register the wrong package root.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()

# Add the package root (one level above the notebook directory) to the Python path
package_root = os.path.abspath(os.path.join(notebook_dir, "../"))
print(f"Adding package root to sys.path: {package_root}")
if package_root not in sys.path:
    sys.path.append(package_root)

print(f"Current notebook directory: {notebook_dir}")

# Change the working directory to the project root (two levels above the
# notebook directory) so the relative paths used by later cells resolve there
project_root = os.path.abspath(os.path.join(notebook_dir, "../../"))
print(f"Project root: {project_root}")
os.chdir(project_root)

Adding package root to sys.path: /home/mafzaal/source/lets-talk/py-src
Current notebook directory: /home/mafzaal/source/lets-talk/py-src/notebooks
Project root: /home/mafzaal/source/lets-talk


In [3]:
import nest_asyncio
# Apply the nest_asyncio patch before any chain is built or invoked.
# NOTE(review): presumably needed so asyncio-based calls can run inside the
# notebook's already-running event loop; no async call is visible in the cells
# shown here, so confirm it is still required.
nest_asyncio.apply()

In [5]:
import lets_talk.utils.blog as blog


In [6]:
# Load the blog posts and update their metadata in one step, then split the
# resulting documents into the chunks used for question generation.
docs = blog.update_document_metadata(blog.load_blog_posts())
split_docs = blog.split_documents(docs)

100%|██████████| 14/14 [00:00<00:00, 3317.53it/s]

Loaded 14 documents from data/
Split 14 documents into 162 chunks





In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt asking for {n_questions} questions grounded only in {context}.
# The numbered "1. ..." output format is what extract_questions() parses.
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.
You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""
qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

# Chat model used to write the questions (temperature pinned to 0).
qa_chat_model = ChatOpenAI(model="gpt-4.1", temperature=0)

# Pipe the filled-in prompt into the chat model.
question_generation_chain = qa_prompt_template | qa_chat_model

In [18]:
import tqdm
import asyncio


def extract_questions(response_text, n_questions):
    """Pull the numbered questions out of a model response.

    A line counts as a question when, after surrounding whitespace is
    stripped, it starts with ``"<i>."`` for some ``i`` in ``1..n_questions``
    (for example ``"1. What is ...?"``). The numeric prefix is removed and the
    remaining text is stripped.

    Args:
        response_text: Raw text returned by the model.
        n_questions: Highest question number that is accepted.

    Returns:
        A list of question strings in order of appearance. Numbered lines
        that carry no text after the prefix (e.g. a bare ``"2."``) are
        skipped so that no empty question is returned.
    """
    # Built once per call; str.startswith accepts a tuple of prefixes.
    # An empty tuple (n_questions < 1) simply matches nothing.
    prefixes = tuple(f"{i}." for i in range(1, n_questions + 1))

    extracted_questions = []
    for line in response_text.strip().split('\n'):
        line = line.strip()
        if not line.startswith(prefixes):
            continue
        # Remove the number prefix and keep just the question text
        question = line.split('.', 1)[1].strip()
        if question:
            extracted_questions.append(question)

    return extracted_questions

def create_questions(documents, n_questions, chain):
    """Generate question/context pairs for every document.

    For each document, ``chain.invoke`` is called with the document's
    ``page_content`` as ``context`` together with ``n_questions``; the
    numbered questions are then parsed out of ``response.content`` with
    ``extract_questions``.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.
        n_questions: Number of questions requested per document.
        chain: Object with an ``invoke(dict)`` method whose result exposes
            ``content``.

    Returns:
        A list of ``{"question": ..., "context": ...}`` dicts, one per
        extracted question, in document order.
    """
    pairs = []

    # One chain call per document, with a tqdm progress bar over the loop
    for document in tqdm.tqdm(documents):
        text = document.page_content
        reply = chain.invoke({"context": text, "n_questions": n_questions})
        pairs.extend(
            {"question": generated, "context": text}
            for generated in extract_questions(reply.content, n_questions)
        )

    return pairs

In [19]:
# Two questions per chunk, generated with one chain call per chunk. This is
# the slow cell: the recorded run took about 7 minutes for 162 chunks.
ds = create_questions(
    documents=split_docs,
    n_questions=2,
    chain=question_generation_chain,
)

100%|██████████| 162/162 [07:23<00:00,  2.74s/it]


In [20]:
import pandas as pd
# One row per generated question, paired with the chunk it was generated from
df = pd.DataFrame(data=ds)
df.head(n=10)

Unnamed: 0,question,context
0,What role does Ragas play in evaluating the pe...,"---\ntitle: ""Part 1: Introduction to Ragas: Th..."
1,Why is it important to have reliable metrics w...,"---\ntitle: ""Part 1: Introduction to Ragas: Th..."
2,What are some of the key questions that Ragas ...,## What is Ragas?\n\n[Ragas](https://docs.raga...
3,Why is proper evaluation especially important ...,## What is Ragas?\n\n[Ragas](https://docs.raga...
4,What are the main purposes of evaluation as de...,Evaluation serves several key purposes:\n- **Q...
5,Which specialized metrics does Ragas provide f...,Evaluation serves several key purposes:\n- **Q...
6,How does Ragas assist in the process of test d...,### 🧪 Test Data Generation\nCreating high-qual...
7,Which popular LLM frameworks and observability...,### 🧪 Test Data Generation\nCreating high-qual...
8,What command is used to install Ragas accordin...,## Getting Started with Ragas\n\nInstalling Ra...
9,"In the example, which class is used to wrap th...",## Getting Started with Ragas\n\nInstalling Ra...


In [21]:
# Persist the question/context pairs. The path is relative to the working
# directory, which the first cell changes to the project root.
ft_questions_path = "evals/ft_questions.csv"
# to_csv does not create missing parent directories, so make sure it exists
os.makedirs(os.path.dirname(ft_questions_path), exist_ok=True)
df.to_csv(ft_questions_path, index=False)

In [None]:
from datasets import Dataset

# Convert pandas DataFrame to Huggingface Dataset
hf_dataset = Dataset.from_pandas(df)

# Display some basic information about the dataset
print(f"Dataset has {len(hf_dataset)} examples")
print(f"Dataset features: {hf_dataset.features}")

# Show a few examples. The preview size is capped at the dataset length so a
# dataset with fewer than three rows does not make select() raise.
print("\nSample examples:")
n_preview = min(3, len(hf_dataset))
display(hf_dataset.select(range(n_preview)).to_pandas())

# Save the dataset to disk (optional)
#hf_dataset.save_to_disk("ragas_qa_dataset")


Dataset has 324 examples
Dataset features: {'question': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None)}

Sample examples:


Unnamed: 0,question,context
0,What role does Ragas play in evaluating the pe...,"---\ntitle: ""Part 1: Introduction to Ragas: Th..."
1,Why is it important to have reliable metrics w...,"---\ntitle: ""Part 1: Introduction to Ragas: Th..."
2,What are some of the key questions that Ragas ...,## What is Ragas?\n\n[Ragas](https://docs.raga...


Saving the dataset (0/1 shards):   0%|          | 0/324 [00:00<?, ? examples/s]

In [23]:
# Upload the question/context dataset to the Hub repository named below
hf_dataset.push_to_hub(repo_id="mafzaal/thedataguy_embed_ft")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mafzaal/thedataguy_embed_ft/commit/963348381fcb929a7367ff8933b62812a0e9ceb7', commit_message='Upload dataset', commit_description='', oid='963348381fcb929a7367ff8933b62812a0e9ceb7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mafzaal/thedataguy_embed_ft', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mafzaal/thedataguy_embed_ft'), pr_revision=None, pr_num=None)