mafzaal committed
Commit 8f67d6f · 1 Parent(s): 6f89cef

Add prompt templates for RAG, call LLM, and query tone check


- Implemented a RAG prompt template for generating answers based on provided context.
- Created a call LLM prompt template for concise answers with context.
- Added a query tone check prompt template to evaluate the tone of input queries.
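All three templates live in the new py-src/lets_talk/prompts.py and are imported by agent.py and rag.py (see the diffs below). As a quick orientation, this is what each template expects when formatted; the import path mirrors the diffs, and everything beyond the placeholders is illustrative only:

from prompts import (
    rag_prompt_template,                # expects {context} and {question}
    call_llm_prompt_template,           # expects {context}
    query_tone_check_prompt_template,   # expects {query}
)

# Illustrative: render the tone-check template for a sample query.
tone_text = query_tone_check_prompt_template.format(query="How do I evaluate a RAG pipeline?")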

evals/ft_questions.csv ADDED
The diff for this file is too large to render. See raw diff
 
py-src/lets_talk/agent.py CHANGED
@@ -31,16 +31,7 @@ class InputState(TypedDict):
 documents: Optional[list[Document]]
 
 
- rag_prompt_template = """\
- You are a helpful assistant that answers questions based on the context provided.
- Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
- You have access to the following information:
 
- Context:
- {context}
-
- If context is unrelated to question, say "I don't know".
- """
 
 # Update the call_model function to include current datetime
 def call_model(model, state: Dict[str, Any]) -> Dict[str, list[BaseMessage]]:
@@ -58,15 +49,12 @@ def call_model(model, state: Dict[str, Any]) -> Dict[str, list[BaseMessage]]:
 messages = state["messages"]
 context = state.get("context", "")
 
-
- # Get current datetime
- current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 # Insert system message with context before the latest user message
- sys_prompt = rag_prompt_template.format(
+ from prompts import call_llm_prompt_template
+ sys_prompt = call_llm_prompt_template.format(
 context=context,
 )
- sys_prompt = f"Today is: {current_datetime}\n\n" + sys_prompt
- print(sys_prompt)
+
 
 context_message = SystemMessage(content=sys_prompt)
 
@@ -217,15 +205,6 @@ def parse_output(input_state: Dict[str, Any]) -> str:
 return "I encountered an error while processing your request."
 
 
- tone_check_prompt_template = """\
- Check if the input query is rude, derogatory, disrespectful, or negative, and respond with "YES" or "NO".
-
- Query:
- {query}
- # Output Format
-
- Respond only with "YES" or "NO".
- """
 
 def check_query_tone(state: Dict[str, Any]) -> Dict[str, str]:
 """
@@ -254,8 +233,8 @@ def check_query_rudeness(query: str) -> bool:
 Returns:
 True if the query is rude, False otherwise
 """
-
- tone_prompt = ChatPromptTemplate.from_template(tone_check_prompt_template)
+ from prompts import query_tone_check_prompt_template
+ tone_prompt = ChatPromptTemplate.from_template(query_tone_check_prompt_template)
 llm = ChatOpenAI(model=LLM_MODEL, temperature=LLM_TEMPERATURE)
 
 # Create chain
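The hunk above ends at the "# Create chain" comment, so the rest of check_query_rudeness is not shown in this commit. A minimal sketch of how the function presumably completes the chain and interprets the YES/NO answer; the piping, the YES parsing, and the placeholder model settings are assumptions, not code from the diff:

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from prompts import query_tone_check_prompt_template

# LLM_MODEL and LLM_TEMPERATURE are module-level settings in agent.py;
# the values here are placeholders for the sketch.
LLM_MODEL = "gpt-4o-mini"
LLM_TEMPERATURE = 0

def check_query_rudeness(query: str) -> bool:
    """Return True when the tone-check LLM answers YES for the query."""
    tone_prompt = ChatPromptTemplate.from_template(query_tone_check_prompt_template)
    llm = ChatOpenAI(model=LLM_MODEL, temperature=LLM_TEMPERATURE)
    chain = tone_prompt | llm  # assumed wiring; the diff stops at "# Create chain"
    answer = chain.invoke({"query": query}).content
    return answer.strip().upper().startswith("YES")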
py-src/lets_talk/prompts.py ADDED
@@ -0,0 +1,41 @@
+
+
+ # Create RAG prompt template
+ rag_prompt_template = """\
+ You are a helpful assistant that answers questions based on the context provided.
+ Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
+ Use links from context to help user to navigate to to find more information.
+ You have access to the following information:
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ If context is unrelated to question, say "I don't know".
+ """
+
+
+ call_llm_prompt_template = """\
+ You are a helpful assistant that answers questions based on the context provided.
+ Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
+ You have access to the following information:
+
+ Context:
+ {context}
+
+ If context is unrelated to question, say "I don't know".
+ """
+
+
+
+ query_tone_check_prompt_template = """\
+ Check if the input query is rude, derogatory, disrespectful, or negative, and respond with "YES" or "NO".
+
+ Query:
+ {query}
+ # Output Format
+
+ Respond only with "YES" or "NO".
+ """
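The diffs in this commit consume these strings in two different ways: agent.py fills call_llm_prompt_template with str.format and wraps the result in a SystemMessage, while the other two templates are turned into ChatPromptTemplate objects. A short illustrative sketch of both styles (the sample context string is made up):

from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate

from prompts import call_llm_prompt_template, rag_prompt_template

# Style 1 (agent.py): plain str.format, then a system message.
sys_prompt = call_llm_prompt_template.format(context="...retrieved blog snippets...")
context_message = SystemMessage(content=sys_prompt)

# Style 2 (rag.py and the tone check): a LangChain prompt object.
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
messages = rag_prompt.invoke(
    {"context": "...retrieved blog snippets...", "question": "What is Ragas?"}
).to_messages()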
py-src/lets_talk/rag.py CHANGED
@@ -21,21 +21,8 @@ retriever = vector_store.as_retriever()
 
 llm = ChatOpenAI(model=config.LLM_MODEL, temperature=config.LLM_TEMPERATURE)
 
- # Create RAG prompt template
- rag_prompt_template = """\
- You are a helpful assistant that answers questions based on the context provided.
- Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
- Use links from context to help user to navigate to to find more information.
- You have access to the following information:
 
- Context:
- {context}
-
- Question:
- {question}
-
- If context is unrelated to question, say "I don't know".
- """
+ from prompts import rag_prompt_template
 
 rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
 
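This hunk only swaps the inline template for the shared one; the retriever, llm, and rag_prompt it references are defined earlier in rag.py. A minimal sketch of how they would typically be wired into a chain; the LCEL wiring and the format_docs helper below are assumptions, not code from this commit:

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    # Collapse retrieved documents into the {context} slot of rag_prompt_template.
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke("What is Ragas used for?")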
py-src/notebooks/07_Fine_Tuning_Dataset.ipynb CHANGED
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
 "id": "c95ab233",
 "metadata": {},
 "outputs": [
@@ -37,7 +37,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
 "id": "15e97530",
 "metadata": {},
 "outputs": [],
@@ -48,18 +48,17 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 5,
 "id": "b4f2ddc0",
 "metadata": {},
 "outputs": [],
 "source": [
- "import lets_talk.utils.blog as blog\n",
- "import lets_talk.utils.eval as eval"
+ "import lets_talk.utils.blog as blog\n"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": 6,
 "id": "123779af",
 "metadata": {},
 "outputs": [
@@ -67,7 +66,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "100%|██████████| 14/14 [00:00<00:00, 3411.39it/s]"
+ "100%|██████████| 14/14 [00:00<00:00, 3317.53it/s]"
 ]
 },
 {
@@ -94,7 +93,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
 "id": "0b742838",
 "metadata": {},
 "outputs": [],
@@ -103,7 +102,7 @@
 "from langchain_openai import ChatOpenAI\n",
 "\n",
 "qa_chat_model = ChatOpenAI(\n",
- " model=\"gpt-4.1-mini\",\n",
+ " model=\"gpt-4.1\",\n",
 " temperature=0,\n",
 ")\n",
 "\n",
@@ -120,12 +119,13 @@
 "\"\"\"\n",
 "\n",
 "qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)\n",
+ "\n",
 "question_generation_chain = qa_prompt_template | qa_chat_model"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
 "id": "5488c3d3",
 "metadata": {},
 "outputs": [],
@@ -149,7 +149,7 @@
 "\n",
 " return extracted_questions\n",
 "\n",
- "async def create_questions(documents, n_questions, chain):\n",
+ "def create_questions(documents, n_questions, chain):\n",
 " question_set = []\n",
 " \n",
 " for doc in tqdm.tqdm(documents):\n",
@@ -157,7 +157,7 @@
 " context = doc.page_content\n",
 "\n",
 " # Generate questions using the question generation chain\n",
- " response = await chain.ainvoke({\n",
+ " response = chain.invoke({\n",
 " \"context\": context,\n",
 " \"n_questions\": n_questions\n",
 " })\n",
@@ -165,86 +165,237 @@
 " questions = extract_questions(response.content,n_questions)\n",
 " \n",
 " for i, question in enumerate(questions):\n",
- " questions.append({\"question\":question, \"context\": context})\n",
+ " question_set.append({\"question\":question, \"context\": context})\n",
+ " \n",
 " return question_set"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 9,
- "id": "adb3ae7b",
+ "execution_count": 19,
+ "id": "b1ece53b",
 "metadata": {},
 "outputs": [
 {
- "name": "stdout",
+ "name": "stderr",
 "output_type": "stream",
 "text": [
- "Extracted questions:\n",
- "1. What is the primary purpose of the Ragas evaluation framework in LLM applications?\n",
- "2. Why is it important to have reliable metrics when assessing the performance of LLM-based systems?\n",
- "3. What types of applications can benefit from using the Ragas framework for evaluation?\n"
+ "100%|██████████| 162/162 [07:23<00:00, 2.74s/it]\n"
 ]
 }
 ],
 "source": [
- "context = split_docs[0].page_content\n",
- "n_questions = 3\n",
- "response = question_generation_chain.invoke({\"context\": context, \"n_questions\": n_questions})\n",
- "questions = extract_questions(response.content, n_questions)\n",
- "print(\"Extracted questions:\")\n",
- "for i, question in enumerate(questions):\n",
- " print(f\"{i + 1}. {question}\")"
+ "ds = create_questions(documents=split_docs, n_questions=2, chain=question_generation_chain)"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": null,
- "id": "7c4a75f9",
+ "execution_count": 20,
+ "id": "965cf609",
 "metadata": {},
 "outputs": [
 {
 "data": {
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+ "columns": [
+ {
+ "name": "index",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "question",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "context",
+ "rawType": "object",
+ "type": "string"
+ }
+ ],
+ "conversionMethod": "pd.DataFrame",
+ "ref": "f0615b27-42e5-4774-a436-51ec88bb4498",
+ "rows": [
+ [
+ "0",
+ "What role does Ragas play in evaluating the performance of applications that use Large Language Models (LLMs)?",
+ "---\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\ndate: 2025-04-26T18:00:00-06:00\nlayout: blog\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\nreadingTime: 7\npublished: true\n---\n\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you're building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\n\n## What is Ragas?"
+ ],
+ [
+ "1",
+ "Why is it important to have reliable metrics when building systems like question-answering tools or conversational agents with LLMs?",
+ "---\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\ndate: 2025-04-26T18:00:00-06:00\nlayout: blog\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\nreadingTime: 7\npublished: true\n---\n\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you're building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\n\n## What is Ragas?"
+ ],
+ [
+ "2",
+ "What are some of the key questions that Ragas helps answer when evaluating LLM applications?",
+ "## What is Ragas?\n\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\n\nAt its core, Ragas helps answer crucial questions:\n- Is my application retrieving the right information?\n- Are the responses factually accurate and consistent with the retrieved context?\n- Does the system appropriately address the user's query?\n- How well does my application handle multi-turn conversations?\n\n## Why Evaluate LLM Applications?\n\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable."
+ ],
+ [
+ "3",
+ "Why is proper evaluation especially important for LLM applications in fields like healthcare, finance, or education?",
+ "## What is Ragas?\n\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\n\nAt its core, Ragas helps answer crucial questions:\n- Is my application retrieving the right information?\n- Are the responses factually accurate and consistent with the retrieved context?\n- Does the system appropriately address the user's query?\n- How well does my application handle multi-turn conversations?\n\n## Why Evaluate LLM Applications?\n\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable."
+ ],
+ [
+ "4",
+ "What are the main purposes of evaluation as described in the context?",
+ "Evaluation serves several key purposes:\n- **Quality assurance**: Identify and fix issues before they reach users\n- **Performance tracking**: Monitor how changes impact system performance\n- **Benchmarking**: Compare different approaches objectively\n- **Continuous improvement**: Build feedback loops to enhance your application\n\n## Key Features of Ragas\n\n### 🎯 Specialized Metrics\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\n\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\n- **Answer Relevancy**: Assesses if the response addresses the user's question\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic"
+ ],
+ [
+ "5",
+ "Which specialized metrics does Ragas provide for evaluating LLM applications, and what does each metric measure?",
+ "Evaluation serves several key purposes:\n- **Quality assurance**: Identify and fix issues before they reach users\n- **Performance tracking**: Monitor how changes impact system performance\n- **Benchmarking**: Compare different approaches objectively\n- **Continuous improvement**: Build feedback loops to enhance your application\n\n## Key Features of Ragas\n\n### 🎯 Specialized Metrics\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\n\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\n- **Answer Relevancy**: Assesses if the response addresses the user's question\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic"
+ ],
+ [
+ "6",
+ "How does Ragas assist in the process of test data generation for evaluation?",
+ "### 🧪 Test Data Generation\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\n\n### 🔗 Seamless Integrations\nRagas works with popular LLM frameworks and tools:\n- [LangChain](https://www.langchain.com/)\n- [LlamaIndex](https://www.llamaindex.ai/)\n- [Haystack](https://haystack.deepset.ai/)\n- [OpenAI](https://openai.com/)\n\nObservability platforms \n- [Phoenix](https://phoenix.arize.com/)\n- [LangSmith](https://python.langchain.com/docs/introduction/)\n- [Langfuse](https://www.langfuse.com/)\n\n### 📊 Comprehensive Analysis\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\n\n## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:"
+ ],
+ [
+ "7",
+ "Which popular LLM frameworks and observability platforms does Ragas integrate with?",
+ "### 🧪 Test Data Generation\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\n\n### 🔗 Seamless Integrations\nRagas works with popular LLM frameworks and tools:\n- [LangChain](https://www.langchain.com/)\n- [LlamaIndex](https://www.llamaindex.ai/)\n- [Haystack](https://haystack.deepset.ai/)\n- [OpenAI](https://openai.com/)\n\nObservability platforms \n- [Phoenix](https://phoenix.arize.com/)\n- [LangSmith](https://python.langchain.com/docs/introduction/)\n- [Langfuse](https://www.langfuse.com/)\n\n### 📊 Comprehensive Analysis\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\n\n## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:"
+ ],
+ [
+ "8",
+ "What command is used to install Ragas according to the provided context?",
+ "## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:\n\n```python\nfrom ragas.metrics import Faithfulness\nfrom ragas.evaluation import EvaluationDataset\nfrom ragas.dataset_schema import SingleTurnSample\nfrom langchain_openai import ChatOpenAI\nfrom ragas.llms import LangchainLLMWrapper\nfrom langchain_openai import ChatOpenAI\n\n# Initialize the LLM, you are going to new OPENAI API key\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \n\n# Your evaluation data\ntest_data = {\n \"user_input\": \"What is the capital of France?\",\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\n \"response\": \"The capital of France is Paris.\"\n}\n\n# Create a sample\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor"
+ ],
+ [
+ "9",
+ "In the example, which class is used to wrap the ChatOpenAI model for evaluation purposes?",
+ "## Getting Started with Ragas\n\nInstalling Ragas is straightforward:\n\n```bash\nuv init && uv add ragas\n```\n\nHere's a simple example of evaluating a response using Ragas:\n\n```python\nfrom ragas.metrics import Faithfulness\nfrom ragas.evaluation import EvaluationDataset\nfrom ragas.dataset_schema import SingleTurnSample\nfrom langchain_openai import ChatOpenAI\nfrom ragas.llms import LangchainLLMWrapper\nfrom langchain_openai import ChatOpenAI\n\n# Initialize the LLM, you are going to new OPENAI API key\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \n\n# Your evaluation data\ntest_data = {\n \"user_input\": \"What is the capital of France?\",\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\n \"response\": \"The capital of France is Paris.\"\n}\n\n# Create a sample\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor"
+ ]
+ ],
+ "shape": {
+ "columns": 2,
+ "rows": 10
+ }
+ },
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>question</th>\n",
+ " <th>context</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>What role does Ragas play in evaluating the pe...</td>\n",
+ " <td>---\\ntitle: \"Part 1: Introduction to Ragas: Th...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>Why is it important to have reliable metrics w...</td>\n",
+ " <td>---\\ntitle: \"Part 1: Introduction to Ragas: Th...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>What are some of the key questions that Ragas ...</td>\n",
+ " <td>## What is Ragas?\\n\\n[Ragas](https://docs.raga...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Why is proper evaluation especially important ...</td>\n",
+ " <td>## What is Ragas?\\n\\n[Ragas](https://docs.raga...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>What are the main purposes of evaluation as de...</td>\n",
+ " <td>Evaluation serves several key purposes:\\n- **Q...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5</th>\n",
+ " <td>Which specialized metrics does Ragas provide f...</td>\n",
+ " <td>Evaluation serves several key purposes:\\n- **Q...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6</th>\n",
+ " <td>How does Ragas assist in the process of test d...</td>\n",
+ " <td>### 🧪 Test Data Generation\\nCreating high-qual...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7</th>\n",
+ " <td>Which popular LLM frameworks and observability...</td>\n",
+ " <td>### 🧪 Test Data Generation\\nCreating high-qual...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8</th>\n",
+ " <td>What command is used to install Ragas accordin...</td>\n",
+ " <td>## Getting Started with Ragas\\n\\nInstalling Ra...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>9</th>\n",
+ " <td>In the example, which class is used to wrap th...</td>\n",
+ " <td>## Getting Started with Ragas\\n\\nInstalling Ra...</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
 "text/plain": [
- "2"
+ " question \\\n",
+ "0 What role does Ragas play in evaluating the pe... \n",
+ "1 Why is it important to have reliable metrics w... \n",
+ "2 What are some of the key questions that Ragas ... \n",
+ "3 Why is proper evaluation especially important ... \n",
+ "4 What are the main purposes of evaluation as de... \n",
+ "5 Which specialized metrics does Ragas provide f... \n",
+ "6 How does Ragas assist in the process of test d... \n",
+ "7 Which popular LLM frameworks and observability... \n",
+ "8 What command is used to install Ragas accordin... \n",
+ "9 In the example, which class is used to wrap th... \n",
+ "\n",
+ " context \n",
+ "0 ---\\ntitle: \"Part 1: Introduction to Ragas: Th... \n",
+ "1 ---\\ntitle: \"Part 1: Introduction to Ragas: Th... \n",
+ "2 ## What is Ragas?\\n\\n[Ragas](https://docs.raga... \n",
+ "3 ## What is Ragas?\\n\\n[Ragas](https://docs.raga... \n",
+ "4 Evaluation serves several key purposes:\\n- **Q... \n",
+ "5 Evaluation serves several key purposes:\\n- **Q... \n",
+ "6 ### 🧪 Test Data Generation\\nCreating high-qual... \n",
+ "7 ### 🧪 Test Data Generation\\nCreating high-qual... \n",
+ "8 ## Getting Started with Ragas\\n\\nInstalling Ra... \n",
+ "9 ## Getting Started with Ragas\\n\\nInstalling Ra... "
 ]
 },
- "execution_count": 12,
+ "execution_count": 20,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
- "documents = split_docs[:2]\n",
- "len(documents)"
+ "import pandas as pd\n",
+ "df = pd.DataFrame(ds)\n",
+ "df.head(10)"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 16,
- "id": "b1ece53b",
+ "execution_count": 21,
+ "id": "b8c025fa",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " 0%| | 0/2 [00:00<?, ?it/s]"
- ]
- },
- {
- "ename": "",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
- "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
- "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
- ]
- }
- ],
+ "outputs": [],
 "source": [
- "ds = await create_questions(documents=docs, n_questions=3, chain=question_generation_chain)"
+ "df.to_csv(\"evals/ft_questions.csv\", index=False)"
 ]
 }
 ],
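The notebook's last cell writes the generated question/context pairs to evals/ft_questions.csv, the large file added at the top of this commit. A minimal sketch of reading that dataset back for later fine-tuning or evaluation work, assuming only the two columns produced above:

import pandas as pd

ft_df = pd.read_csv("evals/ft_questions.csv")
print(ft_df.shape)                 # one row per generated question
print(ft_df.columns.tolist())      # expected: ['question', 'context']
print(ft_df.loc[0, "question"])    # spot-check the first generated question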