Add prompt templates for RAG, call LLM, and query tone check
- Implemented a RAG prompt template for generating answers based on provided context.
- Created a call LLM prompt template for concise answers with context.
- Added a query tone check prompt template to evaluate the tone of input queries.

Files changed:
- evals/ft_questions.csv (+0, -0)
- py-src/lets_talk/agent.py (+5, -26)
- py-src/lets_talk/prompts.py (+41, -0)
- py-src/lets_talk/rag.py (+1, -14)
- py-src/notebooks/07_Fine_Tuning_Dataset.ipynb (+207, -56)

evals/ft_questions.csv
ADDED

The diff for this file is too large to render; see the raw diff.
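Since the CSV itself is not rendered here, a minimal sketch for inspecting it locally; the `question` and `context` column names are taken from the DataFrame the notebook below writes via `df.to_csv("evals/ft_questions.csv", index=False)`.

```python
# Sketch: load and inspect the generated fine-tuning question set.
# Column names assumed from the notebook's df.to_csv(...) cell below.
import pandas as pd

df = pd.read_csv("evals/ft_questions.csv")
print(df.shape)              # expected: (N, 2) with question/context columns
print(df["question"].head())
```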
py-src/lets_talk/agent.py
CHANGED

@@ -31,16 +31,7 @@ class InputState(TypedDict):
     documents: Optional[list[Document]]
 
 
-rag_prompt_template = """\
-You are a helpful assistant that answers questions based on the context provided.
-Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
-You have access to the following information:
 
-Context:
-{context}
-
-If context is unrelated to question, say "I don't know".
-"""
 
 # Update the call_model function to include current datetime
 def call_model(model, state: Dict[str, Any]) -> Dict[str, list[BaseMessage]]:
@@ -58,15 +49,12 @@ def call_model(model, state: Dict[str, Any]) -> Dict[str, list[BaseMessage]]:
     messages = state["messages"]
     context = state.get("context", "")
 
-
-    # Get current datetime
-    current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     # Insert system message with context before the latest user message
-    sys_prompt = rag_prompt_template.format(
+    from prompts import call_llm_prompt_template
+    sys_prompt = call_llm_prompt_template.format(
         context=context,
     )
-
-    print(sys_prompt)
+
 
     context_message = SystemMessage(content=sys_prompt)
 
@@ -217,15 +205,6 @@ def parse_output(input_state: Dict[str, Any]) -> str:
         return "I encountered an error while processing your request."
 
 
-tone_check_prompt_template = """\
-Check if the input query is rude, derogatory, disrespectful, or negative, and respond with "YES" or "NO".
-
-Query:
-{query}
-# Output Format
-
-Respond only with "YES" or "NO".
-"""
 
 def check_query_tone(state: Dict[str, Any]) -> Dict[str, str]:
     """
@@ -254,8 +233,8 @@ def check_query_rudeness(query: str) -> bool:
     Returns:
         True if the query is rude, False otherwise
     """
-
-    tone_prompt = ChatPromptTemplate.from_template(tone_check_prompt_template)
+    from prompts import query_tone_check_prompt_template
+    tone_prompt = ChatPromptTemplate.from_template(query_tone_check_prompt_template)
     llm = ChatOpenAI(model=LLM_MODEL, temperature=LLM_TEMPERATURE)
 
     # Create chain
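To see what the refactor amounts to, a minimal sketch of the new call path: format the shared `call_llm_prompt_template` with the retrieved context and wrap it in a `SystemMessage`. The `lets_talk.prompts` import path is an assumption (the diff itself uses a bare `from prompts import ...`), and the context string is a dummy.

```python
# Sketch only: mirrors the refactored call_model snippet above, outside the
# agent graph. Assumes langchain-core is installed and lets_talk.prompts is
# importable (the diff uses a bare `from prompts import ...`).
from langchain_core.messages import SystemMessage

from lets_talk.prompts import call_llm_prompt_template

context = "Ragas is an open-source evaluation framework for LLM applications."
sys_prompt = call_llm_prompt_template.format(context=context)
context_message = SystemMessage(content=sys_prompt)
print(context_message.content)
```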
py-src/lets_talk/prompts.py
ADDED

@@ -0,0 +1,41 @@
+
+
+# Create RAG prompt template
+rag_prompt_template = """\
+You are a helpful assistant that answers questions based on the context provided.
+Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
+Use links from context to help user to navigate to to find more information.
+You have access to the following information:
+
+Context:
+{context}
+
+Question:
+{question}
+
+If context is unrelated to question, say "I don't know".
+"""
+
+
+call_llm_prompt_template = """\
+You are a helpful assistant that answers questions based on the context provided.
+Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
+You have access to the following information:
+
+Context:
+{context}
+
+If context is unrelated to question, say "I don't know".
+"""
+
+
+
+query_tone_check_prompt_template = """\
+Check if the input query is rude, derogatory, disrespectful, or negative, and respond with "YES" or "NO".
+
+Query:
+{query}
+# Output Format
+
+Respond only with "YES" or "NO".
+"""
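A quick sanity check of the placeholders each new template expects (a sketch, not part of the commit): `rag_prompt_template` takes `{context}` and `{question}`, `call_llm_prompt_template` only `{context}`, and `query_tone_check_prompt_template` only `{query}`. The dummy values are invented for illustration.

```python
# Sketch: format each template with dummy values to confirm its placeholders.
from lets_talk.prompts import (
    call_llm_prompt_template,
    query_tone_check_prompt_template,
    rag_prompt_template,
)

print(rag_prompt_template.format(
    context="Ragas offers metrics such as Faithfulness and Answer Relevancy.",
    question="Which metrics does Ragas offer?",
))
print(call_llm_prompt_template.format(context="..."))
print(query_tone_check_prompt_template.format(query="What is Ragas?"))
```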
py-src/lets_talk/rag.py
CHANGED

@@ -21,21 +21,8 @@ retriever = vector_store.as_retriever()
 
 llm = ChatOpenAI(model=config.LLM_MODEL, temperature=config.LLM_TEMPERATURE)
 
-# Create RAG prompt template
-rag_prompt_template = """\
-You are a helpful assistant that answers questions based on the context provided.
-Generate a concise answer to the question in markdown format and include a list of relevant links to the context.
-Use links from context to help user to navigate to to find more information.
-You have access to the following information:
 
-Context:
-{context}
-
-Question:
-{question}
-
-If context is unrelated to question, say "I don't know".
-"""
+from prompts import rag_prompt_template
 
 rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
 
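The rest of rag.py is outside this diff, so the following is only a sketch of how the imported `rag_prompt_template` could be wired into a chain; `RunnableLambda` and `FakeListLLM` stand in for the real vector-store retriever and `ChatOpenAI` model that rag.py defines above.

```python
# Sketch of a RAG chain around rag_prompt_template; not the actual assembly
# in rag.py, which is not shown in this diff.
from langchain_community.llms import FakeListLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from lets_talk.prompts import rag_prompt_template

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

# Stand-ins: rag.py uses vector_store.as_retriever() and ChatOpenAI instead.
retriever = RunnableLambda(lambda q: "Ragas is an open-source evaluation framework.")
llm = FakeListLLM(responses=["Ragas evaluates LLM applications."])

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

print(rag_chain.invoke("What is Ragas?"))
```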
py-src/notebooks/07_Fine_Tuning_Dataset.ipynb
CHANGED

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": 2,
    "id": "c95ab233",
    "metadata": {},
    "outputs": [
@@ -37,7 +37,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": 3,
    "id": "15e97530",
    "metadata": {},
    "outputs": [],
@@ -48,18 +48,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": 5,
    "id": "b4f2ddc0",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import lets_talk.utils.blog as blog\n",
-    "import lets_talk.utils.eval as eval"
+    "import lets_talk.utils.blog as blog\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": 6,
    "id": "123779af",
    "metadata": {},
    "outputs": [
@@ -67,7 +66,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|██████████| 14/14 [00:00<00:00, …]"
+     "100%|██████████| 14/14 [00:00<00:00, 3317.53it/s]"
     ]
    },
    {
@@ -94,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": 7,
    "id": "0b742838",
    "metadata": {},
    "outputs": [],
@@ -103,7 +102,7 @@
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "qa_chat_model = ChatOpenAI(\n",
-   "    model=\"gpt-4.1…
+   "    model=\"gpt-4.1\",\n",
    "    temperature=0,\n",
    ")\n",
    "\n",
@@ -120,12 +119,13 @@
    "\"\"\"\n",
    "\n",
    "qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)\n",
+   "\n",
    "question_generation_chain = qa_prompt_template | qa_chat_model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": 18,
    "id": "5488c3d3",
    "metadata": {},
    "outputs": [],
@@ -149,7 +149,7 @@
    "\n",
    "    return extracted_questions\n",
    "\n",
-   "…
+   "def create_questions(documents, n_questions, chain):\n",
    "    question_set = []\n",
    "    \n",
    "    for doc in tqdm.tqdm(documents):\n",
@@ -157,7 +157,7 @@
    "        context = doc.page_content\n",
    "\n",
    "        # Generate questions using the question generation chain\n",
-   "        response = …
+   "        response = chain.invoke({\n",
    "            \"context\": context,\n",
    "            \"n_questions\": n_questions\n",
    "        })\n",
@@ -165,86 +165,237 @@
    "        questions = extract_questions(response.content,n_questions)\n",
    "        \n",
    "        for i, question in enumerate(questions):\n",
-   "…
+   "            question_set.append({\"question\":question, \"context\": context})\n",
+   "    \n",
    "    return question_set"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": …,
-   "id": "…",
+   "execution_count": 19,
+   "id": "b1ece53b",
    "metadata": {},
    "outputs": [
     {
-     "name": "…",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "…",
-      "1. What is the primary purpose of the Ragas evaluation framework in LLM applications?\n",
-      "2. Why is it important to have reliable metrics when assessing the performance of LLM-based systems?\n",
-      "3. What types of applications can benefit from using the Ragas framework for evaluation?\n"
+      "100%|██████████| 162/162 [07:23<00:00,  2.74s/it]\n"
     ]
    }
   ],
   "source": [
-   "…",
-   "n_questions = 3\n",
-   "response = question_generation_chain.invoke({\"context\": context, \"n_questions\": n_questions})\n",
-   "questions = extract_questions(response.content, n_questions)\n",
-   "print(\"Extracted questions:\")\n",
-   "for i, question in enumerate(questions):\n",
-   "    print(f\"{i + 1}. {question}\")"
+   "ds = create_questions(documents=split_docs, n_questions=2, chain=question_generation_chain)"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": …,
-  "id": "…",
+  "execution_count": 20,
+  "id": "965cf609",
   "metadata": {},
   "outputs": [
    {
     "data": {
+     "application/vnd.microsoft.datawrangler.viewer.v0+json": { … ten (question, context) rows; omitted as a duplicate rendering of the table below … },
+     "text/html": [ … HTML rendering of the same ten-row table; omitted … ],
     "text/plain": [
-     "…",
+     "                                            question  \\\n",
+     "0  What role does Ragas play in evaluating the pe...   \n",
+     "1  Why is it important to have reliable metrics w...   \n",
+     "2  What are some of the key questions that Ragas ...   \n",
+     "3  Why is proper evaluation especially important ...   \n",
+     "4  What are the main purposes of evaluation as de...   \n",
+     "5  Which specialized metrics does Ragas provide f...   \n",
+     "6  How does Ragas assist in the process of test d...   \n",
+     "7  Which popular LLM frameworks and observability...   \n",
+     "8  What command is used to install Ragas accordin...   \n",
+     "9  In the example, which class is used to wrap th...   \n",
+     "\n",
+     "                                             context  \n",
+     "0  ---\\ntitle: \"Part 1: Introduction to Ragas: Th...  \n",
+     "1  ---\\ntitle: \"Part 1: Introduction to Ragas: Th...  \n",
+     "2  ## What is Ragas?\\n\\n[Ragas](https://docs.raga...  \n",
+     "3  ## What is Ragas?\\n\\n[Ragas](https://docs.raga...  \n",
+     "4  Evaluation serves several key purposes:\\n- **Q...  \n",
+     "5  Evaluation serves several key purposes:\\n- **Q...  \n",
+     "6  ### 🧪 Test Data Generation\\nCreating high-qual...  \n",
+     "7  ### 🧪 Test Data Generation\\nCreating high-qual...  \n",
+     "8  ## Getting Started with Ragas\\n\\nInstalling Ra...  \n",
+     "9  ## Getting Started with Ragas\\n\\nInstalling Ra...  "
     ]
    },
-   "execution_count": …,
+   "execution_count": 20,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-  "…",
-  "…",
+  "import pandas as pd\n",
+  "df = pd.DataFrame(ds)\n",
+  "df.head(10)"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": …,
-  "id": "…",
+  "execution_count": 21,
+  "id": "b8c025fa",
  "metadata": {},
-  "outputs": [
-   {
-    "name": "stderr",
-    "output_type": "stream",
-    "text": [
-     "  0%|          | 0/2 [00:00<?, ?it/s]"
-    ]
-   },
-   {
-    "ename": "",
-    "evalue": "",
-    "output_type": "error",
-    "traceback": [
-     "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
-     "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
-     "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
-     "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-    ]
-   }
-  ],
+  "outputs": [],
  "source": [
-  "…",
+  "df.to_csv(\"evals/ft_questions.csv\", index=False)"
  ]
 }
],
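The notebook's `extract_questions` helper is referenced but not shown in full in this diff. A plausible standalone sketch of that step follows; the regex and exact behavior are assumptions, not the notebook's actual implementation.

```python
# Hypothetical reconstruction of the notebook's extract_questions helper:
# the model returns numbered questions, and the helper strips the numbering
# and keeps at most n_questions of them.
import re

def extract_questions(text: str, n_questions: int) -> list[str]:
    questions = [
        re.sub(r"^\s*\d+[.)]\s*", "", line).strip()
        for line in text.splitlines()
        if re.match(r"^\s*\d+[.)]", line)
    ]
    return questions[:n_questions]

sample = "1. What is Ragas?\n2. Why evaluate LLM applications?"
print(extract_questions(sample, 2))
# ['What is Ragas?', 'Why evaluate LLM applications?']
```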