{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ca8bd0e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Adding package root to sys.path: /home/mafzaal/source/lets-talk/py-src\n",
      "Current notebook directory: /home/mafzaal/source/lets-talk/py-src/notebooks\n",
      "Project root: /home/mafzaal/source/lets-talk\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the project root to the Python path\n",
    "package_root = os.path.abspath(os.path.join(os.getcwd(), \"../\"))\n",
    "print(f\"Adding package root to sys.path: {package_root}\")\n",
    "if package_root not in sys.path:\n",
    "\tsys.path.append(package_root)\n",
    "\n",
    "notebook_dir = os.getcwd()\n",
    "print(f\"Current notebook directory: {notebook_dir}\")\n",
    "# change to the directory to the root of the project\n",
    "project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../\"))\n",
    "print(f\"Project root: {project_root}\")\n",
    "os.chdir(project_root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b48fa7d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set LANGCHAIN_TRACING_V2 to false to disable tracing\n",
    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"false\""
   ]
  },
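  {
   "cell_type": "markdown",
   "id": "f0e1d2c3",
   "metadata": {},
   "source": [
    "Aside (not executed here): to re-enable tracing later, set `LANGCHAIN_TRACING_V2` to `\"true\"` and provide a `LANGCHAIN_API_KEY` — the standard LangSmith environment variables."
   ]
  },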
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cd3c7329",
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import lets_talk.utils.blog as blog\n",
    "import lets_talk.utils.eval as eval\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1f9f2076",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<module 'lets_talk.utils.blog' from '/home/mafzaal/source/lets-talk/py-src/lets_talk/utils/blog.py'>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# hot reload the module\n",
    "import importlib\n",
    "importlib.reload(eval)\n",
    "importlib.reload(blog)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cad859be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "user_input",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "reference_contexts",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "reference",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "synthesizer_name",
         "rawType": "object",
         "type": "string"
        }
       ],
       "conversionMethod": "pd.DataFrame",
       "ref": "e3da4b39-04aa-48ef-bce2-247a3114158f",
       "rows": [
        [
         "0",
         "How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
         "['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
         "Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "1",
         "Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
         "[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
         "Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "2",
         "What specialized metrics does Ragas provide for evaluating LLM applications?",
         "[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
         "Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "3",
         "me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
         "[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
         "LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "4",
         "How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
         "['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n    \"user_input\": \"What is the capital of France?\",\\n    \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n    \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor']",
         "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "5",
         "How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
         "['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration**  \\n   Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity**  \\n   You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
         "Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "6",
         "How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
         "['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:**  \\n> Explore the hands-on notebook for this workflow:  \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)**  \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)**   \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)**  \\n**Part 5: Advanced Evaluation Techniques — _You are here_**  \\n*Next up in the series:*  \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)**  \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)**  \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\"   \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
         "Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "7",
         "How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
         "['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1.  **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2.  **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3.  **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n*   **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n    *   **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n*   **Autonomous Vehicles: Safety & Comfort Score**\\n    *   **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n    *   **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
         "Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "8",
         "How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
         "[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n    question: str\\n    context: str\\n    response: str\\n    programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n    score: float\\n    feedback: str']",
         "Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "9",
         "How do observability best practices contribute to building production-ready AI systems?",
         "['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
         "Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
         "multi_hop_abstract_query_synthesizer"
        ]
       ],
       "shape": {
        "columns": 4,
        "rows": 10
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_input</th>\n",
       "      <th>reference_contexts</th>\n",
       "      <th>reference</th>\n",
       "      <th>synthesizer_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How are Large Language Models integrated into ...</td>\n",
       "      <td>['---\\ntitle: \"Part 1: Introduction to Ragas: ...</td>\n",
       "      <td>Large Language Models (LLMs) are becoming fund...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Howw does Ragas help evalute LLM applikations ...</td>\n",
       "      <td>[\"## What is Ragas?\\n\\n[Ragas](https://docs.ra...</td>\n",
       "      <td>Ragas is an open-source evaluation framework d...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>What specialized metrics does Ragas provide fo...</td>\n",
       "      <td>[\"Evaluation serves several key purposes:\\n- *...</td>\n",
       "      <td>Ragas offers both LLM-based and computational ...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>me wanna know how LangSmith work with Ragas, l...</td>\n",
       "      <td>[\"### 🧪 Test Data Generation\\nCreating high-qu...</td>\n",
       "      <td>LangSmith is listed as one of the observabilit...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>How do I use the OPENAI API key when initializ...</td>\n",
       "      <td>['## Getting Started with Ragas\\n\\nInstalling ...</td>\n",
       "      <td>To use the OPENAI API key when initializing an...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>How does synthetic data generation contribute ...</td>\n",
       "      <td>['&lt;1-hop&gt;\\n\\n## Why and How to Generate Synthe...</td>\n",
       "      <td>Synthetic data generation enables rapid creati...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>How does Ragas support the evaluation of both ...</td>\n",
       "      <td>['&lt;1-hop&gt;\\n\\n# Create a sample\\nsample = Singl...</td>\n",
       "      <td>Ragas supports the evaluation of LLM applicati...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>How does Metric-Driven Development (MDD) utili...</td>\n",
       "      <td>['&lt;1-hop&gt;\\n\\n## What Exactly is Metric-Driven ...</td>\n",
       "      <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>How does Ragas provide specialized evaluation ...</td>\n",
       "      <td>[\"&lt;1-hop&gt;\\n\\n## What is Ragas?\\n\\n[Ragas](http...</td>\n",
       "      <td>Ragas is an open-source evaluation framework s...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>How do observability best practices contribute...</td>\n",
       "      <td>['&lt;1-hop&gt;\\n\\n## Best Practices for Observabili...</td>\n",
       "      <td>Observability best practices, such as defining...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          user_input  \\\n",
       "0  How are Large Language Models integrated into ...   \n",
       "1  Howw does Ragas help evalute LLM applikations ...   \n",
       "2  What specialized metrics does Ragas provide fo...   \n",
       "3  me wanna know how LangSmith work with Ragas, l...   \n",
       "4  How do I use the OPENAI API key when initializ...   \n",
       "5  How does synthetic data generation contribute ...   \n",
       "6  How does Ragas support the evaluation of both ...   \n",
       "7  How does Metric-Driven Development (MDD) utili...   \n",
       "8  How does Ragas provide specialized evaluation ...   \n",
       "9  How do observability best practices contribute...   \n",
       "\n",
       "                                  reference_contexts  \\\n",
       "0  ['---\\ntitle: \"Part 1: Introduction to Ragas: ...   \n",
       "1  [\"## What is Ragas?\\n\\n[Ragas](https://docs.ra...   \n",
       "2  [\"Evaluation serves several key purposes:\\n- *...   \n",
       "3  [\"### 🧪 Test Data Generation\\nCreating high-qu...   \n",
       "4  ['## Getting Started with Ragas\\n\\nInstalling ...   \n",
       "5  ['<1-hop>\\n\\n## Why and How to Generate Synthe...   \n",
       "6  ['<1-hop>\\n\\n# Create a sample\\nsample = Singl...   \n",
       "7  ['<1-hop>\\n\\n## What Exactly is Metric-Driven ...   \n",
       "8  [\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](http...   \n",
       "9  ['<1-hop>\\n\\n## Best Practices for Observabili...   \n",
       "\n",
       "                                           reference  \\\n",
       "0  Large Language Models (LLMs) are becoming fund...   \n",
       "1  Ragas is an open-source evaluation framework d...   \n",
       "2  Ragas offers both LLM-based and computational ...   \n",
       "3  LangSmith is listed as one of the observabilit...   \n",
       "4  To use the OPENAI API key when initializing an...   \n",
       "5  Synthetic data generation enables rapid creati...   \n",
       "6  Ragas supports the evaluation of LLM applicati...   \n",
       "7  Metric-Driven Development (MDD) utilizes combi...   \n",
       "8  Ragas is an open-source evaluation framework s...   \n",
       "9  Observability best practices, such as defining...   \n",
       "\n",
       "                       synthesizer_name  \n",
       "0  single_hop_specifc_query_synthesizer  \n",
       "1  single_hop_specifc_query_synthesizer  \n",
       "2  single_hop_specifc_query_synthesizer  \n",
       "3  single_hop_specifc_query_synthesizer  \n",
       "4  single_hop_specifc_query_synthesizer  \n",
       "5  multi_hop_abstract_query_synthesizer  \n",
       "6  multi_hop_abstract_query_synthesizer  \n",
       "7  multi_hop_abstract_query_synthesizer  \n",
       "8  multi_hop_abstract_query_synthesizer  \n",
       "9  multi_hop_abstract_query_synthesizer  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv(\"evals/testset_2.csv\")\n",
    "df"
   ]
  },
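  {
   "cell_type": "markdown",
   "id": "a7b3c9d1",
   "metadata": {},
   "source": [
    "The next cell is a minimal sketch (not executed in this run) of how a single row of this testset could be scored with Ragas' `Faithfulness` metric, mirroring the example quoted in the `reference_contexts` above. It assumes `OPENAI_API_KEY` is set; the `response` would normally come from your RAG pipeline, so the `reference` answer stands in only as a placeholder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c4d0e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: score one testset row with Ragas' Faithfulness metric.\n",
    "# Assumes OPENAI_API_KEY is set; imports follow the example quoted above.\n",
    "from langchain_openai import ChatOpenAI\n",
    "from ragas.llms import LangchainLLMWrapper\n",
    "from ragas.metrics import Faithfulness\n",
    "from ragas.dataset_schema import SingleTurnSample\n",
    "\n",
    "evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\n",
    "faithfulness = Faithfulness(llm=evaluator_llm)\n",
    "\n",
    "row = df.iloc[0]\n",
    "sample = SingleTurnSample(\n",
    "    user_input=row[\"user_input\"],\n",
    "    retrieved_contexts=[row[\"reference_contexts\"]],\n",
    "    response=row[\"reference\"],  # placeholder: use your pipeline's answer in practice\n",
    ")\n",
    "score = await faithfulness.single_turn_ascore(sample)  # top-level await works in Jupyter\n",
    "print(f\"Faithfulness score: {score}\")"
   ]
  },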
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4ae903d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Testset(samples=[TestsetSample(eval_sample=SingleTurnSample(user_input='How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?', retrieved_contexts=None, reference_contexts=['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?'], response=None, multi_responses=None, reference='Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.', rubrics=None), synthesizer_name='single_hop_specifc_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?', retrieved_contexts=None, reference_contexts=[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"], response=None, multi_responses=None, reference=\"Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. 
Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.\", rubrics=None), synthesizer_name='single_hop_specifc_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='What specialized metrics does Ragas provide for evaluating LLM applications?', retrieved_contexts=None, reference_contexts=[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"], response=None, multi_responses=None, reference='Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.', rubrics=None), synthesizer_name='single_hop_specifc_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?', retrieved_contexts=None, reference_contexts=[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"], response=None, multi_responses=None, reference='LangSmith is listed as one of the observability platforms that Ragas works with. 
The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.', rubrics=None), synthesizer_name='single_hop_specifc_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?', retrieved_contexts=None, reference_contexts=['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n    \"user_input\": \"What is the capital of France?\",\\n    \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n    \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor'], response=None, multi_responses=None, reference='To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).', rubrics=None), synthesizer_name='single_hop_specifc_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input=\"How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?\", retrieved_contexts=None, reference_contexts=['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration**  \\n   Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity**  \\n   You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. 
\\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership'], response=None, multi_responses=None, reference=\"Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.\", rubrics=None), synthesizer_name='multi_hop_abstract_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?', retrieved_contexts=None, reference_contexts=['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:**  \\n> Explore the hands-on notebook for this workflow:  \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. 
In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)**  \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)**   \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)**  \\n**Part 5: Advanced Evaluation Techniques — _You are here_**  \\n*Next up in the series:*  \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)**  \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)**  \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\"   \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents'], response=None, multi_responses=None, reference='Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.', rubrics=None), synthesizer_name='multi_hop_abstract_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?', retrieved_contexts=None, reference_contexts=['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1.  **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2.  **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3.  
**Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n*   **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n    *   **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n*   **Autonomous Vehicles: Safety & Comfort Score**\\n    *   **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n    *   **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.'], response=None, multi_responses=None, reference=\"Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.\", rubrics=None), synthesizer_name='multi_hop_abstract_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?', retrieved_contexts=None, reference_contexts=[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. 
For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n    question: str\\n    context: str\\n    response: str\\n    programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n    score: float\\n    feedback: str'], response=None, multi_responses=None, reference='Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.', rubrics=None), synthesizer_name='multi_hop_abstract_query_synthesizer'), TestsetSample(eval_sample=SingleTurnSample(user_input='How do observability best practices contribute to building production-ready AI systems?', retrieved_contexts=None, reference_contexts=['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. 
Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"], response=None, multi_responses=None, reference='Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.', rubrics=None), synthesizer_name='multi_hop_abstract_query_synthesizer')])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create ragas test set from the dataframe\n",
    "from ragas.testset import Testset\n",
    "import ast\n",
    "\n",
    "# Convert string representations of lists to actual Python lists\n",
    "df['reference_contexts'] = df['reference_contexts'].apply(ast.literal_eval)\n",
    "\n",
    "testset = Testset.from_pandas(df)\n",
    "testset"
   ]
  },
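  {
   "cell_type": "markdown",
   "id": "3f1a2b4c",
   "metadata": {},
   "source": [
    "Quick sanity check on the reloaded testset. CSV round-trips stringify list columns, which is why `ast.literal_eval` was applied above. The snippet below is a minimal sketch assuming the `Testset` object exposes a `samples` list of `TestsetSample` objects, each wrapping an `eval_sample` with a `user_input` field, as the repr above suggests."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c0d1e2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sanity check: confirm the testset round-tripped correctly.\n",
    "# Assumes Testset exposes `samples`, as in the repr shown above.\n",
    "print(f\"Loaded {len(testset.samples)} samples\")\n",
    "print(testset.samples[0].eval_sample.user_input)"
   ]
  },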
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6a98c358",
   "metadata": {},
   "outputs": [],
   "source": [
    "from lets_talk.rag import rag_chain\n",
    "response = rag_chain.invoke({\"question\": \"How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?\"})"
   ]
  },
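  {
   "cell_type": "markdown",
   "id": "d2f3a4b5",
   "metadata": {},
   "source": [
    "`rag_chain.invoke` returns a mapping rather than a bare string: the generated answer sits under the `response` key as a chat message object, which is why the next cell reads `.content` from it."
   ]
  },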
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8f527f07",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Large Language Models (LLMs) are integrated into modern applications by extending frameworks like Ragas with custom metrics, which allows for the creation of evaluation frameworks that measure what matters most for specific applications. This leads to more meaningful improvements and better user experiences.\\n\\nPerformance evaluation of LLMs is considered critical because it ensures that the models generate relevant and accurate responses, aligning with user queries and the context provided. Metrics such as faithfulness, answer relevancy, context relevancy, and context precision help teams make precise improvements, guiding development towards success.\\n\\nFor more information, you can explore the following links:\\n\\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)\\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\\n- [Data is King: Why Your Data Strategy IS Your Business Strategy](https://thedataguy.pro/blog/data-is-king/)'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response[\"response\"].content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "86ab0d3b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:33<00:00,  3.31s/it]\n"
     ]
    }
   ],
   "source": [
    "evalset = eval.run_rag_chain(testset,rag_chain)\n"
   ]
  },
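  {
   "cell_type": "markdown",
   "id": "b7e41c90",
   "metadata": {},
   "source": [
    "For reference, here is a minimal sketch of what a helper like `run_rag_chain` could do: invoke the chain once per testset sample and attach the answer and retrieved contexts so Ragas metrics can score them later. This is an illustrative assumption, not the actual `lets_talk.eval` implementation; the `context` output key and the `page_content` attribute are assumed from typical LangChain chains."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4d5e6f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch only -- not the actual lets_talk.eval implementation.\n",
    "from tqdm import tqdm\n",
    "\n",
    "def run_rag_chain_sketch(testset, chain):\n",
    "    \"\"\"Invoke `chain` for every sample and record its outputs in place.\"\"\"\n",
    "    for sample in tqdm(testset.samples):\n",
    "        out = chain.invoke({\"question\": sample.eval_sample.user_input})\n",
    "        sample.eval_sample.response = out[\"response\"].content\n",
    "        # `context` is an assumed output key; adapt to your chain's schema\n",
    "        sample.eval_sample.retrieved_contexts = [\n",
    "            doc.page_content for doc in out.get(\"context\", [])\n",
    "        ]\n",
    "    return testset"
   ]
  },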
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "704669a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "user_input",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "retrieved_contexts",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "reference_contexts",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "response",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "reference",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "synthesizer_name",
         "rawType": "object",
         "type": "string"
        }
       ],
       "conversionMethod": "pd.DataFrame",
       "ref": "b2e8379b-091e-4072-a11b-fbb83cabe0fc",
       "rows": [
        [
         "0",
         "How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"* **Faithfulness**: Measures how well the generated answer aligns with the retrieved context\\n* **Answer Relevancy**: Evaluates whether the response correctly addresses the user's query\\n* **Context Relevancy**: Assesses if the system retrieves information that's actually needed\\n* **Context Precision**: Quantifies how efficiently the system uses retrieved information\\n\\nThese targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most.\\nImagine two teams building a new feature for a streaming platform:\"]",
         "['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
         "Large Language Models (LLMs) are integrated into modern applications by extending frameworks like Ragas with custom metrics, which allows for the creation of evaluation frameworks that measure what matters most for specific applications. This leads to more meaningful improvements and better user experiences.\n\nPerformance evaluation of LLMs is considered critical because it ensures that the models generate relevant and accurate responses, aligning with user queries and the context provided. Targeted metrics such as faithfulness, answer relevancy, context relevancy, and context precision help teams make precise improvements, enhancing the overall effectiveness of the applications.\n\nFor more information, you can explore the following links:\n\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Data is King: Why Your Data Strategy IS Your Business Strategy](https://thedataguy.pro/blog/data-is-king/)",
         "Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "1",
         "Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"These columns enable comprehensive evaluation by linking each question to its supporting evidence and expected answer, while also providing insight into the diversity and difficulty of the generated queries.\\n\\n\\n## Deep Dive into Test Data Generation\\n\\nSo you have a collection of documents and want to create a robust evaluation dataset for your RAG system using Ragas. The `TestsetGenerator`'s `generate_with_langchain_docs` method is your starting point. But what exactly happens when you call it? Let's peek under the hood.\\n\\n**The Goal:** To take raw Langchain `Document` objects and transform them into a structured Ragas `Testset` containing diverse question-answer pairs grounded in those documents.\\n\\n**The Workflow:**\"]",
         "[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
         "I don't know.",
         "Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "2",
         "What specialized metrics does Ragas provide for evaluating LLM applications?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"## Conclusion\\n\\nEvaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas' `agent_goal_accuracy`, `tool_call_accuracy`, and `topic_adherence` provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries.\\n\\nBy incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\"]",
         "[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
         "Ragas provides specialized metrics for evaluating LLM applications, including:\n\n- **Agent Goal Accuracy**: Measures how well the AI agent achieves its intended goals.\n- **Tool Call Accuracy**: Assesses the correctness of the tools used by the agent.\n- **Topic Adherence**: Evaluates whether the agent stays within the designated topics during interactions.\n\nThese metrics help in building more reliable and effective AI agents.\n\nFor more information, you can explore the following links:\n- [Evaluating AI Agents: Beyond Simple Answers with Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
         "Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "3",
         "me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"### Ready to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the [official documentation](https://docs.ragas.io/en/stable/). Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we'd love to help!\", \"*How are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you're facing specific evaluation hurdles, don't hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we'd love to help!*\", \"*How are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you're facing specific evaluation hurdles, don't hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we'd love to help!*\"]",
         "[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
         "I don't know.",
         "LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "4",
         "How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '## Implementing Agent Evaluation in Practice\\n\\nLet\\'s look at a practical example of evaluating an AI agent using these metrics:\\n\\n```python\\nfrom ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import MultiTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\n\\n# Initialize the LLM\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))', '*   **`conditional`:** Creates questions with \"if/then\" clauses based on information in the graph.\\n*   **Generation Process:** `self.generate()` calculates how many questions of each type to create based on the `testset_size` and the distribution probabilities. It then uses an `Executor` to run the appropriate synthesizers, generating the final `TestsetSample` objects that make up your evaluation dataset.']",
         "['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n    \"user_input\": \"What is the capital of France?\",\\n    \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n    \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor']",
         "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you typically set the API key in your environment or directly in your code before initializing the LLM. Here's a basic example of how to do this:\n\n```python\nimport os\nfrom langchain_openai import ChatOpenAI\n\n# Set your OpenAI API key\nos.environ[\"OPENAI_API_KEY\"] = \"your_api_key_here\"\n\n# Initialize the LLM\nevaluator_llm = ChatOpenAI(model=\"gpt-4o\")\n```\n\nMake sure to replace `\"your_api_key_here\"` with your actual OpenAI API key.\n\nFor more detailed information on evaluating AI agents with Ragas, you can refer to the following link:\n\n- [Evaluating AI Agents: Beyond Simple Answers with Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
         "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
         "single_hop_specifc_query_synthesizer"
        ],
        [
         "5",
         "How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"* **Faithfulness**: Measures how well the generated answer aligns with the retrieved context\\n* **Answer Relevancy**: Evaluates whether the response correctly addresses the user's query\\n* **Context Relevancy**: Assesses if the system retrieves information that's actually needed\\n* **Context Precision**: Quantifies how efficiently the system uses retrieved information\\n\\nThese targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most.\\nImagine two teams building a new feature for a streaming platform:\", 'Essentially, the default transformations build a knowledge graph populated with embedded, filtered document chunks and corresponding simple, extractive question-answer pairs.\\n\\n**Spotlight: Query Synthesizers (via `self.generate()` and `default_query_distribution`)**\\n\\nThe `self.generate()` method, called by `generate_with_langchain_docs`, is responsible for taking the foundational graph and creating the final, potentially complex, test questions using **Query Synthesizers** (also referred to as \"evolutions\" or \"scenarios\").']",
         "['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration**  \\n   Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity**  \\n   You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
         "I don't know.",
         "Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "6",
         "How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"## Conclusion\\n\\nEvaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas' `agent_goal_accuracy`, `tool_call_accuracy`, and `topic_adherence` provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries.\\n\\nBy incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\"]",
         "['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:**  \\n> Explore the hands-on notebook for this workflow:  \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)**  \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)**   \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)**  \\n**Part 5: Advanced Evaluation Techniques — _You are here_**  \\n*Next up in the series:*  \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)**  \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)**  \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\"   \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
         "Ragas supports the evaluation of LLM applications by allowing the extension of custom metrics, which helps create evaluation frameworks that measure what matters most for these applications. This leads to more meaningful improvements and better user experiences.\n\nFor evaluating AI agents, Ragas introduces specialized metrics such as `agent_goal_accuracy`, `tool_call_accuracy`, and `topic_adherence`. These metrics provide insights into an agent's ability to complete tasks, use tools correctly, and stay within designated boundaries, enhancing the reliability and effectiveness of AI agents.\n\nFor more information, you can explore the following links:\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Evaluating AI Agents: Beyond Simple Answers with Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
         "Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "7",
         "How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"* **Faithfulness**: Measures how well the generated answer aligns with the retrieved context\\n* **Answer Relevancy**: Evaluates whether the response correctly addresses the user's query\\n* **Context Relevancy**: Assesses if the system retrieves information that's actually needed\\n* **Context Precision**: Quantifies how efficiently the system uses retrieved information\\n\\nThese targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most.\\nImagine two teams building a new feature for a streaming platform:\"]",
         "['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1.  **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2.  **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3.  **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n*   **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n    *   **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n*   **Autonomous Vehicles: Safety & Comfort Score**\\n    *   **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n    *   **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
         "Metric-Driven Development (MDD) utilizes combined performance metrics to set clear, measurable goals that guide actions towards successful project outcomes. By focusing on specific metrics, teams can make informed decisions and improvements based on precise evaluations rather than general assessments.\n\n### Examples of Metrics in MDD:\n- **Faithfulness**: Measures alignment of generated answers with retrieved context.\n- **Answer Relevancy**: Evaluates if the response addresses the user's query correctly.\n- **Context Relevancy**: Assesses if the retrieved information is necessary.\n- **Context Precision**: Quantifies the efficiency of using retrieved information.\n\nThese metrics help teams identify areas for improvement and ensure that development efforts are aligned with user needs and project goals.\n\nFor more information, you can explore the following links:\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Data is King: Why Your Data Strategy IS Your Business Strategy](https://thedataguy.pro/blog/data-is-king/)",
         "Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "8",
         "How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', '*Have you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we’d love to help!*']",
         "[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n    question: str\\n    context: str\\n    response: str\\n    programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n    score: float\\n    feedback: str']",
         "Ragas provides specialized evaluation metrics for LLMs (Large Language Models) by allowing users to extend its framework with custom metrics tailored to specific needs. This customization enables the creation of evaluation frameworks that can measure what matters most for LLM applications, leading to more meaningful improvements and better user experiences.\n\nTo create a custom metric to assess technical accuracy in programming explanations, the following steps are typically involved:\n\n1. **Define the Metric**: Clearly outline what constitutes technical accuracy in the context of programming explanations.\n2. **Data Collection**: Gather relevant data that will be used to evaluate the programming explanations.\n3. **Metric Implementation**: Use Ragas to implement the custom metric, ensuring it aligns with the defined criteria for technical accuracy.\n4. **Testing and Validation**: Test the metric on sample data to validate its effectiveness and make adjustments as necessary.\n5. **Integration**: Integrate the custom metric into the evaluation workflow for ongoing assessments.\n\nFor more detailed information, you can explore the following links:\n\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Basic Evaluation Workflow with Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)",
         "Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
         "multi_hop_abstract_query_synthesizer"
        ],
        [
         "9",
         "How do observability best practices contribute to building production-ready AI systems?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', '## Creating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\n## Best Practices for Observability']",
         "['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
         "Observability best practices contribute to building production-ready AI systems by enabling continuous monitoring and evaluation of system performance. They help in identifying issues, understanding system behavior, and ensuring that the AI models are functioning as intended. By implementing custom dashboards and automated evaluation workflows, developers can gain insights into performance trends and areas for improvement, leading to more reliable and robust AI systems.\n\nFor more information, you can explore the following resources:\n\n- [Integrations and Observability with Ragas](https://thedataguy.pro/blog/integrations-and-observability-with-ragas/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
         "Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
         "multi_hop_abstract_query_synthesizer"
        ]
       ],
       "shape": {
        "columns": 6,
        "rows": 10
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_input</th>\n",
       "      <th>retrieved_contexts</th>\n",
       "      <th>reference_contexts</th>\n",
       "      <th>response</th>\n",
       "      <th>reference</th>\n",
       "      <th>synthesizer_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How are Large Language Models integrated into ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[---\\ntitle: \"Part 1: Introduction to Ragas: T...</td>\n",
       "      <td>Large Language Models (LLMs) are integrated in...</td>\n",
       "      <td>Large Language Models (LLMs) are becoming fund...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Howw does Ragas help evalute LLM applikations ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[## What is Ragas?\\n\\n[Ragas](https://docs.rag...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>Ragas is an open-source evaluation framework d...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>What specialized metrics does Ragas provide fo...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[Evaluation serves several key purposes:\\n- **...</td>\n",
       "      <td>Ragas provides specialized metrics for evaluat...</td>\n",
       "      <td>Ragas offers both LLM-based and computational ...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>me wanna know how LangSmith work with Ragas, l...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[### 🧪 Test Data Generation\\nCreating high-qua...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>LangSmith is listed as one of the observabilit...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>How do I use the OPENAI API key when initializ...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[## Getting Started with Ragas\\n\\nInstalling R...</td>\n",
       "      <td>To use the OPENAI API key when initializing an...</td>\n",
       "      <td>To use the OPENAI API key when initializing an...</td>\n",
       "      <td>single_hop_specifc_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>How does synthetic data generation contribute ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## Why and How to Generate Synthet...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>Synthetic data generation enables rapid creati...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>How does Ragas support the evaluation of both ...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n# Create a sample\\nsample = Single...</td>\n",
       "      <td>Ragas supports the evaluation of LLM applicati...</td>\n",
       "      <td>Ragas supports the evaluation of LLM applicati...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>How does Metric-Driven Development (MDD) utili...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## What Exactly is Metric-Driven D...</td>\n",
       "      <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
       "      <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>How does Ragas provide specialized evaluation ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## What is Ragas?\\n\\n[Ragas](https...</td>\n",
       "      <td>Ragas provides specialized evaluation metrics ...</td>\n",
       "      <td>Ragas is an open-source evaluation framework s...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>How do observability best practices contribute...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## Best Practices for Observabilit...</td>\n",
       "      <td>Observability best practices contribute to bui...</td>\n",
       "      <td>Observability best practices, such as defining...</td>\n",
       "      <td>multi_hop_abstract_query_synthesizer</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          user_input  \\\n",
       "0  How are Large Language Models integrated into ...   \n",
       "1  Howw does Ragas help evalute LLM applikations ...   \n",
       "2  What specialized metrics does Ragas provide fo...   \n",
       "3  me wanna know how LangSmith work with Ragas, l...   \n",
       "4  How do I use the OPENAI API key when initializ...   \n",
       "5  How does synthetic data generation contribute ...   \n",
       "6  How does Ragas support the evaluation of both ...   \n",
       "7  How does Metric-Driven Development (MDD) utili...   \n",
       "8  How does Ragas provide specialized evaluation ...   \n",
       "9  How do observability best practices contribute...   \n",
       "\n",
       "                                  retrieved_contexts  \\\n",
       "0  [Reward functions embody the core MDD idea: se...   \n",
       "1  [Reward functions embody the core MDD idea: se...   \n",
       "2  [By extending Ragas with custom metrics, you c...   \n",
       "3  [Reward functions embody the core MDD idea: se...   \n",
       "4  [By extending Ragas with custom metrics, you c...   \n",
       "5  [Reward functions embody the core MDD idea: se...   \n",
       "6  [By extending Ragas with custom metrics, you c...   \n",
       "7  [Reward functions embody the core MDD idea: se...   \n",
       "8  [Reward functions embody the core MDD idea: se...   \n",
       "9  [By extending Ragas with custom metrics, you c...   \n",
       "\n",
       "                                  reference_contexts  \\\n",
       "0  [---\\ntitle: \"Part 1: Introduction to Ragas: T...   \n",
       "1  [## What is Ragas?\\n\\n[Ragas](https://docs.rag...   \n",
       "2  [Evaluation serves several key purposes:\\n- **...   \n",
       "3  [### 🧪 Test Data Generation\\nCreating high-qua...   \n",
       "4  [## Getting Started with Ragas\\n\\nInstalling R...   \n",
       "5  [<1-hop>\\n\\n## Why and How to Generate Synthet...   \n",
       "6  [<1-hop>\\n\\n# Create a sample\\nsample = Single...   \n",
       "7  [<1-hop>\\n\\n## What Exactly is Metric-Driven D...   \n",
       "8  [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https...   \n",
       "9  [<1-hop>\\n\\n## Best Practices for Observabilit...   \n",
       "\n",
       "                                            response  \\\n",
       "0  Large Language Models (LLMs) are integrated in...   \n",
       "1                                      I don't know.   \n",
       "2  Ragas provides specialized metrics for evaluat...   \n",
       "3                                      I don't know.   \n",
       "4  To use the OPENAI API key when initializing an...   \n",
       "5                                      I don't know.   \n",
       "6  Ragas supports the evaluation of LLM applicati...   \n",
       "7  Metric-Driven Development (MDD) utilizes combi...   \n",
       "8  Ragas provides specialized evaluation metrics ...   \n",
       "9  Observability best practices contribute to bui...   \n",
       "\n",
       "                                           reference  \\\n",
       "0  Large Language Models (LLMs) are becoming fund...   \n",
       "1  Ragas is an open-source evaluation framework d...   \n",
       "2  Ragas offers both LLM-based and computational ...   \n",
       "3  LangSmith is listed as one of the observabilit...   \n",
       "4  To use the OPENAI API key when initializing an...   \n",
       "5  Synthetic data generation enables rapid creati...   \n",
       "6  Ragas supports the evaluation of LLM applicati...   \n",
       "7  Metric-Driven Development (MDD) utilizes combi...   \n",
       "8  Ragas is an open-source evaluation framework s...   \n",
       "9  Observability best practices, such as defining...   \n",
       "\n",
       "                       synthesizer_name  \n",
       "0  single_hop_specifc_query_synthesizer  \n",
       "1  single_hop_specifc_query_synthesizer  \n",
       "2  single_hop_specifc_query_synthesizer  \n",
       "3  single_hop_specifc_query_synthesizer  \n",
       "4  single_hop_specifc_query_synthesizer  \n",
       "5  multi_hop_abstract_query_synthesizer  \n",
       "6  multi_hop_abstract_query_synthesizer  \n",
       "7  multi_hop_abstract_query_synthesizer  \n",
       "8  multi_hop_abstract_query_synthesizer  \n",
       "9  multi_hop_abstract_query_synthesizer  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eval_df = evalset.to_pandas()\n",
    "eval_df"
   ]
  },
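  {
   "cell_type": "markdown",
   "id": "a1b2c3d4",
   "metadata": {},
   "source": [
    "Before running the evaluation, it is worth checking the mix of generated query types. A minimal sketch using the dataframe above (`synthesizer_name` is one of its columns):\n",
    "\n",
    "```python\n",
    "# Count single-hop vs. multi-hop queries in the testset\n",
    "eval_df[\"synthesizer_name\"].value_counts()\n",
    "```"
   ]
  },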
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f5d50d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "eval_df.to_csv(\"evals/rag_eval_ft.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "fb7d4a45",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d36d168762624026a5fdb58092c85d34",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29361, Requested 1237. Please try again in 1.196s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29046, Requested 1507. Please try again in 1.106s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29426, Requested 1328. Please try again in 1.508s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[38]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29960, Requested 1295. Please try again in 2.51s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[2]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 28951, Requested 1281. Please try again in 464ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[5]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29345, Requested 1292. Please try again in 1.274s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29680, Requested 1601. Please try again in 2.562s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29157, Requested 1528. Please try again in 1.37s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29024, Requested 1639. Please try again in 1.326s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29890, Requested 1515. Please try again in 2.81s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[44]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 28950, Requested 1456. Please try again in 812ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
      "Exception raised in Job[42]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29619, Requested 1541. Please try again in 2.32s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
     ]
    }
   ],
   "source": [
    "result = eval.run_ragas_evaluation(evalset)\n"
   ]
  },
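  {
   "cell_type": "markdown",
   "id": "b5e6f7a8",
   "metadata": {},
   "source": [
    "The `RateLimitError` messages above mean some jobs exceeded the OpenAI tokens-per-minute quota; jobs that fail for good show up as `NaN` scores in the per-row results. Throttling concurrency with a `RunConfig` avoids most of these errors. A minimal sketch, assuming `ragas.evaluate` is called directly rather than through this notebook's `eval.run_ragas_evaluation` helper:\n",
    "\n",
    "```python\n",
    "from ragas import RunConfig, evaluate\n",
    "\n",
    "# Fewer parallel workers plus more retries keeps token usage under the TPM limit\n",
    "run_config = RunConfig(max_workers=4, max_retries=10, timeout=180)\n",
    "# result = evaluate(dataset=evalset, run_config=run_config)  # hypothetical call site\n",
    "```"
   ]
  },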
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "74aab82a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'context_recall': 0.1905, 'faithfulness': 0.6923, 'factual_correctness(mode=f1)': 0.2957, 'answer_relevancy': 0.6845, 'context_entity_recall': 0.2130, 'noise_sensitivity(mode=relevant)': 0.2478}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "49fa29f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "result.to_pandas().to_csv(\"evals/rag_eval_result_ft.csv\",index=False)"
   ]
  },
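  {
   "cell_type": "markdown",
   "id": "c9d0e1f2",
   "metadata": {},
   "source": [
    "Aggregate scores hide which questions fail. Sorting the per-row results surfaces the weakest samples; a minimal sketch (column names match the result dataframe displayed below):\n",
    "\n",
    "```python\n",
    "result_df = result.to_pandas()\n",
    "cols = [\"user_input\", \"context_recall\", \"faithfulness\", \"factual_correctness(mode=f1)\"]\n",
    "# NaN scores (e.g. from rate-limited jobs) sort first so they are easy to spot\n",
    "result_df.sort_values(\"faithfulness\", na_position=\"first\")[cols].head()\n",
    "```"
   ]
  },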
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "90e11932",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "user_input",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "retrieved_contexts",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "reference_contexts",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "response",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "reference",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "context_recall",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "faithfulness",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "factual_correctness(mode=f1)",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "answer_relevancy",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "context_entity_recall",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "noise_sensitivity(mode=relevant)",
         "rawType": "float64",
         "type": "float"
        }
       ],
       "conversionMethod": "pd.DataFrame",
       "ref": "cab268a1-2c05-4d8c-bc7c-3daa528696b9",
       "rows": [
        [
         "0",
         "How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"* **Faithfulness**: Measures how well the generated answer aligns with the retrieved context\\n* **Answer Relevancy**: Evaluates whether the response correctly addresses the user's query\\n* **Context Relevancy**: Assesses if the system retrieves information that's actually needed\\n* **Context Precision**: Quantifies how efficiently the system uses retrieved information\\n\\nThese targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most.\\nImagine two teams building a new feature for a streaming platform:\"]",
         "['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
         "Large Language Models (LLMs) are integrated into modern applications by extending frameworks like Ragas with custom metrics, which allows for the creation of evaluation frameworks that measure what matters most for specific applications. This leads to more meaningful improvements and better user experiences.\n\nPerformance evaluation of LLMs is considered critical because it ensures that the models generate relevant and accurate responses, aligning with user queries and the context provided. Targeted metrics such as faithfulness, answer relevancy, context relevancy, and context precision help teams make precise improvements, enhancing the overall effectiveness of the applications.\n\nFor more information, you can explore the following links:\n\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Data is King: Why Your Data Strategy IS Your Business Strategy](https://thedataguy.pro/blog/data-is-king/)",
         "Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
         "0.5",
         "0.4444444444444444",
         null,
         "0.9648619592096089",
         "0.12499999984374999",
         null
        ],
        [
         "1",
         "Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"These columns enable comprehensive evaluation by linking each question to its supporting evidence and expected answer, while also providing insight into the diversity and difficulty of the generated queries.\\n\\n\\n## Deep Dive into Test Data Generation\\n\\nSo you have a collection of documents and want to create a robust evaluation dataset for your RAG system using Ragas. The `TestsetGenerator`'s `generate_with_langchain_docs` method is your starting point. But what exactly happens when you call it? Let's peek under the hood.\\n\\n**The Goal:** To take raw Langchain `Document` objects and transform them into a structured Ragas `Testset` containing diverse question-answer pairs grounded in those documents.\\n\\n**The Workflow:**\"]",
         "[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
         "I don't know.",
         "Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
         "0.5",
         "1.0",
         "0.0",
         "0.0",
         "0.33333333277777777",
         null
        ],
        [
         "2",
         "What specialized metrics does Ragas provide for evaluating LLM applications?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"## Conclusion\\n\\nEvaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas' `agent_goal_accuracy`, `tool_call_accuracy`, and `topic_adherence` provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries.\\n\\nBy incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\"]",
         "[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
         "Ragas provides specialized metrics for evaluating LLM applications, including:\n\n- **Agent Goal Accuracy**: Measures how well the AI agent achieves its intended goals.\n- **Tool Call Accuracy**: Assesses the correctness of the tools used by the agent.\n- **Topic Adherence**: Evaluates whether the agent stays within the designated topics during interactions.\n\nThese metrics help in building more reliable and effective AI agents.\n\nFor more information, you can explore the following links:\n- [Evaluating AI Agents: Beyond Simple Answers with Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
         "Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
         "0.0",
         null,
         "0.4",
         "1.0",
         "0.24999999968749997",
         "0.625"
        ],
        [
         "3",
         "me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"### Ready to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the [official documentation](https://docs.ragas.io/en/stable/). Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we'd love to help!\", \"*How are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you're facing specific evaluation hurdles, don't hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we'd love to help!*\", \"*How are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you're facing specific evaluation hurdles, don't hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we'd love to help!*\"]",
         "[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
         "I don't know.",
         "LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
         "0.0",
         "1.0",
         "0.5",
         "0.0",
         "0.249999999375",
         "0.25"
        ],
        [
         "4",
         "How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '## Implementing Agent Evaluation in Practice\\n\\nLet\\'s look at a practical example of evaluating an AI agent using these metrics:\\n\\n```python\\nfrom ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import MultiTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\n\\n# Initialize the LLM\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))', '*   **`conditional`:** Creates questions with \"if/then\" clauses based on information in the graph.\\n*   **Generation Process:** `self.generate()` calculates how many questions of each type to create based on the `testset_size` and the distribution probabilities. It then uses an `Executor` to run the appropriate synthesizers, generating the final `TestsetSample` objects that make up your evaluation dataset.']",
         "['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n    \"user_input\": \"What is the capital of France?\",\\n    \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n    \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor']",
         "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you typically set the API key in your environment or directly in your code before initializing the LLM. Here's a basic example of how to do this:\n\n```python\nimport os\nfrom langchain_openai import ChatOpenAI\n\n# Set your OpenAI API key\nos.environ[\"OPENAI_API_KEY\"] = \"your_api_key_here\"\n\n# Initialize the LLM\nevaluator_llm = ChatOpenAI(model=\"gpt-4o\")\n```\n\nMake sure to replace `\"your_api_key_here\"` with your actual OpenAI API key.\n\nFor more detailed information on evaluating AI agents with Ragas, you can refer to the following link:\n\n- [Evaluating AI Agents: Beyond Simple Answers with Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
         "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
         "0.0",
         null,
         "0.22",
         "0.9947663251451272",
         "0.571428570612245",
         "0.0"
        ],
        [
         "5",
         "How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"* **Faithfulness**: Measures how well the generated answer aligns with the retrieved context\\n* **Answer Relevancy**: Evaluates whether the response correctly addresses the user's query\\n* **Context Relevancy**: Assesses if the system retrieves information that's actually needed\\n* **Context Precision**: Quantifies how efficiently the system uses retrieved information\\n\\nThese targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most.\\nImagine two teams building a new feature for a streaming platform:\", 'Essentially, the default transformations build a knowledge graph populated with embedded, filtered document chunks and corresponding simple, extractive question-answer pairs.\\n\\n**Spotlight: Query Synthesizers (via `self.generate()` and `default_query_distribution`)**\\n\\nThe `self.generate()` method, called by `generate_with_langchain_docs`, is responsible for taking the foundational graph and creating the final, potentially complex, test questions using **Query Synthesizers** (also referred to as \"evolutions\" or \"scenarios\").']",
         "['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration**  \\n   Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity**  \\n   You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
         "I don't know.",
         "Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
         null,
         null,
         "0.0",
         "0.0",
         "0.0",
         "0.0"
        ],
        [
         "6",
         "How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"## Conclusion\\n\\nEvaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas' `agent_goal_accuracy`, `tool_call_accuracy`, and `topic_adherence` provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries.\\n\\nBy incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\"]",
         "['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data)  # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:**  \\n> Explore the hands-on notebook for this workflow:  \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)**  \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)**  \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)**   \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)**  \\n**Part 5: Advanced Evaluation Techniques — _You are here_**  \\n*Next up in the series:*  \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)**  \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)**  \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\"   \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
         "Ragas supports the evaluation of LLM applications by allowing the extension of custom metrics, which helps create evaluation frameworks that measure what matters most for these applications. This leads to more meaningful improvements and better user experiences.\n\nFor evaluating AI agents, Ragas introduces specialized metrics such as `agent_goal_accuracy`, `tool_call_accuracy`, and `topic_adherence`. These metrics provide insights into an agent's ability to complete tasks, use tools correctly, and stay within designated boundaries, enhancing the reliability and effectiveness of AI agents.\n\nFor more information, you can explore the following links:\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Evaluating AI Agents: Beyond Simple Answers with Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
         "Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
         null,
         null,
         null,
         "0.9734328051066976",
         "0.29999999969999996",
         "0.4166666666666667"
        ],
        [
         "7",
         "How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', \"* **Faithfulness**: Measures how well the generated answer aligns with the retrieved context\\n* **Answer Relevancy**: Evaluates whether the response correctly addresses the user's query\\n* **Context Relevancy**: Assesses if the system retrieves information that's actually needed\\n* **Context Precision**: Quantifies how efficiently the system uses retrieved information\\n\\nThese targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most.\\nImagine two teams building a new feature for a streaming platform:\"]",
         "['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1.  **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2.  **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3.  **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n*   **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n    *   **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n*   **Autonomous Vehicles: Safety & Comfort Score**\\n    *   **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n    *   **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
         "Metric-Driven Development (MDD) utilizes combined performance metrics to set clear, measurable goals that guide actions towards successful project outcomes. By focusing on specific metrics, teams can make informed decisions and improvements based on precise evaluations rather than general assessments.\n\n### Examples of Metrics in MDD:\n- **Faithfulness**: Measures alignment of generated answers with retrieved context.\n- **Answer Relevancy**: Evaluates if the response addresses the user's query correctly.\n- **Context Relevancy**: Assesses if the retrieved information is necessary.\n- **Context Precision**: Quantifies the efficiency of using retrieved information.\n\nThese metrics help teams identify areas for improvement and ensure that development efforts are aligned with user needs and project goals.\n\nFor more information, you can explore the following links:\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Data is King: Why Your Data Strategy IS Your Business Strategy](https://thedataguy.pro/blog/data-is-king/)",
         "Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
         null,
         "0.8",
         null,
         "0.9649751124291034",
         "0.16666666652777776",
         "0.5"
        ],
        [
         "8",
         "How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
         "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', '*Have you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to [reach out](https://www.linkedin.com/in/muhammadafzaal/)—we’d love to help!*']",
         "[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n    question: str\\n    context: str\\n    response: str\\n    programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n    score: float\\n    feedback: str']",
         "Ragas provides specialized evaluation metrics for LLMs (Large Language Models) by allowing users to extend its framework with custom metrics tailored to specific needs. This customization enables the creation of evaluation frameworks that can measure what matters most for LLM applications, leading to more meaningful improvements and better user experiences.\n\nTo create a custom metric to assess technical accuracy in programming explanations, the following steps are typically involved:\n\n1. **Define the Metric**: Clearly outline what constitutes technical accuracy in the context of programming explanations.\n2. **Data Collection**: Gather relevant data that will be used to evaluate the programming explanations.\n3. **Metric Implementation**: Use Ragas to implement the custom metric, ensuring it aligns with the defined criteria for technical accuracy.\n4. **Testing and Validation**: Test the metric on sample data to validate its effectiveness and make adjustments as necessary.\n5. **Integration**: Integrate the custom metric into the evaluation workflow for ongoing assessments.\n\nFor more detailed information, you can explore the following links:\n\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Basic Evaluation Workflow with Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)",
         "Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
         "0.3333333333333333",
         "0.2727272727272727",
         "0.45",
         "0.9471243648744894",
         "0.13333333324444444",
         "0.09090909090909091"
        ],
        [
         "9",
         "How do observability best practices contribute to building production-ready AI systems?",
         "[\"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '3. **Organization & Governance**: Establishing metadata frameworks, quality control processes, and governance structures that make data discoverable and trustworthy.\\n\\n4. **Insight Extraction**: Applying analytics techniques from basic reporting to advanced machine learning that convert data into actionable business intelligence.', '## Creating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\n## Best Practices for Observability']",
         "['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
         "Observability best practices contribute to building production-ready AI systems by enabling continuous monitoring and evaluation of system performance. They help in identifying issues, understanding system behavior, and ensuring that the AI models are functioning as intended. By implementing custom dashboards and automated evaluation workflows, developers can gain insights into performance trends and areas for improvement, leading to more reliable and robust AI systems.\n\nFor more information, you can explore the following resources:\n\n- [Integrations and Observability with Ragas](https://thedataguy.pro/blog/integrations-and-observability-with-ragas/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
         "Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
         "0.0",
         "0.6363636363636364",
         "0.5",
         "0.9999999999999997",
         "0.0",
         "0.1"
        ]
       ],
       "shape": {
        "columns": 11,
        "rows": 10
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_input</th>\n",
       "      <th>retrieved_contexts</th>\n",
       "      <th>reference_contexts</th>\n",
       "      <th>response</th>\n",
       "      <th>reference</th>\n",
       "      <th>context_recall</th>\n",
       "      <th>faithfulness</th>\n",
       "      <th>factual_correctness(mode=f1)</th>\n",
       "      <th>answer_relevancy</th>\n",
       "      <th>context_entity_recall</th>\n",
       "      <th>noise_sensitivity(mode=relevant)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How are Large Language Models integrated into ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[---\\ntitle: \"Part 1: Introduction to Ragas: T...</td>\n",
       "      <td>Large Language Models (LLMs) are integrated in...</td>\n",
       "      <td>Large Language Models (LLMs) are becoming fund...</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.964862</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Howw does Ragas help evalute LLM applikations ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[## What is Ragas?\\n\\n[Ragas](https://docs.rag...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>Ragas is an open-source evaluation framework d...</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>What specialized metrics does Ragas provide fo...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[Evaluation serves several key purposes:\\n- **...</td>\n",
       "      <td>Ragas provides specialized metrics for evaluat...</td>\n",
       "      <td>Ragas offers both LLM-based and computational ...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.40</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.625000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>me wanna know how LangSmith work with Ragas, l...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[### 🧪 Test Data Generation\\nCreating high-qua...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>LangSmith is listed as one of the observabilit...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.250000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>How do I use the OPENAI API key when initializ...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[## Getting Started with Ragas\\n\\nInstalling R...</td>\n",
       "      <td>To use the OPENAI API key when initializing an...</td>\n",
       "      <td>To use the OPENAI API key when initializing an...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.994766</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>How does synthetic data generation contribute ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## Why and How to Generate Synthet...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>Synthetic data generation enables rapid creati...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>How does Ragas support the evaluation of both ...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n# Create a sample\\nsample = Single...</td>\n",
       "      <td>Ragas supports the evaluation of LLM applicati...</td>\n",
       "      <td>Ragas supports the evaluation of LLM applicati...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.973433</td>\n",
       "      <td>0.300000</td>\n",
       "      <td>0.416667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>How does Metric-Driven Development (MDD) utili...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## What Exactly is Metric-Driven D...</td>\n",
       "      <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
       "      <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.964975</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>How does Ragas provide specialized evaluation ...</td>\n",
       "      <td>[Reward functions embody the core MDD idea: se...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## What is Ragas?\\n\\n[Ragas](https...</td>\n",
       "      <td>Ragas provides specialized evaluation metrics ...</td>\n",
       "      <td>Ragas is an open-source evaluation framework s...</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.272727</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.947124</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.090909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>How do observability best practices contribute...</td>\n",
       "      <td>[By extending Ragas with custom metrics, you c...</td>\n",
       "      <td>[&lt;1-hop&gt;\\n\\n## Best Practices for Observabilit...</td>\n",
       "      <td>Observability best practices contribute to bui...</td>\n",
       "      <td>Observability best practices, such as defining...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.636364</td>\n",
       "      <td>0.50</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.100000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          user_input  \\\n",
       "0  How are Large Language Models integrated into ...   \n",
       "1  Howw does Ragas help evalute LLM applikations ...   \n",
       "2  What specialized metrics does Ragas provide fo...   \n",
       "3  me wanna know how LangSmith work with Ragas, l...   \n",
       "4  How do I use the OPENAI API key when initializ...   \n",
       "5  How does synthetic data generation contribute ...   \n",
       "6  How does Ragas support the evaluation of both ...   \n",
       "7  How does Metric-Driven Development (MDD) utili...   \n",
       "8  How does Ragas provide specialized evaluation ...   \n",
       "9  How do observability best practices contribute...   \n",
       "\n",
       "                                  retrieved_contexts  \\\n",
       "0  [Reward functions embody the core MDD idea: se...   \n",
       "1  [Reward functions embody the core MDD idea: se...   \n",
       "2  [By extending Ragas with custom metrics, you c...   \n",
       "3  [Reward functions embody the core MDD idea: se...   \n",
       "4  [By extending Ragas with custom metrics, you c...   \n",
       "5  [Reward functions embody the core MDD idea: se...   \n",
       "6  [By extending Ragas with custom metrics, you c...   \n",
       "7  [Reward functions embody the core MDD idea: se...   \n",
       "8  [Reward functions embody the core MDD idea: se...   \n",
       "9  [By extending Ragas with custom metrics, you c...   \n",
       "\n",
       "                                  reference_contexts  \\\n",
       "0  [---\\ntitle: \"Part 1: Introduction to Ragas: T...   \n",
       "1  [## What is Ragas?\\n\\n[Ragas](https://docs.rag...   \n",
       "2  [Evaluation serves several key purposes:\\n- **...   \n",
       "3  [### 🧪 Test Data Generation\\nCreating high-qua...   \n",
       "4  [## Getting Started with Ragas\\n\\nInstalling R...   \n",
       "5  [<1-hop>\\n\\n## Why and How to Generate Synthet...   \n",
       "6  [<1-hop>\\n\\n# Create a sample\\nsample = Single...   \n",
       "7  [<1-hop>\\n\\n## What Exactly is Metric-Driven D...   \n",
       "8  [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https...   \n",
       "9  [<1-hop>\\n\\n## Best Practices for Observabilit...   \n",
       "\n",
       "                                            response  \\\n",
       "0  Large Language Models (LLMs) are integrated in...   \n",
       "1                                      I don't know.   \n",
       "2  Ragas provides specialized metrics for evaluat...   \n",
       "3                                      I don't know.   \n",
       "4  To use the OPENAI API key when initializing an...   \n",
       "5                                      I don't know.   \n",
       "6  Ragas supports the evaluation of LLM applicati...   \n",
       "7  Metric-Driven Development (MDD) utilizes combi...   \n",
       "8  Ragas provides specialized evaluation metrics ...   \n",
       "9  Observability best practices contribute to bui...   \n",
       "\n",
       "                                           reference  context_recall  \\\n",
       "0  Large Language Models (LLMs) are becoming fund...        0.500000   \n",
       "1  Ragas is an open-source evaluation framework d...        0.500000   \n",
       "2  Ragas offers both LLM-based and computational ...        0.000000   \n",
       "3  LangSmith is listed as one of the observabilit...        0.000000   \n",
       "4  To use the OPENAI API key when initializing an...        0.000000   \n",
       "5  Synthetic data generation enables rapid creati...             NaN   \n",
       "6  Ragas supports the evaluation of LLM applicati...             NaN   \n",
       "7  Metric-Driven Development (MDD) utilizes combi...             NaN   \n",
       "8  Ragas is an open-source evaluation framework s...        0.333333   \n",
       "9  Observability best practices, such as defining...        0.000000   \n",
       "\n",
       "   faithfulness  factual_correctness(mode=f1)  answer_relevancy  \\\n",
       "0      0.444444                           NaN          0.964862   \n",
       "1      1.000000                          0.00          0.000000   \n",
       "2           NaN                          0.40          1.000000   \n",
       "3      1.000000                          0.50          0.000000   \n",
       "4           NaN                          0.22          0.994766   \n",
       "5           NaN                          0.00          0.000000   \n",
       "6           NaN                           NaN          0.973433   \n",
       "7      0.800000                           NaN          0.964975   \n",
       "8      0.272727                          0.45          0.947124   \n",
       "9      0.636364                          0.50          1.000000   \n",
       "\n",
       "   context_entity_recall  noise_sensitivity(mode=relevant)  \n",
       "0               0.125000                               NaN  \n",
       "1               0.333333                               NaN  \n",
       "2               0.250000                          0.625000  \n",
       "3               0.250000                          0.250000  \n",
       "4               0.571429                          0.000000  \n",
       "5               0.000000                          0.000000  \n",
       "6               0.300000                          0.416667  \n",
       "7               0.166667                          0.500000  \n",
       "8               0.133333                          0.090909  \n",
       "9               0.000000                          0.100000  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result.to_pandas()"
   ]
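  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The table above contains several `NaN` scores (metrics that could not be computed for a particular sample). As a quick follow-up, we can average the numeric metric columns to see which metrics are weakest overall. This is a minimal sketch, assuming `result` is the `EvaluationResult` returned by the earlier `evaluate()` call; pandas skips `NaN` values when computing the mean, so partially scored samples still contribute."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: aggregate the per-sample Ragas scores into one summary.\n",
    "# Assumes `result` is the EvaluationResult produced by evaluate() above.\n",
    "scores_df = result.to_pandas()\n",
    "\n",
    "# Keep only the numeric metric columns (drops user_input, contexts, etc.).\n",
    "metric_cols = scores_df.select_dtypes(\"number\").columns\n",
    "\n",
    "# Mean of each metric across samples; NaNs are skipped by default.\n",
    "scores_df[metric_cols].mean().sort_values()"
   ]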
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}