File size: 157,552 Bytes
8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e a018353 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e a018353 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e a018353 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e 6f89cef 8825f6e a018353 8825f6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ca8bd0e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Adding package root to sys.path: /home/mafzaal/source/lets-talk/py-src\n",
"Current notebook directory: /home/mafzaal/source/lets-talk/py-src/notebooks\n",
"Project root: /home/mafzaal/source/lets-talk\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Add the package root (the notebook's parent directory) to the Python path\n",
"package_root = os.path.abspath(os.path.join(os.getcwd(), \"../\"))\n",
"print(f\"Adding package root to sys.path: {package_root}\")\n",
"if package_root not in sys.path:\n",
"\tsys.path.append(package_root)\n",
"\n",
"notebook_dir = os.getcwd()\n",
"print(f\"Current notebook directory: {notebook_dir}\")\n",
"# Change the working directory to the root of the project\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), \"../../\"))\n",
"print(f\"Project root: {project_root}\")\n",
"os.chdir(project_root)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b48fa7d4",
"metadata": {},
"outputs": [],
"source": [
"# set LANGCHAIN_TRACING_V2 to false to disable tracing\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"false\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cd3c7329",
"metadata": {},
"outputs": [],
"source": [
"# -*- coding: utf-8 -*-\n",
"import lets_talk.utils.blog as blog\n",
"import lets_talk.utils.eval as eval\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1f9f2076",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'lets_talk.utils.blog' from '/home/mafzaal/source/lets-talk/py-src/lets_talk/utils/blog.py'>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# hot reload the eval and blog modules\n",
"import importlib\n",
"importlib.reload(eval)\n",
"importlib.reload(blog)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cc282d9c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 14/14 [00:00<00:00, 3266.23it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 14 documents from data/\n",
"Split 14 documents into 162 chunks\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"#docs = blog.load_blog_posts()\n",
"#docs = blog.update_document_metadata(docs)\n",
"\n",
"blog_posts = blog.process_blog_posts(create_embeddings=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e768b97b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'total_documents': 162, 'total_characters': 127917, 'min_length': 172, 'max_length': 998, 'avg_length': 789.6111111111111, 'documents': [{'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM 
Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'source': 'data/introduction-to-ragas/index.md', 'title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'text_length': 6994}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 
'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 
'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/generating-test-data-with-ragas/', 'source': 'data/generating-test-data-with-ragas/index.md', 'title': '\"Part 4: Generating Test Data with Ragas\"', 'text_length': 14680}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 
'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and 
Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/', 'source': 'data/advanced-metrics-and-customization-with-ragas/index.md', 'title': '\"Part 5: Advanced Metrics and Customization with Ragas\"', 'text_length': 11530}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 
'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/building-research-agent/', 'source': 'data/building-research-agent/index.md', 'title': 'Building a Research Agent with RSS Feed Support', 'text_length': 7320}, {'url': 'https://thedataguy.pro/blog/rss-feed-announcement/', 'source': 'data/rss-feed-announcement/index.md', 'title': '\"Subscribe to Our Blog via RSS\"', 'text_length': 2139}, {'url': 'https://thedataguy.pro/blog/rss-feed-announcement/', 'source': 'data/rss-feed-announcement/index.md', 'title': '\"Subscribe to Our Blog via RSS\"', 'text_length': 2139}, {'url': 'https://thedataguy.pro/blog/rss-feed-announcement/', 'source': 'data/rss-feed-announcement/index.md', 'title': '\"Subscribe to Our Blog via RSS\"', 'text_length': 2139}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 
'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 
'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/metric-driven-development/', 'source': 'data/metric-driven-development/index.md', 'title': '\"Metric-Driven Development: Make Smarter Decisions, Faster\"', 'text_length': 12450}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 
'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 
'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/', 'source': 'data/basic-evaluation-workflow-with-ragas/index.md', 'title': '\"Part 2: Basic Evaluation Workflow with Ragas\"', 'text_length': 11222}, {'url': 'https://thedataguy.pro/blog/langchain-experience-csharp-perspective/', 'source': 'data/langchain-experience-csharp-perspective/index.md', 'title': \"A C# Programmer's Perspective on LangChain Expression Language\", 'text_length': 3361}, {'url': 'https://thedataguy.pro/blog/langchain-experience-csharp-perspective/', 'source': 'data/langchain-experience-csharp-perspective/index.md', 'title': \"A C# Programmer's Perspective on LangChain Expression Language\", 'text_length': 3361}, {'url': 'https://thedataguy.pro/blog/langchain-experience-csharp-perspective/', 'source': 'data/langchain-experience-csharp-perspective/index.md', 'title': \"A C# Programmer's Perspective on LangChain Expression Language\", 'text_length': 3361}, {'url': 'https://thedataguy.pro/blog/langchain-experience-csharp-perspective/', 'source': 'data/langchain-experience-csharp-perspective/index.md', 'title': \"A C# Programmer's Perspective on LangChain Expression Language\", 'text_length': 3361}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 
'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 
9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/', 'source': 'data/evaluating-ai-agents-with-ragas/index.md', 'title': '\"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"', 'text_length': 9821}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 
'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/integrations-and-observability-with-ragas/', 'source': 'data/integrations-and-observability-with-ragas/index.md', 'title': '\"Part 7: Integrations and Observability with Ragas\"', 'text_length': 9098}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback 
Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 
8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/building-feedback-loops-with-ragas/', 'source': 'data/building-feedback-loops-with-ragas/index.md', 'title': '\"Part 8: Building Feedback Loops with Ragas\"', 'text_length': 8160}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/coming-back-to-ai-roots/', 'source': 'data/coming-back-to-ai-roots/index.md', 'title': 'Coming Back to AI Roots - My 
Professional Journey', 'text_length': 5827}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/data-is-king/', 'source': 'data/data-is-king/index.md', 'title': '\"Data is King: Why Your Data Strategy IS Your Business Strategy\"', 'text_length': 6197}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with 
Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}, {'url': 
'https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/', 'source': 'data/evaluating-rag-systems-with-ragas/index.md', 'title': '\"Part 3: Evaluating RAG Systems with Ragas\"', 'text_length': 8811}]}\n"
]
}
],
"source": [
"print(blog_posts[\"stats\"])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "21d9a2df",
"metadata": {},
"outputs": [],
"source": [
"docs = blog_posts[\"documents\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "33949cc9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': 'data/introduction-to-ragas/index.md', 'url': 'https://thedataguy.pro/blog/introduction-to-ragas/', 'post_slug': 'introduction-to-ragas', 'post_title': '\"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"', 'content_length': 6994}, page_content='---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "03663a91",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f0e20da2b88e472b93e5bde8529eb108",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying SummaryExtractor: 0%| | 0/144 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c8b35bd7bed24f5593565b0169a69348",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying CustomNodeFilter: 0%| | 0/162 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Node 76ad2116-072f-45ff-bb1c-6ffa106832b1 does not have a summary. Skipping filtering.\n",
"Node 10f1ca7c-9bba-42d3-a541-1ea9e7ae053f does not have a summary. Skipping filtering.\n",
"Node e3248992-4a79-4834-b22b-a3d1c4bcc300 does not have a summary. Skipping filtering.\n",
"Node 416280df-e766-4357-9b0e-496d414f4386 does not have a summary. Skipping filtering.\n",
"Node 2ad382ab-85a4-4e83-bf06-9d7f6f404368 does not have a summary. Skipping filtering.\n",
"Node 716820c2-bea4-4ca2-a24e-d3772d4ca4ad does not have a summary. Skipping filtering.\n",
"Node be468d9d-5003-437c-b241-a40413e814f2 does not have a summary. Skipping filtering.\n",
"Node 67bb4686-e17d-48e6-bb21-ed2aad303ad5 does not have a summary. Skipping filtering.\n",
"Node 80f581ab-9140-42a7-b62a-25cfb11f30a7 does not have a summary. Skipping filtering.\n",
"Node c2192eb9-7f0c-4049-b820-c0aab14a0ed4 does not have a summary. Skipping filtering.\n",
"Node 6754a45c-b627-4dca-8cab-328e54bf11b2 does not have a summary. Skipping filtering.\n",
"Node e14a7612-4e54-4b8a-b998-b66fa0dce67b does not have a summary. Skipping filtering.\n",
"Node 20bf3229-08e2-41c7-8e00-bdabb170134e does not have a summary. Skipping filtering.\n",
"Node 390eea13-57ac-4346-bae5-7babeffd265b does not have a summary. Skipping filtering.\n",
"Node 870b2ea9-6d39-42a9-8c59-9d90bfd74a35 does not have a summary. Skipping filtering.\n",
"Node 61fad07e-fdba-43c8-865b-8125a5b0f019 does not have a summary. Skipping filtering.\n",
"Node 4650c5c0-612b-4e52-ae7a-9fc68bc997e5 does not have a summary. Skipping filtering.\n",
"unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29690, Requested 787. Please try again in 954ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
"Node 3007936f-5abb-425d-a730-bb62c4146a04 does not have a summary. Skipping filtering.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c78c2a6672a94c859c64c32d51784572",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]: 0%| | 0/468 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29958, Requested 524. Please try again in 964ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
"unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29826, Requested 525. Please try again in 702ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
"unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29953, Requested 534. Please try again in 973ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
"unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29518, Requested 519. Please try again in 74ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3cb4e2df16384f5e8a8384d77d1a23c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying [CosineSimilarityBuilder, OverlapScoreBuilder]: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"unable to apply transformation: Node d26ac341-a574-4587-810b-94f50a48ed11 or 5050d4cc-68f8-40b4-923c-137c3c77b0c3 has no entities\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "63e0701766b04b4bb2383ef0b54b1692",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating personas: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a1e6e0f7932a459d8071b3269854821e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating Scenarios: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "72f1062d21c64a5caabab276d99f8c4e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating Samples: 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from lets_talk.config import EMBEDDING_MODEL,SDG_LLM_MODLEL,EVAL_LLM_MODEL\n",
"testset = eval.generate_testset(docs=docs,llm_model = SDG_LLM_MODLEL, embedding_model = EMBEDDING_MODEL,testset_size=10)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "cad859be",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "user_input",
"rawType": "object",
"type": "string"
},
{
"name": "reference_contexts",
"rawType": "object",
"type": "unknown"
},
{
"name": "reference",
"rawType": "object",
"type": "string"
},
{
"name": "synthesizer_name",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "fcd0dccd-8b42-4bf0-a20d-f3935815a6d5",
"rows": [
[
"0",
"How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
"['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
"Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
"single_hop_specifc_query_synthesizer"
],
[
"1",
"Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
"[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
"Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
"single_hop_specifc_query_synthesizer"
],
[
"2",
"What specialized metrics does Ragas provide for evaluating LLM applications?",
"[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
"Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
"single_hop_specifc_query_synthesizer"
],
[
"3",
"me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
"[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
"LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
"single_hop_specifc_query_synthesizer"
],
[
"4",
"How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
"['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n \"user_input\": \"What is the capital of France?\",\\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor']",
"To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
"single_hop_specifc_query_synthesizer"
],
[
"5",
"How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
"['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration** \\n Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity** \\n You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
"Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
"multi_hop_abstract_query_synthesizer"
],
[
"6",
"How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
"['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:** \\n> Explore the hands-on notebook for this workflow: \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI 
agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\" \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
"Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
"multi_hop_abstract_query_synthesizer"
],
[
"7",
"How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
"['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1. **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2. **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3. **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n* **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n * **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n* **Autonomous Vehicles: Safety & Comfort Score**\\n * **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n * **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
"Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
"multi_hop_abstract_query_synthesizer"
],
[
"8",
"How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
"[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n question: str\\n context: str\\n response: str\\n programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n score: float\\n feedback: str']",
"Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
"multi_hop_abstract_query_synthesizer"
],
[
"9",
"How do observability best practices contribute to building production-ready AI systems?",
"['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
"Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
"multi_hop_abstract_query_synthesizer"
]
],
"shape": {
"columns": 4,
"rows": 10
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_input</th>\n",
" <th>reference_contexts</th>\n",
" <th>reference</th>\n",
" <th>synthesizer_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How are Large Language Models integrated into ...</td>\n",
" <td>[---\\ntitle: \"Part 1: Introduction to Ragas: T...</td>\n",
" <td>Large Language Models (LLMs) are becoming fund...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Howw does Ragas help evalute LLM applikations ...</td>\n",
" <td>[## What is Ragas?\\n\\n[Ragas](https://docs.rag...</td>\n",
" <td>Ragas is an open-source evaluation framework d...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What specialized metrics does Ragas provide fo...</td>\n",
" <td>[Evaluation serves several key purposes:\\n- **...</td>\n",
" <td>Ragas offers both LLM-based and computational ...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>me wanna know how LangSmith work with Ragas, l...</td>\n",
" <td>[### 🧪 Test Data Generation\\nCreating high-qua...</td>\n",
" <td>LangSmith is listed as one of the observabilit...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>How do I use the OPENAI API key when initializ...</td>\n",
" <td>[## Getting Started with Ragas\\n\\nInstalling R...</td>\n",
" <td>To use the OPENAI API key when initializing an...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>How does synthetic data generation contribute ...</td>\n",
" <td>[<1-hop>\\n\\n## Why and How to Generate Synthet...</td>\n",
" <td>Synthetic data generation enables rapid creati...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>How does Ragas support the evaluation of both ...</td>\n",
" <td>[<1-hop>\\n\\n# Create a sample\\nsample = Single...</td>\n",
" <td>Ragas supports the evaluation of LLM applicati...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>How does Metric-Driven Development (MDD) utili...</td>\n",
" <td>[<1-hop>\\n\\n## What Exactly is Metric-Driven D...</td>\n",
" <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>How does Ragas provide specialized evaluation ...</td>\n",
" <td>[<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https...</td>\n",
" <td>Ragas is an open-source evaluation framework s...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>How do observability best practices contribute...</td>\n",
" <td>[<1-hop>\\n\\n## Best Practices for Observabilit...</td>\n",
" <td>Observability best practices, such as defining...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_input \\\n",
"0 How are Large Language Models integrated into ... \n",
"1 Howw does Ragas help evalute LLM applikations ... \n",
"2 What specialized metrics does Ragas provide fo... \n",
"3 me wanna know how LangSmith work with Ragas, l... \n",
"4 How do I use the OPENAI API key when initializ... \n",
"5 How does synthetic data generation contribute ... \n",
"6 How does Ragas support the evaluation of both ... \n",
"7 How does Metric-Driven Development (MDD) utili... \n",
"8 How does Ragas provide specialized evaluation ... \n",
"9 How do observability best practices contribute... \n",
"\n",
" reference_contexts \\\n",
"0 [---\\ntitle: \"Part 1: Introduction to Ragas: T... \n",
"1 [## What is Ragas?\\n\\n[Ragas](https://docs.rag... \n",
"2 [Evaluation serves several key purposes:\\n- **... \n",
"3 [### 🧪 Test Data Generation\\nCreating high-qua... \n",
"4 [## Getting Started with Ragas\\n\\nInstalling R... \n",
"5 [<1-hop>\\n\\n## Why and How to Generate Synthet... \n",
"6 [<1-hop>\\n\\n# Create a sample\\nsample = Single... \n",
"7 [<1-hop>\\n\\n## What Exactly is Metric-Driven D... \n",
"8 [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https... \n",
"9 [<1-hop>\\n\\n## Best Practices for Observabilit... \n",
"\n",
" reference \\\n",
"0 Large Language Models (LLMs) are becoming fund... \n",
"1 Ragas is an open-source evaluation framework d... \n",
"2 Ragas offers both LLM-based and computational ... \n",
"3 LangSmith is listed as one of the observabilit... \n",
"4 To use the OPENAI API key when initializing an... \n",
"5 Synthetic data generation enables rapid creati... \n",
"6 Ragas supports the evaluation of LLM applicati... \n",
"7 Metric-Driven Development (MDD) utilizes combi... \n",
"8 Ragas is an open-source evaluation framework s... \n",
"9 Observability best practices, such as defining... \n",
"\n",
" synthesizer_name \n",
"0 single_hop_specifc_query_synthesizer \n",
"1 single_hop_specifc_query_synthesizer \n",
"2 single_hop_specifc_query_synthesizer \n",
"3 single_hop_specifc_query_synthesizer \n",
"4 single_hop_specifc_query_synthesizer \n",
"5 multi_hop_abstract_query_synthesizer \n",
"6 multi_hop_abstract_query_synthesizer \n",
"7 multi_hop_abstract_query_synthesizer \n",
"8 multi_hop_abstract_query_synthesizer \n",
"9 multi_hop_abstract_query_synthesizer "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = testset.to_pandas()\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "4ae903d8",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Write the testset DataFrame to CSV (row index omitted).\n",
"# Create the target directory first: DataFrame.to_csv raises OSError\n",
"# when the parent directory does not exist.\n",
"testset_csv = Path(\"evals/testset_2.csv\")\n",
"testset_csv.parent.mkdir(parents=True, exist_ok=True)\n",
"df.to_csv(testset_csv, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "86ab0d3b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded vector store from ./db/vector_store_5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:31<00:00, 3.12s/it]\n"
]
}
],
"source": [
"from lets_talk.rag import rag_chain\n",
"\n",
"# Run the RAG chain over the generated testset; the result is kept in `evalset`.\n",
"# NOTE(review): `eval` is a name bound in an earlier cell and shadows the builtin - confirm.\n",
"evalset = eval.run_rag_chain(testset, rag_chain)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "704669a4",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "user_input",
"rawType": "object",
"type": "string"
},
{
"name": "retrieved_contexts",
"rawType": "object",
"type": "unknown"
},
{
"name": "reference_contexts",
"rawType": "object",
"type": "unknown"
},
{
"name": "response",
"rawType": "object",
"type": "string"
},
{
"name": "reference",
"rawType": "object",
"type": "string"
},
{
"name": "synthesizer_name",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "492e8746-11d6-4694-8db9-159cbebf395c",
"rows": [
[
"0",
"How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
"['| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', '| **Metric** | **Comprehensive RAG Evaluation** | **Content Quality Evaluation** | **Retrieval Quality Evaluation** |\\n|-----------------------------|----------------------------------|---------------------------------|-----------------------------------|\\n| **Faithfulness** | ✓ | ✓ | |\\n| **Answer Relevancy** | ✓ | ✓ | |\\n| **Context Recall** | ✓ | | ✓ |\\n| **Context Precision** | ✓ | | ✓ |\\n| **Harmfulness** | | ✓ | |', \"**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe'll explore each metric in detail, explaining when and how to use them effectively.\\n\\n**[Part 3: Evaluating RAG Systems](/blog/evaluating-rag-systems-with-ragas/)** \\nLearn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\nDiscover how to create high-quality test datasets that thoroughly exercise your application's capabilities.\\n\\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\nGo beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\nLearn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\", \"What excites me most is that my experience in enterprise software development gives me a unique perspective on AI implementation. I understand not just the algorithms and models, but also how to integrate them into robust, production-ready systems that deliver real value.\\n\\n## The Best of Both Worlds\\n\\nComing back to AI doesn't mean leaving behind everything I learned in web and enterprise development. 
Quite the opposite - I believe my background gives me a particular advantage in building AI systems that are:\"]",
"['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
"I don't know.",
"Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
"single_hop_specifc_query_synthesizer"
],
[
"1",
"Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
"[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: 
The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\"]",
"[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
"I don't know.",
"Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
"single_hop_specifc_query_synthesizer"
],
[
"2",
"What specialized metrics does Ragas provide for evaluating LLM applications?",
"[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '---\\n*This is the eighth part of a series on Ragas, a research-driven evaluation framework for LLM and RAG systems. If you missed the previous parts, check them out below:*\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**Part 8: Building Feedback Loops — _You are here_**', \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation 
Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\"]",
"[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
"I don't know.",
"Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
"single_hop_specifc_query_synthesizer"
],
[
"3",
"me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
"[\"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: 
Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\"]",
"[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
"I don't know.",
"LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
"single_hop_specifc_query_synthesizer"
],
[
"4",
"How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
"[\"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using 
Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n \"user_input\": \"What is the capital of France?\",\\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor', '## Implementing Agent Evaluation in Practice\\n\\nLet\\'s look at a practical example of evaluating an AI agent using these metrics:\\n\\n```python\\nfrom ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import MultiTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\n\\n# Initialize the LLM\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))']",
"['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n \"user_input\": \"What is the capital of France?\",\\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor']",
"To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you can follow this example:\n\n```python\nfrom langchain_openai import ChatOpenAI\nfrom ragas.llms import LangchainLLMWrapper\n\n# Initialize the LLM with your OPENAI API key\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\n```\n\nMake sure to replace `\"gpt-4o\"` with the appropriate model you wish to use. You will need to set your API key in your environment or configuration as required by the `ChatOpenAI` class.\n\nFor more details, you can refer to the following links:\n- [Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Part 5: Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
"To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
"single_hop_specifc_query_synthesizer"
],
[
"5",
"How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
"[\"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '| **Metric** | **Comprehensive RAG Evaluation** | **Content Quality Evaluation** | **Retrieval Quality Evaluation** |\\n|-----------------------------|----------------------------------|---------------------------------|-----------------------------------|\\n| **Faithfulness** | ✓ | ✓ | |\\n| **Answer Relevancy** | ✓ | ✓ | |\\n| **Context Recall** | ✓ | | ✓ |\\n| **Context Precision** | ✓ | | ✓ |\\n| **Harmfulness** | | ✓ | |', '| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', \"**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe'll explore each metric in detail, explaining when and how to use them effectively.\\n\\n**[Part 3: Evaluating RAG Systems](/blog/evaluating-rag-systems-with-ragas/)** \\nLearn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\nDiscover how to create high-quality test datasets that 
thoroughly exercise your application's capabilities.\\n\\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\nGo beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\nLearn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\"]",
"['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration** \\n Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity** \\n You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
"I don't know.",
"Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
"multi_hop_abstract_query_synthesizer"
],
[
"6",
"How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
"[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential 
Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '---\\n*This is the eighth part of a series on Ragas, a research-driven evaluation framework for LLM and RAG systems. If you missed the previous parts, check them out below:*\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**Part 8: Building Feedback Loops — _You are here_**']",
"['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:** \\n> Explore the hands-on notebook for this workflow: \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI 
agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\" \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
"Ragas is an evaluation framework designed to support both LLM (Large Language Model) applications and AI agents by providing a structured approach to assess their performance. It introduces specialized metrics tailored for evaluating AI agents, focusing on aspects beyond simple answer accuracy, such as contextual understanding and interaction quality.\n\nFor more detailed insights, you can explore the following parts of the Ragas series:\n\n- **[Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)**\n- **[Part 2: Basic Evaluation Workflow](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)**\n- **[Part 3: Evaluating RAG Systems with Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)**\n- **[Part 5: Advanced Metrics and Customization](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)**\n- **[Part 6: Evaluating AI Agents](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)**\n\nThese resources will provide a comprehensive understanding of how Ragas enhances evaluation processes for both LLMs and AI agents.",
"Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
"multi_hop_abstract_query_synthesizer"
],
[
"7",
"How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
"[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '* **Team A** is stuck in debates. Should they focus on improving video load speed or making the recommendation engine more accurate? One engineer insists, \"Faster videos keep users from leaving!\" Another counters, \"But better recommendations are what make them subscribe!\" They argue based on gut feelings.\\n* **Team B** operates differently. They have a clear, agreed-upon goal: ***Improve the average \"Watch Time per User\" metric, while ensuring video buffering times stay below 2 seconds.*** They rapidly test ideas, measuring the impact of each change against this specific target.\\n\\nWhich team do you think will make faster, smarter progress?\\n\\n\\nTeam B has the edge because they\\'re using **Metric-Driven Development (MDD)**. This is a powerful strategy where teams unite around measurable goals to eliminate guesswork and make real strides. Let\\'s break down how it works, what makes a metric truly useful, and see how industries from healthcare to e-commerce use it to succeed.', '| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', 'MDD reminds us to track both:\\n* **Leading indicators** (like training loss, code coverage) to monitor progress during development.\\n* **Lagging indicators** (like user engagement, revenue, customer support tickets) to measure the actual impact.\\n\\n## The Takeaway: Use Metrics as Your Compass\\nMetric-Driven Development isn\\'t a complex theory reserved for tech giants. 
It\\'s a fundamental mindset applicable everywhere:\\n\\n* A local bakery might track *\"Daily Units Sold per Pastry Type\"* to optimize baking schedules.\\n* A city planner could use *\"Average Commute Time Reduction\"* to evaluate the success of new traffic light patterns.\\n* A project manager might measure progress through *\"Sprint Velocity\"* or *\"Percentage of On-Time Task Completions\"* rather than subjective assessments of how \"busy\" the team appears.\\n\\n\\nBy choosing metrics that are **measurable, actionable, aligned, simple, and robust**, you transform ambiguity into clarity and opinion into evidence.']",
"['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1. **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2. **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3. **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n* **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n * **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n* **Autonomous Vehicles: Safety & Comfort Score**\\n * **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n * **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
"Metric-Driven Development (MDD) utilizes combined performance metrics to provide clear, measurable goals that guide project outcomes. By focusing on specific metrics, teams can eliminate guesswork and make informed decisions based on data rather than opinions. \n\n### Key Aspects of MDD:\n- **Leading Indicators**: Metrics like training loss and code coverage help monitor progress during development.\n- **Lagging Indicators**: Metrics such as user engagement, revenue, and customer support tickets measure the actual impact of changes made.\n\n### Examples of Metrics in Practice:\n- A bakery might track **\"Daily Units Sold per Pastry Type\"** to optimize baking schedules.\n- A city planner could use **\"Average Commute Time Reduction\"** to evaluate the success of new traffic light patterns.\n- A project manager might measure progress through **\"Sprint Velocity\"** or **\"Percentage of On-Time Task Completions\"**.\n\nThese metrics transform ambiguity into clarity, allowing teams to focus on actionable outcomes.\n\nFor more information, you can explore the following links:\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)",
"Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
"multi_hop_abstract_query_synthesizer"
],
[
"8",
"How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
"[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"In our next post, we'll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**Part 2: Basic Evaluation Workflow — _You are here_** \\n*Next up in the series:* \\n**[Part 3: Evaluating RAG Systems](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI 
Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\"]",
"[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n question: str\\n context: str\\n response: str\\n programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n score: float\\n feedback: str']",
"Ragas provides specialized evaluation metrics for LLMs by allowing users to extend its framework with custom metrics tailored to specific evaluation needs. This enables the measurement of aspects that are most relevant for particular applications, such as technical accuracy in programming explanations.\n\nTo create a custom metric for assessing technical accuracy, the following steps are typically involved:\n\n1. **Define the Evaluation Criteria**: Identify what constitutes technical accuracy in the context of programming explanations.\n2. **Develop the Metric**: Create a formula or algorithm that quantifies the defined criteria.\n3. **Integrate with Ragas**: Implement the custom metric within the Ragas framework to ensure it can be used in evaluations.\n4. **Test and Validate**: Run evaluations using the custom metric to ensure it accurately reflects the technical accuracy of programming explanations.\n\nFor more detailed information, you can explore the following links:\n\n- [Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Part 2: Basic Evaluation Workflow](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)\n- [Part 3: Evaluating RAG Systems](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Part 5: Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Part 6: Evaluating AI Agents](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
"Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
"multi_hop_abstract_query_synthesizer"
],
[
"9",
"How do observability best practices contribute to building production-ready AI systems?",
"[\"What excites me most is that my experience in enterprise software development gives me a unique perspective on AI implementation. I understand not just the algorithms and models, but also how to integrate them into robust, production-ready systems that deliver real value.\\n\\n## The Best of Both Worlds\\n\\nComing back to AI doesn't mean leaving behind everything I learned in web and enterprise development. Quite the opposite - I believe my background gives me a particular advantage in building AI systems that are:\", '| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', \"- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. 
In the coming months, I plan to write about:\", '| **Metric** | **Comprehensive RAG Evaluation** | **Content Quality Evaluation** | **Retrieval Quality Evaluation** |\\n|-----------------------------|----------------------------------|---------------------------------|-----------------------------------|\\n| **Faithfulness** | ✓ | ✓ | |\\n| **Answer Relevancy** | ✓ | ✓ | |\\n| **Context Recall** | ✓ | | ✓ |\\n| **Context Precision** | ✓ | | ✓ |\\n| **Harmfulness** | | ✓ | |']",
"['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
"I don't know.",
"Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
"multi_hop_abstract_query_synthesizer"
]
],
"shape": {
"columns": 6,
"rows": 10
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_input</th>\n",
" <th>retrieved_contexts</th>\n",
" <th>reference_contexts</th>\n",
" <th>response</th>\n",
" <th>reference</th>\n",
" <th>synthesizer_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How are Large Language Models integrated into ...</td>\n",
" <td>[| **Harmfulness** | ...</td>\n",
" <td>[---\\ntitle: \"Part 1: Introduction to Ragas: T...</td>\n",
" <td>I don't know.</td>\n",
" <td>Large Language Models (LLMs) are becoming fund...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Howw does Ragas help evalute LLM applikations ...</td>\n",
" <td>[In our next post, we'll explore advanced metr...</td>\n",
" <td>[## What is Ragas?\\n\\n[Ragas](https://docs.rag...</td>\n",
" <td>I don't know.</td>\n",
" <td>Ragas is an open-source evaluation framework d...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What specialized metrics does Ragas provide fo...</td>\n",
" <td>[In our next post, we'll explore advanced metr...</td>\n",
" <td>[Evaluation serves several key purposes:\\n- **...</td>\n",
" <td>I don't know.</td>\n",
" <td>Ragas offers both LLM-based and computational ...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>me wanna know how LangSmith work with Ragas, l...</td>\n",
" <td>[In our next post, we'll explore how to integr...</td>\n",
" <td>[### 🧪 Test Data Generation\\nCreating high-qua...</td>\n",
" <td>I don't know.</td>\n",
" <td>LangSmith is listed as one of the observabilit...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>How do I use the OPENAI API key when initializ...</td>\n",
" <td>[In our next post, we'll explore how to integr...</td>\n",
" <td>[## Getting Started with Ragas\\n\\nInstalling R...</td>\n",
" <td>To use the OPENAI API key when initializing an...</td>\n",
" <td>To use the OPENAI API key when initializing an...</td>\n",
" <td>single_hop_specifc_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>How does synthetic data generation contribute ...</td>\n",
" <td>[In our next post, we'll explore how to genera...</td>\n",
" <td>[<1-hop>\\n\\n## Why and How to Generate Synthet...</td>\n",
" <td>I don't know.</td>\n",
" <td>Synthetic data generation enables rapid creati...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>How does Ragas support the evaluation of both ...</td>\n",
" <td>[In our next post, we'll explore advanced metr...</td>\n",
" <td>[<1-hop>\\n\\n# Create a sample\\nsample = Single...</td>\n",
" <td>Ragas is an evaluation framework designed to s...</td>\n",
" <td>Ragas supports the evaluation of LLM applicati...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>How does Metric-Driven Development (MDD) utili...</td>\n",
" <td>[Reward functions embody the core MDD idea: se...</td>\n",
" <td>[<1-hop>\\n\\n## What Exactly is Metric-Driven D...</td>\n",
" <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
" <td>Metric-Driven Development (MDD) utilizes combi...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>How does Ragas provide specialized evaluation ...</td>\n",
" <td>[In our next post, we'll explore advanced metr...</td>\n",
" <td>[<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https...</td>\n",
" <td>Ragas provides specialized evaluation metrics ...</td>\n",
" <td>Ragas is an open-source evaluation framework s...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>How do observability best practices contribute...</td>\n",
" <td>[What excites me most is that my experience in...</td>\n",
" <td>[<1-hop>\\n\\n## Best Practices for Observabilit...</td>\n",
" <td>I don't know.</td>\n",
" <td>Observability best practices, such as defining...</td>\n",
" <td>multi_hop_abstract_query_synthesizer</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_input \\\n",
"0 How are Large Language Models integrated into ... \n",
"1 Howw does Ragas help evalute LLM applikations ... \n",
"2 What specialized metrics does Ragas provide fo... \n",
"3 me wanna know how LangSmith work with Ragas, l... \n",
"4 How do I use the OPENAI API key when initializ... \n",
"5 How does synthetic data generation contribute ... \n",
"6 How does Ragas support the evaluation of both ... \n",
"7 How does Metric-Driven Development (MDD) utili... \n",
"8 How does Ragas provide specialized evaluation ... \n",
"9 How do observability best practices contribute... \n",
"\n",
" retrieved_contexts \\\n",
"0 [| **Harmfulness** | ... \n",
"1 [In our next post, we'll explore advanced metr... \n",
"2 [In our next post, we'll explore advanced metr... \n",
"3 [In our next post, we'll explore how to integr... \n",
"4 [In our next post, we'll explore how to integr... \n",
"5 [In our next post, we'll explore how to genera... \n",
"6 [In our next post, we'll explore advanced metr... \n",
"7 [Reward functions embody the core MDD idea: se... \n",
"8 [In our next post, we'll explore advanced metr... \n",
"9 [What excites me most is that my experience in... \n",
"\n",
" reference_contexts \\\n",
"0 [---\\ntitle: \"Part 1: Introduction to Ragas: T... \n",
"1 [## What is Ragas?\\n\\n[Ragas](https://docs.rag... \n",
"2 [Evaluation serves several key purposes:\\n- **... \n",
"3 [### 🧪 Test Data Generation\\nCreating high-qua... \n",
"4 [## Getting Started with Ragas\\n\\nInstalling R... \n",
"5 [<1-hop>\\n\\n## Why and How to Generate Synthet... \n",
"6 [<1-hop>\\n\\n# Create a sample\\nsample = Single... \n",
"7 [<1-hop>\\n\\n## What Exactly is Metric-Driven D... \n",
"8 [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https... \n",
"9 [<1-hop>\\n\\n## Best Practices for Observabilit... \n",
"\n",
" response \\\n",
"0 I don't know. \n",
"1 I don't know. \n",
"2 I don't know. \n",
"3 I don't know. \n",
"4 To use the OPENAI API key when initializing an... \n",
"5 I don't know. \n",
"6 Ragas is an evaluation framework designed to s... \n",
"7 Metric-Driven Development (MDD) utilizes combi... \n",
"8 Ragas provides specialized evaluation metrics ... \n",
"9 I don't know. \n",
"\n",
" reference \\\n",
"0 Large Language Models (LLMs) are becoming fund... \n",
"1 Ragas is an open-source evaluation framework d... \n",
"2 Ragas offers both LLM-based and computational ... \n",
"3 LangSmith is listed as one of the observabilit... \n",
"4 To use the OPENAI API key when initializing an... \n",
"5 Synthetic data generation enables rapid creati... \n",
"6 Ragas supports the evaluation of LLM applicati... \n",
"7 Metric-Driven Development (MDD) utilizes combi... \n",
"8 Ragas is an open-source evaluation framework s... \n",
"9 Observability best practices, such as defining... \n",
"\n",
" synthesizer_name \n",
"0 single_hop_specifc_query_synthesizer \n",
"1 single_hop_specifc_query_synthesizer \n",
"2 single_hop_specifc_query_synthesizer \n",
"3 single_hop_specifc_query_synthesizer \n",
"4 single_hop_specifc_query_synthesizer \n",
"5 multi_hop_abstract_query_synthesizer \n",
"6 multi_hop_abstract_query_synthesizer \n",
"7 multi_hop_abstract_query_synthesizer \n",
"8 multi_hop_abstract_query_synthesizer \n",
"9 multi_hop_abstract_query_synthesizer "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eval_df = evalset.to_pandas()\n",
"eval_df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f5d50d7b",
"metadata": {},
"outputs": [],
"source": [
"# Persist the evaluation DataFrame to CSV; index=False omits the pandas row index column.\n",
"eval_df.to_csv(\"evals/rag_eval_2.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fb7d4a45",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8960aa00f8a94925938bb108e127cf12",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29843, Requested 1360. Please try again in 2.406s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29165, Requested 1477. Please try again in 1.284s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 30000, Requested 1733. Please try again in 3.466s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 30000, Requested 1837. Please try again in 3.674s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29338, Requested 1730. Please try again in 2.136s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29225, Requested 2003. Please try again in 2.456s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[34]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29778, Requested 1498. Please try again in 2.552s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29207, Requested 1958. Please try again in 2.33s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29319, Requested 2005. Please try again in 2.648s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[40]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29964, Requested 1557. Please try again in 3.042s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29893, Requested 2010. Please try again in 3.805s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
"Exception raised in Job[43]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29579, Requested 1876. Please try again in 2.91s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
]
}
],
"source": [
"# Run eval.run_ragas_evaluation over the evaluation set and keep the result for later cells.\n",
"# NOTE(review): the stderr recorded for this cell shows 12 of the 60 jobs failing with\n",
"# OpenAI 429 RateLimitError (30000 TPM limit on gpt-4.1), so the reported scores are\n",
"# presumably computed from incomplete results - confirm, then rerun with lower\n",
"# concurrency or retry/backoff before relying on the numbers.\n",
"result = eval.run_ragas_evaluation(evalset)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "74aab82a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'context_recall': 0.1905, 'faithfulness': 0.8545, 'factual_correctness(mode=f1)': 0.2490, 'answer_relevancy': 0.3892, 'context_entity_recall': 0.1503, 'noise_sensitivity(mode=relevant)': 0.2540}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the evaluation result (one aggregate score per metric, as shown in the cell output).\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "49fa29f2",
"metadata": {},
"outputs": [],
"source": [
"# Convert the evaluation result to a DataFrame and save it as CSV without the index column.\n",
"result.to_pandas().to_csv(\"evals/rag_eval_result_2.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}