diff --git "a/py-src/notebooks/05_SDG_Eval.ipynb" "b/py-src/notebooks/05_SDG_Eval.ipynb"
--- "a/py-src/notebooks/05_SDG_Eval.ipynb"
+++ "b/py-src/notebooks/05_SDG_Eval.ipynb"
@@ -168,10 +168,151 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "03663a91",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f0e20da2b88e472b93e5bde8529eb108",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Applying SummaryExtractor: 0%| | 0/144 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c8b35bd7bed24f5593565b0169a69348",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Applying CustomNodeFilter: 0%| | 0/162 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Node 76ad2116-072f-45ff-bb1c-6ffa106832b1 does not have a summary. Skipping filtering.\n",
+ "Node 10f1ca7c-9bba-42d3-a541-1ea9e7ae053f does not have a summary. Skipping filtering.\n",
+ "Node e3248992-4a79-4834-b22b-a3d1c4bcc300 does not have a summary. Skipping filtering.\n",
+ "Node 416280df-e766-4357-9b0e-496d414f4386 does not have a summary. Skipping filtering.\n",
+ "Node 2ad382ab-85a4-4e83-bf06-9d7f6f404368 does not have a summary. Skipping filtering.\n",
+ "Node 716820c2-bea4-4ca2-a24e-d3772d4ca4ad does not have a summary. Skipping filtering.\n",
+ "Node be468d9d-5003-437c-b241-a40413e814f2 does not have a summary. Skipping filtering.\n",
+ "Node 67bb4686-e17d-48e6-bb21-ed2aad303ad5 does not have a summary. Skipping filtering.\n",
+ "Node 80f581ab-9140-42a7-b62a-25cfb11f30a7 does not have a summary. Skipping filtering.\n",
+ "Node c2192eb9-7f0c-4049-b820-c0aab14a0ed4 does not have a summary. Skipping filtering.\n",
+ "Node 6754a45c-b627-4dca-8cab-328e54bf11b2 does not have a summary. Skipping filtering.\n",
+ "Node e14a7612-4e54-4b8a-b998-b66fa0dce67b does not have a summary. Skipping filtering.\n",
+ "Node 20bf3229-08e2-41c7-8e00-bdabb170134e does not have a summary. Skipping filtering.\n",
+ "Node 390eea13-57ac-4346-bae5-7babeffd265b does not have a summary. Skipping filtering.\n",
+ "Node 870b2ea9-6d39-42a9-8c59-9d90bfd74a35 does not have a summary. Skipping filtering.\n",
+ "Node 61fad07e-fdba-43c8-865b-8125a5b0f019 does not have a summary. Skipping filtering.\n",
+ "Node 4650c5c0-612b-4e52-ae7a-9fc68bc997e5 does not have a summary. Skipping filtering.\n",
+ "unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29690, Requested 787. Please try again in 954ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
+ "Node 3007936f-5abb-425d-a730-bb62c4146a04 does not have a summary. Skipping filtering.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c78c2a6672a94c859c64c32d51784572",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]: 0%| | 0/468 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29958, Requested 524. Please try again in 964ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
+ "unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29826, Requested 525. Please try again in 702ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
+ "unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29953, Requested 534. Please try again in 973ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n",
+ "unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29518, Requested 519. Please try again in 74ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3cb4e2df16384f5e8a8384d77d1a23c5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Applying [CosineSimilarityBuilder, OverlapScoreBuilder]: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "unable to apply transformation: Node d26ac341-a574-4587-810b-94f50a48ed11 or 5050d4cc-68f8-40b4-923c-137c3c77b0c3 has no entities\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "63e0701766b04b4bb2383ef0b54b1692",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating personas: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a1e6e0f7932a459d8071b3269854821e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating Scenarios: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "72f1062d21c64a5caabab276d99f8c4e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating Samples: 0%| | 0/10 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"from lets_talk.config import EMBEDDING_MODEL,SDG_LLM_MODLEL,EVAL_LLM_MODEL\n",
"testset = eval.generate_testset(docs=docs,llm_model = SDG_LLM_MODLEL, embedding_model = EMBEDDING_MODEL,testset_size=10)"
@@ -179,7 +320,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 7,
"id": "cad859be",
"metadata": {},
"outputs": [
@@ -214,96 +355,82 @@
}
],
"conversionMethod": "pd.DataFrame",
- "ref": "63bbcbb3-e161-4f6f-84e2-2cbbd5548516",
+ "ref": "fcd0dccd-8b42-4bf0-a20d-f3935815a6d5",
"rows": [
[
"0",
- "Does Ragas support integration with Langfuse?",
- "['title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true As Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in. What is Ragas? Ragas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems. At its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations? Why Evaluate LLM Applications? LLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable. Evaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application Key Features of Ragas 🎯 Specialized Metrics Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications: Faithfulness: Measures if the response is factually consistent with the retrieved context Context Relevancy: Evaluates if the retrieved information is relevant to the query Answer Relevancy: Assesses if the response addresses the user\\'s question Topic Adherence: Gauges how well multi-turn conversations stay on topic 🧪 Test Data Generation Creating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage. 🔗 Seamless Integrations Ragas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI Observability platforms - Phoenix - LangSmith - Langfuse 📊 Comprehensive Analysis Beyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements. 
Getting Started with Ragas Installing Ragas is straightforward: bash uv init && uv add ragas Here\\'s a simple example of evaluating a response using Ragas: ```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI Initialize the LLM, you are going to new OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Your evaluation data test_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor Create metric faithfulness = Faithfulness(llm=evaluator_llm) Calculate the score result = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ``` 💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas']",
- "Yes, Ragas works with observability platforms such as Langfuse.",
+ "How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
+ "['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
+ "Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
"single_hop_specifc_query_synthesizer"
],
[
"1",
- "what Part 8: Building Feedback Loops do for LLM app devs?",
- "[\"What's Coming in This Blog Series This introduction is just the beginning. In the upcoming posts, we'll dive deeper into all aspects of evaluating LLM applications with Ragas: Part 2: Basic Evaluation Workflow We'll explore each metric in detail, explaining when and how to use them effectively. Part 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance. Part 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application's capabilities. Part 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments. Part 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. Part 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows. Part 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications. Conclusion In a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications. Ready to Elevate Your LLM Applications? Start exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to reach out—we'd love to help!\"]",
- "Part 8: Building Feedback Loops show how to implement feedback loops that drive continuous improvement in LLM applications and how to turn evaluation insights into concrete improvements for LLM applications.",
+ "Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
+ "[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
+ "Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
"single_hop_specifc_query_synthesizer"
],
[
"2",
- "How does Ragas assist with Evaluation of RAG systems?",
- "['title: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and']",
- "Ragas helps generate robust test datasets for evaluating Retrieval-Augmented Generation systems, including document-based, domain-specific, and adversarial test generation techniques.",
+ "What specialized metrics does Ragas provide for evaluating LLM applications?",
+ "[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
+ "Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
"single_hop_specifc_query_synthesizer"
],
[
"3",
- "Wut is OpenAIEmbeddings used for?",
- "['How to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. 
Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation']",
- "OpenAIEmbeddings is used as an embedding model in the synthetic data generation process for RAG evaluation, as shown when initializing the generator with LangchainEmbeddingsWrapper(OpenAIEmbeddings()).",
+ "me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
+ "[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
+ "LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
"single_hop_specifc_query_synthesizer"
],
[
"4",
- "Wht are the key steps in the Ragas evalution workflow for RAG systems, and wich specialized evalution metrics can be selected to asess system performance?",
- "['<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true In our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline. Understanding the Evaluation Workflow A typical Ragas evaluation workflow consists of four key steps: Prepare your data: Collect queries, contexts, responses, and reference answers Select appropriate metrics: Choose metrics that align with what you want to evaluate Run the evaluation: Process your data through the selected metrics Analyze the results: Interpret scores and identify areas for improvement Let\\'s walk through each step with practical examples. Step 1: Setting Up Your Environment First, ensure you have Ragas installed: bash uv add ragas Next, import the necessary components: python import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity Step 2: Preparing Your Evaluation Data For a RAG system evaluation, you\\'ll need: Questions: User queries to your system Contexts: Documents or chunks retrieved by your system Responses: Answers generated by your system Ground truth (optional): Reference answers or documents for comparison Here\\'s how to organize this data: ```python Sample data data = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] } eval_data = pd.DataFrame(data) Convert to a format Ragas can use evaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset ``` Step 3: Selecting and Configuring Metrics Ragas offers various metrics to evaluate different aspects of your system: Core RAG Metrics: Faithfulness: Measures if the response is factually consistent with the provided context. Factual Correctness: Assesses if the response is accurate and free from factual errors. Response Relevancy: Evaluates if the response directly addresses the user query. Context Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth. Noise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context. LLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response. For metrics that require an LLM (like faithfulness), you need to configure the LLM provider: ```python Configure LLM for evaluation from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper Initialize the LLM, you are going to OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Define metrics to use metrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```', '<2-hop>\\n\\ntitle: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and']",
- "The key steps in the Ragas evaluation workflow for RAG systems include preparing your data (collecting queries, contexts, responses, and reference answers), selecting appropriate metrics that align with your evaluation goals, running the evaluation by processing your data through the selected metrics, and analyzing the results to interpret scores and identify areas for improvement. Specialized evaluation metrics offered by Ragas include Faithfulness (measuring factual consistency with context), Factual Correctness (assessing accuracy and freedom from factual errors), Response Relevancy (evaluating if the response addresses the user query), Context Entity Recall (measuring how well the retrieved context captures relevant entities), Noise Sensitivity (assessing robustness to irrelevant context), and LLM Context Recall (evaluating how effectively the LLM uses the provided context to generate the response).",
- "multi_hop_abstract_query_synthesizer"
+ "How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
+ "['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n \"user_input\": \"What is the capital of France?\",\\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor']",
+ "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
+ "single_hop_specifc_query_synthesizer"
],
[
"5",
- "How does Ragas facilitate both test data generation and synthetic data generation for evaluating Retrieval-Augmented Generation (RAG) systems, and what are the key steps and tools involved in creating robust synthetic test datasets as described in the blog series?",
- "['<1-hop>\\n\\ntitle: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and', '<2-hop>\\n\\nHow to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. 
Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation', \"<3-hop>\\n\\nWhat's Coming in This Blog Series This introduction is just the beginning. In the upcoming posts, we'll dive deeper into all aspects of evaluating LLM applications with Ragas: Part 2: Basic Evaluation Workflow We'll explore each metric in detail, explaining when and how to use them effectively. Part 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance. Part 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application's capabilities. Part 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments. Part 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. Part 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows. Part 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications. Conclusion In a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications. Ready to Elevate Your LLM Applications? Start exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to reach out—we'd love to help!\"]",
- "Ragas facilitates test data generation and synthetic data generation for evaluating Retrieval-Augmented Generation (RAG) systems by providing a structured workflow and specialized tools. According to the blog series, high-quality test datasets are essential for meaningful evaluation of LLM applications. Ragas enables the creation of robust test datasets by supporting document-based, domain-specific, and adversarial test generation techniques (<1-hop>). For synthetic data generation, Ragas allows developers to quickly create test sets that mimic real user queries and contexts, which is especially useful when real-world data is scarce or expensive to label. The process involves several key steps: preparing source documents, building a knowledge graph using Ragas (which captures entities, relationships, and summaries), and configuring query synthesizers such as SingleHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer, and MultiHopSpecificQuerySynthesizer to generate diverse and challenging questions. The TestsetGenerator in Ragas then creates a synthetic dataset containing questions, reference answers, and supporting contexts. This synthetic dataset can be loaded into evaluation platforms like LangSmith for automated assessment and iterative improvement of the RAG pipeline (<2-hop>). The blog series further outlines that these practices are part of a comprehensive approach to evaluating LLM applications, with future posts covering advanced evaluation techniques and feedback loops for continuous improvement (<3-hop>).",
+ "How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
+ "['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration** \\n Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity** \\n You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
+ "Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
"multi_hop_abstract_query_synthesizer"
],
[
"6",
- "Wht speshulized evalushun metrix does Ragas provied for LLMs, and how do you selekt and configure these metrix in a basic evalushun workflow?",
- "['<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true In our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline. Understanding the Evaluation Workflow A typical Ragas evaluation workflow consists of four key steps: Prepare your data: Collect queries, contexts, responses, and reference answers Select appropriate metrics: Choose metrics that align with what you want to evaluate Run the evaluation: Process your data through the selected metrics Analyze the results: Interpret scores and identify areas for improvement Let\\'s walk through each step with practical examples. Step 1: Setting Up Your Environment First, ensure you have Ragas installed: bash uv add ragas Next, import the necessary components: python import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity Step 2: Preparing Your Evaluation Data For a RAG system evaluation, you\\'ll need: Questions: User queries to your system Contexts: Documents or chunks retrieved by your system Responses: Answers generated by your system Ground truth (optional): Reference answers or documents for comparison Here\\'s how to organize this data: ```python Sample data data = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] } eval_data = pd.DataFrame(data) Convert to a format Ragas can use evaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset ``` Step 3: Selecting and Configuring Metrics Ragas offers various metrics to evaluate different aspects of your system: Core RAG Metrics: Faithfulness: Measures if the response is factually consistent with the provided context. Factual Correctness: Assesses if the response is accurate and free from factual errors. Response Relevancy: Evaluates if the response directly addresses the user query. Context Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth. Noise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context. LLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response. For metrics that require an LLM (like faithfulness), you need to configure the LLM provider: ```python Configure LLM for evaluation from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper Initialize the LLM, you are going to OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Define metrics to use metrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```', '<2-hop>\\n\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true As Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in. What is Ragas? Ragas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems. At its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations? Why Evaluate LLM Applications? LLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable. 
Evaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application Key Features of Ragas 🎯 Specialized Metrics Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications: Faithfulness: Measures if the response is factually consistent with the retrieved context Context Relevancy: Evaluates if the retrieved information is relevant to the query Answer Relevancy: Assesses if the response addresses the user\\'s question Topic Adherence: Gauges how well multi-turn conversations stay on topic 🧪 Test Data Generation Creating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage. 🔗 Seamless Integrations Ragas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI Observability platforms - Phoenix - LangSmith - Langfuse 📊 Comprehensive Analysis Beyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements. Getting Started with Ragas Installing Ragas is straightforward: bash uv init && uv add ragas Here\\'s a simple example of evaluating a response using Ragas: ```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI Initialize the LLM, you are going to new OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Your evaluation data test_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor Create metric faithfulness = Faithfulness(llm=evaluator_llm) Calculate the score result = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ``` 💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas']",
- "Ragas provieds speshulized evalushun metrix for LLMs, such as Faithfulness, Factual Correctness, Response Relevancy, Context Entity Recall, Noise Sensitivity, and LLM Context Recall. These metrix are taylored to address the unique challeenges of LLM-powred systems, like ensuring responses are factually consistant with the context and relevunt to the user query. In a basic evalushun workflow, you selekt metrix that align with your evalushun goals, then configure them—sum metrix, like Faithfulness, require setting up an LLM provider (for example, using LangchainLLMWrapper with a model like gpt-4o). You then run your evalushun by processing your data through the selekted metrix to analyze results and identify improvemint areas.",
+ "How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
+ "['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:** \\n> Explore the hands-on notebook for this workflow: \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\" \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
+ "Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
"multi_hop_abstract_query_synthesizer"
],
[
"7",
- "Which specialized metrics does Ragas provide for evaluating Retrieval-Augmented Generation (RAG) systems, and how do these metrics address the unique evaluation challenges posed by the multi-component nature of RAG systems?",
- "['<1-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true In our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature. Understanding RAG Systems: More Than the Sum of Their Parts RAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information This dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content. The RAG Evaluation Triad Effective RAG evaluation requires examining three key dimensions: Retrieval Quality: How well does the system find relevant information? Generation Quality: How well does the system produce responses from retrieved information? End-to-End Performance: How well does the complete system satisfy user needs? Let\\'s explore how Ragas helps evaluate each dimension of RAG systems.', \"<2-hop>\\n\\nCore RAG Metrics in Ragas Ragas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance. Retrieval Quality Metrics 1. Context Relevancy Measures how relevant the retrieved documents are to the user's question. How it works: Takes the user's question (user_input) and the retrieved documents (retrieved_contexts). Uses an LLM to score relevance with two different prompts, averaging the results for robustness. Scores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant). Why it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step. 2. Context Precision Assesses how much of the retrieved context is actually useful for generating the answer. How it works: For each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response. Calculates Average Precision, rewarding systems that rank useful chunks higher. Variants: ContextUtilization: Uses the generated response instead of ground truth. Non-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity. Why it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included. 3. Context Recall Evaluates whether all necessary information from the ground truth answer is present in the retrieved context. How it works: Breaks down the reference answer into sentences. For each sentence, an LLM checks if it can be supported by the retrieved context. The score is the proportion of reference sentences attributed to the retrieved context. 
Variants: Non-LLM version: Compares reference and retrieved contexts using similarity and thresholds. Why it matters: High recall means your retriever finds all needed information; low recall means critical information is missing. Summary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information. Generation Quality Metrics 1. Faithfulness Checks if the generated answer is factually consistent with the retrieved context, addressing hallucination. How it works: Breaks the answer into simple statements. For each, an LLM checks if it can be inferred from the retrieved context. The score is the proportion of faithful statements. Alternative: FaithfulnesswithHHEM: Uses a specialized NLI model for verification. Why it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination. 2. Answer Relevancy Measures if the generated answer directly addresses the user's question. How it works: Asks an LLM to generate possible questions for the answer. Compares these to the original question using embedding similarity. Penalizes noncommittal answers. Why it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete. Summary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn't focus on the specific question. End-to-End Metrics 1. Correctness Assesses factual alignment between the generated answer and a ground truth reference. How it works: Breaks both the answer and reference into claims. Uses NLI to verify claims in both directions. Calculates precision, recall, or F1-score. Why it matters: High correctness means answers match the ground truth; low correctness signals factual errors. Key distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\", '<3-hop>\\n\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true As Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in. What is Ragas? Ragas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems. At its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? 
- How well does my application handle multi-turn conversations? Why Evaluate LLM Applications? LLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable. Evaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application Key Features of Ragas 🎯 Specialized Metrics Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications: Faithfulness: Measures if the response is factually consistent with the retrieved context Context Relevancy: Evaluates if the retrieved information is relevant to the query Answer Relevancy: Assesses if the response addresses the user\\'s question Topic Adherence: Gauges how well multi-turn conversations stay on topic 🧪 Test Data Generation Creating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage. 🔗 Seamless Integrations Ragas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI Observability platforms - Phoenix - LangSmith - Langfuse 📊 Comprehensive Analysis Beyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements. Getting Started with Ragas Installing Ragas is straightforward: bash uv init && uv add ragas Here\\'s a simple example of evaluating a response using Ragas: ```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI Initialize the LLM, you are going to new OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Your evaluation data test_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor Create metric faithfulness = Faithfulness(llm=evaluator_llm) Calculate the score result = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ``` 💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas', \"<4-hop>\\n\\nWhat's Coming in This Blog Series This introduction is just the beginning. In the upcoming posts, we'll dive deeper into all aspects of evaluating LLM applications with Ragas: Part 2: Basic Evaluation Workflow We'll explore each metric in detail, explaining when and how to use them effectively. Part 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance. Part 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application's capabilities. 
Part 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments. Part 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. Part 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows. Part 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications. Conclusion In a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications. Ready to Elevate Your LLM Applications? Start exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to reach out—we'd love to help!\"]",
- "Ragas provides specialized metrics for evaluating Retrieval-Augmented Generation (RAG) systems that address the unique challenges arising from their multi-component structure, which includes both retrieval and generation. The core metrics are divided into three key dimensions: Retrieval Quality, Generation Quality, and End-to-End Performance. For Retrieval Quality, Ragas offers Context Relevancy (measuring how relevant retrieved documents are to the user's question), Context Precision (assessing how much of the retrieved context is actually useful for generating the answer), and Context Recall (evaluating whether all necessary information from the ground truth answer is present in the retrieved context). For Generation Quality, Ragas includes Faithfulness (checking if the generated answer is factually consistent with the retrieved context) and Answer Relevancy (measuring if the generated answer directly addresses the user's question). For End-to-End Performance, the Correctness metric assesses factual alignment between the generated answer and a ground truth reference. These metrics collectively ensure that both the retrieval and generation components are evaluated individually and in combination, addressing the unique evaluation challenges of RAG systems.",
+ "How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
+ "['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1. **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2. **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3. **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n* **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n * **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n* **Autonomous Vehicles: Safety & Comfort Score**\\n * **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n * **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
+ "Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
"multi_hop_abstract_query_synthesizer"
],
[
"8",
- "How does RAGAS facilitate metric-driven development in RAG system evaluation, and what specific metrics does it introduce to improve the assessment process?",
- "['<1-hop>\\n\\nHow to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. 
Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation', '<2-hop>\\n\\ntitle: \"Metric-Driven Development: Make Smarter Decisions, Faster\" date: 2025-05-05T00:00:00-06:00 layout: blog description: \"Your Team\\'s Secret Weapon for Cutting Through Noise and Driving Real Progress. Learn how to use clear metrics to eliminate guesswork and make faster, smarter progress in your projects.\" categories: [\"Development\", \"Productivity\", \"AI\", \"Management\"] coverImage: \"/images/metric-driven-development.png\" readingTime: 9 published: true In today\\'s data-driven world, success depends increasingly on our ability to measure the right things at the right time. Whether you\\'re developing AI systems, building web applications, or managing projects, having clear metrics guides your team toward meaningful progress while eliminating subjective debates. The Power of Metrics in AI Evaluation Recent advances in generative AI and large language models (LLMs) highlight the critical importance of proper evaluation frameworks. Projects like RAGAS (Retrieval Augmented Generation Assessment System) demonstrate how specialized metrics can transform vague goals into actionable insights. For example, when evaluating retrieval-augmented generation systems, generic metrics like BLEU or ROUGE scores often fail to capture what truly matters - the accuracy, relevance, and contextual understanding of the generated responses. RAGAS instead introduces metrics specifically designed for RAG systems: Faithfulness: Measures how well the generated answer aligns with the retrieved context Answer Relevancy: Evaluates whether the response correctly addresses the user\\'s query Context Relevancy: Assesses if the system retrieves information that\\'s actually needed Context Precision: Quantifies how efficiently the system uses retrieved information These targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most. Imagine two teams building a new feature for a streaming platform: Team A is stuck in debates. Should they focus on improving video load speed or making the recommendation engine more accurate? One engineer insists, \"Faster videos keep users from leaving!\" Another counters, \"But better recommendations are what make them subscribe!\" They argue based on gut feelings. Team B operates differently. They have a clear, agreed-upon goal: Improve the average \"Watch Time per User\" metric, while ensuring video buffering times stay below 2 seconds. They rapidly test ideas, measuring the impact of each change against this specific target. 
Which team do you think will make faster, smarter progress? Team B has the edge because they\\'re using Metric-Driven Development (MDD). This is a powerful strategy where teams unite around measurable goals to eliminate guesswork and make real strides. Let\\'s break down how it works, what makes a metric truly useful, and see how industries from healthcare to e-commerce use it to succeed. What Exactly is Metric-Driven Development? Metric-Driven Development (MDD) is a simple but effective framework where teams: Define Clear, Measurable Goals: Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\"). Base Decisions on Data: Rely on evidence and measurements, not just opinions or assumptions. Iterate and Learn Quickly: Continuously measure the impact of changes to see what works and what doesn\\'t. Think of MDD as a GPS for your project. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently. Why Teams Struggle Without Clear Metrics Without a metric-driven approach, teams often fall into common traps: Chasing Too Many Goals: Trying to improve everything at once (\"We need higher accuracy and faster speed and lower costs!\") leads to scattered effort and slow progress. Endless Subjective Debates: Arguments arise that are hard to resolve with data (\"Is Model A\\'s slightly better performance worth the extra complexity?\"). Difficulty Measuring Progress: It\\'s hard to know if you\\'re actually improving (\"Are we doing better than last quarter? How can we be sure?\"). In machine learning (ML), this often happens when teams track various technical scores (like precision, recall, or F1 score – measures of model accuracy) without a single, unifying metric tied to the actual business outcome they want to achieve.']",
- "RAGAS facilitates metric-driven development in RAG system evaluation by providing tools to generate synthetic datasets and by introducing specialized metrics tailored for RAG systems. According to the context, RAGAS enables the creation of synthetic test sets that mimic real user queries and contexts, allowing teams to benchmark and compare system performance in a controlled and repeatable way. This supports rapid iteration and targeted improvements. In addition, RAGAS introduces specific metrics designed for RAG evaluation, such as Faithfulness (measuring alignment of generated answers with retrieved context), Answer Relevancy (evaluating if the response addresses the user’s query), Context Relevancy (assessing if the retrieved information is needed), and Context Precision (quantifying efficient use of retrieved information). These targeted metrics provide clearer direction than generic metrics, enabling teams to make precise, data-driven improvements and embodying the principles of metric-driven development.",
- "multi_hop_specific_query_synthesizer"
+ "How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
+ "[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n question: str\\n context: str\\n response: str\\n programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n score: float\\n feedback: str']",
+ "Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
+ "multi_hop_abstract_query_synthesizer"
],
[
"9",
- "How does the use of Ragas facilitate the evaluation of Retrieval-Augmented Generation (RAG) systems by generating robust EvaluationDatasets, and what are some best practices for ensuring comprehensive evaluation of AI agents according to the provided context?",
- "['<1-hop>\\n\\ntitle: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and', '<2-hop>\\n\\nImplementing Agent Evaluation in Practice Let\\'s look at a practical example of evaluating an AI agent using these metrics: ```python from ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import MultiTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper Initialize the LLM evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Example conversation with a travel booking agent test_data = { \"user_input\": [ {\"role\": \"user\", \"content\": \"I need to book a flight from New York to London next Friday\"}, {\"role\": \"assistant\", \"content\": \"I\\'d be happy to help you book a flight. Let me search for options...\", \"tool_calls\": [{\"name\": \"search_flights\", \"arguments\": {\"origin\": \"NYC\", \"destination\": \"LON\", \"date\": \"next Friday\"}}]}, {\"role\": \"tool\", \"name\": \"search_flights\", \"content\": \"Found 5 flights: Flight 1 (Delta, $750), Flight 2 (British Airways, $820)...\"}, {\"role\": \"assistant\", \"content\": \"I found several flights from New York to London next Friday. The cheapest option is Delta for $750. Would you like to book this one?\"}, {\"role\": \"user\", \"content\": \"Yes, please book the Delta flight\"}, {\"role\": \"assistant\", \"content\": \"I\\'ll book that for you now.\", \"tool_calls\": [{\"name\": \"book_flight\", \"arguments\": {\"flight_id\": \"delta_123\", \"price\": \"$750\"}}]}, {\"role\": \"tool\", \"name\": \"book_flight\", \"content\": \"Booking confirmed. Confirmation #: ABC123\"}, {\"role\": \"assistant\", \"content\": \"Great news! Your flight is confirmed. Your confirmation number is ABC123. The flight is scheduled for next Friday. 
Is there anything else you need help with?\"} ], \"reference_topics\": [\"travel\", \"flight booking\", \"schedules\", \"prices\"], \"reference_tool_calls\": [ {\"name\": \"search_flights\", \"args\": {\"origin\": \"NYC\", \"destination\": \"LON\", \"date\": \"next Friday\"}}, {\"name\": \"book_flight\", \"args\": {\"flight_id\": \"delta_123\", \"price\": \"$750\"}} ] } Create a sample sample = MultiTurnSample(**test_data) Initialize metrics goal_accuracy = AgentGoalAccuracyWithoutReference(llm=evaluator_llm) tool_accuracy = ToolCallAccuracy() topic_adherence = TopicAdherenceScore(llm=evaluator_llm) Calculate scores goal_score = await goal_accuracy.multi_turn_ascore(sample) tool_score = tool_accuracy.multi_turn_score(sample) topic_score = await topic_adherence.multi_turn_ascore(sample) print(f\"Goal Accuracy: {goal_score}\") print(f\"Tool Call Accuracy: {tool_score}\") print(f\"Topic Adherence: {topic_score}\") ``` 💡 Try it yourself: Explore the hands-on notebook for agent evaluation: 06_Evaluating_AI_Agents Advanced Agent Evaluation Techniques Combining Metrics for Comprehensive Evaluation For a complete assessment of agent capabilities, combine multiple metrics: ```python from ragas import evaluate results = evaluate( dataset, # Your dataset of agent conversations metrics=[ AgentGoalAccuracyWithoutReference(llm=evaluator_llm), ToolCallAccuracy(), TopicAdherence(llm=evaluator_llm) ] ) ``` Best Practices for Agent Evaluation Test scenario coverage: Include a diverse range of interaction scenarios Edge case handling: Test how agents handle unexpected inputs or failures Longitudinal evaluation: Track performance over time to identify regressions Human-in-the-loop validation: Periodically verify metric alignment with human judgments Continuous feedback loops: Use evaluation insights to guide agent improvements Conclusion Evaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas\\' agent_goal_accuracy, tool_call_accuracy, and topic_adherence provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries. By incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants. In our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows. Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents — You are here Next up in the series: Part 7: Integrations and Observability Part 8: Building Feedback Loops How are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "Ragas facilitates the evaluation of Retrieval-Augmented Generation (RAG) systems by enabling the generation of robust test datasets, which are essential for meaningful evaluation. According to the context, Ragas supports the creation of document-based, domain-specific, and adversarial test datasets, ensuring that LLM applications are tested under diverse and challenging scenarios. The EvaluationDataset class in Ragas allows for the structuring of multi-turn agent conversations, which can then be assessed using specialized metrics such as AgentGoalAccuracyWithoutReference, ToolCallAccuracy, and TopicAdherenceScore. These metrics provide insights into an agent's ability to complete tasks, use tools correctly, and adhere to designated topics. Best practices for comprehensive evaluation include covering a wide range of interaction scenarios, testing edge case handling, conducting longitudinal evaluations to track performance over time, incorporating human-in-the-loop validation to align metrics with human judgment, and establishing continuous feedback loops to guide agent improvements. By combining robust EvaluationDatasets with these best practices and metrics, Ragas enables a thorough and reliable evaluation process for AI agents in RAG systems.",
- "multi_hop_specific_query_synthesizer"
- ],
- [
- "10",
- "How can ChatOpenAI be integrated into a RAG evaluation pipeline for both synthetic data generation and advanced metric evaluation using Ragas, and what are the key steps involved in this process?",
- "['<1-hop>\\n\\nHow to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. 
Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation', '<2-hop>\\n\\ntitle: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true In our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs. Beyond the Basics: Why Advanced Metrics Matter While Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements: Domain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements Custom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches Specialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics Business-specific KPIs: Aligning evaluation with business objectives requires customized metrics Let\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs. Understanding Ragas\\' Metric Architecture Before creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture: 1. Understand the Metric Base Classes All metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these: SingleTurnMetric: For metrics that evaluate a single question/response pair. MultiTurnMetric: For metrics that evaluate multi-turn conversations. MetricWithLLM: For metrics that require an LLM for evaluation. MetricWithEmbeddings: For metrics that use embeddings. You can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric). 
Each metric implements specific scoring methods depending on its type: _single_turn_ascore: For single-turn metrics _multi_turn_ascore: For multi-turn metrics Creating Your First Custom Metric Let\\'s create a custom metric that evaluates technical accuracy in programming explanations: ```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t from ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel Define input/output models for the prompt class TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\" class TechnicalAccuracyOutput(BaseModel): score: float feedback: str Define the prompt class TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. \" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ] Create the metric @dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\", } } ) output_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS evaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt) async def _single_turn_ascore(self, sample, callbacks) -> float: assert self.llm is not None, \"LLM must be set\" question = sample.user_input response = sample.response # Extract programming language from question if possible programming_language = \"python\" # Default languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"] for lang in languages: if lang in question.lower(): programming_language = lang break # Get the context context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\" # Prepare input for prompt prompt_input = TechnicalAccuracyInput( question=question, context=context, response=response, programming_language=programming_language ) # Generate evaluation evaluation = await self.evaluation_prompt.generate( data=prompt_input, llm=self.llm, callbacks=callbacks ) return evaluation.score ``` Using the Custom Metric To use the custom metric, simply include it in your evaluation pipeline: ```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper Initialize the LLM, you are going to OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) test_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\") Note: The above code is a simplified example. 
In a real-world scenario, you would need to handle exceptions, `` You can also use theevaluate` function to evaluate a dataset: ```python from ragas import evaluate from ragas import evaluate results = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ``` 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization']",
- "ChatOpenAI can be integrated into a RAG evaluation pipeline for both synthetic data generation and advanced metric evaluation using Ragas by following a series of steps outlined in the provided context. First, for synthetic data generation, you initialize the generator with the LLM and embedding model by wrapping ChatOpenAI (for example, with LangchainLLMWrapper using ChatOpenAI(model=\"gpt-4.1\")) and OpenAIEmbeddings. This setup is used with Ragas’s TestsetGenerator to create a synthetic dataset containing questions, answers, and contexts for RAG evaluation. Second, for advanced metric evaluation, ChatOpenAI can be used as the underlying LLM in custom metrics within Ragas. For instance, when creating a custom metric such as TechnicalAccuracy, you initialize the evaluator LLM with LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")), and use it to score responses based on criteria like syntax correctness and algorithmic accuracy. The key steps are: (1) loading and preparing source documents, (2) initializing the LLM and embedding models with ChatOpenAI, (3) generating synthetic datasets with Ragas, and (4) evaluating the datasets using both built-in and custom metrics powered by ChatOpenAI as the LLM. This approach enables robust, repeatable evaluation and supports both standard and domain-specific assessment needs.",
- "multi_hop_specific_query_synthesizer"
- ],
- [
- "11",
- "How does Ragas facilitate the comprehensive evaluation of RAG systems by addressing both their retrieval and generation components, and how does this approach differ when evaluating more complex AI agents that use tools and pursue specific goals?",
- "['<1-hop>\\n\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\" date: 2025-04-28T06:00:00-06:00 layout: blog description: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\" categories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"] coverImage: \"/images/ai_agent_evaluation.png\" readingTime: 8 published: true In our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. ', '<2-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true In our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature. Understanding RAG Systems: More Than the Sum of Their Parts RAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information This dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content. The RAG Evaluation Triad Effective RAG evaluation requires examining three key dimensions: Retrieval Quality: How well does the system find relevant information? Generation Quality: How well does the system produce responses from retrieved information? End-to-End Performance: How well does the complete system satisfy user needs? Let\\'s explore how Ragas helps evaluate each dimension of RAG systems.']",
- "Ragas facilitates the comprehensive evaluation of RAG systems by providing specialized metrics that assess both the retrieval and generation components. For RAG systems, Ragas evaluates retrieval quality (how well relevant information is found), generation quality (how accurately and coherently responses are produced from retrieved information), and end-to-end performance (how well the system satisfies user needs). This approach ensures that both the individual components and their interaction are thoroughly assessed. When evaluating more complex AI agents, Ragas extends its evaluation with additional specialized metrics for goal accuracy, tool call accuracy, and topic adherence, reflecting the increased sophistication required for agents that engage in multi-turn interactions, use tools, and work toward specific goals.",
- "multi_hop_specific_query_synthesizer"
+ "How do observability best practices contribute to building production-ready AI systems?",
+ "['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
+ "Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
+ "multi_hop_abstract_query_synthesizer"
]
],
"shape": {
"columns": 4,
- "rows": 12
+ "rows": 10
}
},
"text/html": [
@@ -334,151 +461,129 @@
"
\n",
" \n",
" 0 | \n",
- " Does Ragas support integration with Langfuse? | \n",
- " [title: \"Part 1: Introduction to Ragas: The Es... | \n",
- " Yes, Ragas works with observability platforms ... | \n",
+ " How are Large Language Models integrated into ... | \n",
+ " [---\\ntitle: \"Part 1: Introduction to Ragas: T... | \n",
+ " Large Language Models (LLMs) are becoming fund... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 1 | \n",
- " what Part 8: Building Feedback Loops do for LL... | \n",
- " [What's Coming in This Blog Series This introd... | \n",
- " Part 8: Building Feedback Loops show how to im... | \n",
+ " Howw does Ragas help evalute LLM applikations ... | \n",
+ " [## What is Ragas?\\n\\n[Ragas](https://docs.rag... | \n",
+ " Ragas is an open-source evaluation framework d... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 2 | \n",
- " How does Ragas assist with Evaluation of RAG s... | \n",
- " [title: \"Part 4: Generating Test Data with Rag... | \n",
- " Ragas helps generate robust test datasets for ... | \n",
+ " What specialized metrics does Ragas provide fo... | \n",
+ " [Evaluation serves several key purposes:\\n- **... | \n",
+ " Ragas offers both LLM-based and computational ... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 3 | \n",
- " Wut is OpenAIEmbeddings used for? | \n",
- " [How to Generate Synthetic Data for RAG Evalua... | \n",
- " OpenAIEmbeddings is used as an embedding model... | \n",
+ " me wanna know how LangSmith work with Ragas, l... | \n",
+ " [### 🧪 Test Data Generation\\nCreating high-qua... | \n",
+ " LangSmith is listed as one of the observabilit... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 4 | \n",
- " Wht are the key steps in the Ragas evalution w... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... | \n",
- " The key steps in the Ragas evaluation workflow... | \n",
- " multi_hop_abstract_query_synthesizer | \n",
+ " How do I use the OPENAI API key when initializ... | \n",
+ " [## Getting Started with Ragas\\n\\nInstalling R... | \n",
+ " To use the OPENAI API key when initializing an... | \n",
+ " single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 5 | \n",
- " How does Ragas facilitate both test data gener... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... | \n",
- " Ragas facilitates test data generation and syn... | \n",
+ " How does synthetic data generation contribute ... | \n",
+ " [<1-hop>\\n\\n## Why and How to Generate Synthet... | \n",
+ " Synthetic data generation enables rapid creati... | \n",
" multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 6 | \n",
- " Wht speshulized evalushun metrix does Ragas pr... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... | \n",
- " Ragas provieds speshulized evalushun metrix fo... | \n",
+ " How does Ragas support the evaluation of both ... | \n",
+ " [<1-hop>\\n\\n# Create a sample\\nsample = Single... | \n",
+ " Ragas supports the evaluation of LLM applicati... | \n",
" multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 7 | \n",
- " Which specialized metrics does Ragas provide f... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Sys... | \n",
- " Ragas provides specialized metrics for evaluat... | \n",
+ " How does Metric-Driven Development (MDD) utili... | \n",
+ " [<1-hop>\\n\\n## What Exactly is Metric-Driven D... | \n",
+ " Metric-Driven Development (MDD) utilizes combi... | \n",
" multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 8 | \n",
- " How does RAGAS facilitate metric-driven develo... | \n",
- " [<1-hop>\\n\\nHow to Generate Synthetic Data for... | \n",
- " RAGAS facilitates metric-driven development in... | \n",
- " multi_hop_specific_query_synthesizer | \n",
+ " How does Ragas provide specialized evaluation ... | \n",
+ " [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https... | \n",
+ " Ragas is an open-source evaluation framework s... | \n",
+ " multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 9 | \n",
- " How does the use of Ragas facilitate the evalu... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... | \n",
- " Ragas facilitates the evaluation of Retrieval-... | \n",
- " multi_hop_specific_query_synthesizer | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " How can ChatOpenAI be integrated into a RAG ev... | \n",
- " [<1-hop>\\n\\nHow to Generate Synthetic Data for... | \n",
- " ChatOpenAI can be integrated into a RAG evalua... | \n",
- " multi_hop_specific_query_synthesizer | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " How does Ragas facilitate the comprehensive ev... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 6: Evaluating AI Agen... | \n",
- " Ragas facilitates the comprehensive evaluation... | \n",
- " multi_hop_specific_query_synthesizer | \n",
+ " How do observability best practices contribute... | \n",
+ " [<1-hop>\\n\\n## Best Practices for Observabilit... | \n",
+ " Observability best practices, such as defining... | \n",
+ " multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " user_input \\\n",
- "0 Does Ragas support integration with Langfuse? \n",
- "1 what Part 8: Building Feedback Loops do for LL... \n",
- "2 How does Ragas assist with Evaluation of RAG s... \n",
- "3 Wut is OpenAIEmbeddings used for? \n",
- "4 Wht are the key steps in the Ragas evalution w... \n",
- "5 How does Ragas facilitate both test data gener... \n",
- "6 Wht speshulized evalushun metrix does Ragas pr... \n",
- "7 Which specialized metrics does Ragas provide f... \n",
- "8 How does RAGAS facilitate metric-driven develo... \n",
- "9 How does the use of Ragas facilitate the evalu... \n",
- "10 How can ChatOpenAI be integrated into a RAG ev... \n",
- "11 How does Ragas facilitate the comprehensive ev... \n",
+ " user_input \\\n",
+ "0 How are Large Language Models integrated into ... \n",
+ "1 Howw does Ragas help evalute LLM applikations ... \n",
+ "2 What specialized metrics does Ragas provide fo... \n",
+ "3 me wanna know how LangSmith work with Ragas, l... \n",
+ "4 How do I use the OPENAI API key when initializ... \n",
+ "5 How does synthetic data generation contribute ... \n",
+ "6 How does Ragas support the evaluation of both ... \n",
+ "7 How does Metric-Driven Development (MDD) utili... \n",
+ "8 How does Ragas provide specialized evaluation ... \n",
+ "9 How do observability best practices contribute... \n",
"\n",
- " reference_contexts \\\n",
- "0 [title: \"Part 1: Introduction to Ragas: The Es... \n",
- "1 [What's Coming in This Blog Series This introd... \n",
- "2 [title: \"Part 4: Generating Test Data with Rag... \n",
- "3 [How to Generate Synthetic Data for RAG Evalua... \n",
- "4 [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... \n",
- "5 [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... \n",
- "6 [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... \n",
- "7 [<1-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Sys... \n",
- "8 [<1-hop>\\n\\nHow to Generate Synthetic Data for... \n",
- "9 [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... \n",
- "10 [<1-hop>\\n\\nHow to Generate Synthetic Data for... \n",
- "11 [<1-hop>\\n\\ntitle: \"Part 6: Evaluating AI Agen... \n",
+ " reference_contexts \\\n",
+ "0 [---\\ntitle: \"Part 1: Introduction to Ragas: T... \n",
+ "1 [## What is Ragas?\\n\\n[Ragas](https://docs.rag... \n",
+ "2 [Evaluation serves several key purposes:\\n- **... \n",
+ "3 [### 🧪 Test Data Generation\\nCreating high-qua... \n",
+ "4 [## Getting Started with Ragas\\n\\nInstalling R... \n",
+ "5 [<1-hop>\\n\\n## Why and How to Generate Synthet... \n",
+ "6 [<1-hop>\\n\\n# Create a sample\\nsample = Single... \n",
+ "7 [<1-hop>\\n\\n## What Exactly is Metric-Driven D... \n",
+ "8 [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https... \n",
+ "9 [<1-hop>\\n\\n## Best Practices for Observabilit... \n",
"\n",
- " reference \\\n",
- "0 Yes, Ragas works with observability platforms ... \n",
- "1 Part 8: Building Feedback Loops show how to im... \n",
- "2 Ragas helps generate robust test datasets for ... \n",
- "3 OpenAIEmbeddings is used as an embedding model... \n",
- "4 The key steps in the Ragas evaluation workflow... \n",
- "5 Ragas facilitates test data generation and syn... \n",
- "6 Ragas provieds speshulized evalushun metrix fo... \n",
- "7 Ragas provides specialized metrics for evaluat... \n",
- "8 RAGAS facilitates metric-driven development in... \n",
- "9 Ragas facilitates the evaluation of Retrieval-... \n",
- "10 ChatOpenAI can be integrated into a RAG evalua... \n",
- "11 Ragas facilitates the comprehensive evaluation... \n",
+ " reference \\\n",
+ "0 Large Language Models (LLMs) are becoming fund... \n",
+ "1 Ragas is an open-source evaluation framework d... \n",
+ "2 Ragas offers both LLM-based and computational ... \n",
+ "3 LangSmith is listed as one of the observabilit... \n",
+ "4 To use the OPENAI API key when initializing an... \n",
+ "5 Synthetic data generation enables rapid creati... \n",
+ "6 Ragas supports the evaluation of LLM applicati... \n",
+ "7 Metric-Driven Development (MDD) utilizes combi... \n",
+ "8 Ragas is an open-source evaluation framework s... \n",
+ "9 Observability best practices, such as defining... \n",
"\n",
- " synthesizer_name \n",
- "0 single_hop_specifc_query_synthesizer \n",
- "1 single_hop_specifc_query_synthesizer \n",
- "2 single_hop_specifc_query_synthesizer \n",
- "3 single_hop_specifc_query_synthesizer \n",
- "4 multi_hop_abstract_query_synthesizer \n",
- "5 multi_hop_abstract_query_synthesizer \n",
- "6 multi_hop_abstract_query_synthesizer \n",
- "7 multi_hop_abstract_query_synthesizer \n",
- "8 multi_hop_specific_query_synthesizer \n",
- "9 multi_hop_specific_query_synthesizer \n",
- "10 multi_hop_specific_query_synthesizer \n",
- "11 multi_hop_specific_query_synthesizer "
+ " synthesizer_name \n",
+ "0 single_hop_specifc_query_synthesizer \n",
+ "1 single_hop_specifc_query_synthesizer \n",
+ "2 single_hop_specifc_query_synthesizer \n",
+ "3 single_hop_specifc_query_synthesizer \n",
+ "4 single_hop_specifc_query_synthesizer \n",
+ "5 multi_hop_abstract_query_synthesizer \n",
+ "6 multi_hop_abstract_query_synthesizer \n",
+ "7 multi_hop_abstract_query_synthesizer \n",
+ "8 multi_hop_abstract_query_synthesizer \n",
+ "9 multi_hop_abstract_query_synthesizer "
]
},
- "execution_count": 33,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -490,7 +595,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "4ae903d8",
"metadata": {},
"outputs": [],
@@ -500,15 +605,22 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 8,
"id": "86ab0d3b",
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded vector store from ./db/vector_store_5\n"
+ ]
+ },
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|██████████| 12/12 [00:59<00:00, 4.99s/it]\n"
+ "100%|██████████| 10/10 [00:31<00:00, 3.12s/it]\n"
]
}
],
@@ -519,7 +631,7 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 10,
"id": "704669a4",
"metadata": {},
"outputs": [
@@ -564,120 +676,102 @@
}
],
"conversionMethod": "pd.DataFrame",
- "ref": "94e680f7-f55f-4f21-8d9d-fb692e52337c",
+ "ref": "492e8746-11d6-4694-8db9-159cbebf395c",
"rows": [
[
"0",
- "Does Ragas support integration with Langfuse?",
- "['title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 7: Integrations and Observability with Ragas\" date: 2025-04-30T07:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/integrations-and-observability.png\" readingTime: 12 published: true\\n\\nPart 6: Integrations and Observability with Ragas\\n\\nIn our previous post, we explored how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications. Now, let\\'s discuss how to integrate Ragas into your broader LLM development ecosystem and establish observability practices that transform evaluation from a one-time exercise into a continuous improvement cycle.\\n\\nWhy Integrations and Observability Matter\\n\\nEvaluation is most powerful when it\\'s:\\n\\nIntegrated into your existing workflow and tools\\n\\nAutomated to run consistently with minimal friction\\n\\nObservable so insights are easily accessible and actionable\\n\\nContinuous rather than a one-time or sporadic effort\\n\\nLet\\'s explore how Ragas helps you achieve these goals through its extensive integration capabilities.\\n\\nFramework Integrations\\n\\nRagas seamlessly connects with popular LLM application frameworks, allowing you to evaluate systems built with your preferred tools.\\n\\nLangChain Integration\\n\\nFor LangChain-based applications, Ragas provides dedicated integration support. Here’s how you can integrate Ragas step by step:\\n\\nPrepare your documents: Load your source documents and split them into manageable chunks for retrieval.\\n\\nSet up vector storage: Embed the document chunks and store them in a vector database to enable efficient retrieval.\\n\\nConfigure the retriever and QA chain: Use LangChain components to create a retriever and a question-answering (QA) chain powered by your chosen language model.\\n\\nGenerate a test set: Use Ragas to automatically generate a set of test questions and answers from your documents, or supply your own.\\n\\nEvaluate retrieval and QA performance: Apply Ragas metrics to assess both the retriever and the full QA chain, measuring aspects like context relevancy, faithfulness, and answer quality.\\n\\nReview results: Analyze the evaluation outputs to identify strengths and areas for improvement in your RAG pipeline.\\n\\nThis integration allows you to continuously measure and improve the effectiveness of your retrieval and generation components within the LangChain framework.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nRagas supports integration with a variety of popular LLM and RAG frameworks beyond LangChain, including LlamaIndex and Haystack. 
These integrations enable seamless evaluation of retrieval and generation components within your preferred stack. If you need guidance or code examples for integrating Ragas with platforms such as LlamaIndex, Haystack, or others, support and tailored examples can be provided on demand to fit your specific workflow and requirements.\\n\\nObservability Platform Integrations\\n\\nBeyond framework integrations, Ragas connects with leading observability platforms to help you monitor, track, and analyze evaluation results over time.\\n\\nLangSmith Integration\\n\\nFor LangChain users, LangSmith provides comprehensive tracing and evaluation. To integrate Ragas evaluation with LangSmith, follow these steps:\\n\\nSet up your environment\\n\\nUpload dataset to LangSmith\\n\\nDefine your LLM or chain\\n\\nSelect Ragas metrics\\n\\nRun evaluation with LangSmith\\n\\nYou can now view detailed experiment results in your LangSmith project dashboard. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within LangSmith, leveraging Ragas metrics for deeper insights.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nOther Platform Integrations\\n\\nRagas can be integrated with a range of observability and monitoring platforms beyond LangSmith, such as Langfuse and others. If you need help connecting Ragas to platforms like Langfuse or have specific requirements for your observability stack, tailored support and examples are available to fit your workflow.\\n\\nBuilding Automated Evaluation Pipelines\\n\\nTo ensure evaluation is a continuous part of your development process, set up automated pipelines that run evaluations regularly and automatically.\\n\\nCI/CD Integration\\n\\nYou can incorporate Ragas into your CI/CD pipeline so that every code change is automatically evaluated. This helps catch regressions early and ensures your RAG system maintains high performance before merging new changes.\\n\\nScheduled Evaluations\\n\\nRegularly scheduled evaluations allow you to monitor your system’s performance over time. By running evaluations at set intervals, you can track trends, spot regressions, and ensure your system continues to meet quality standards.\\n\\nMonitoring Evaluation Metrics Over Time\\n\\nTracking evaluation metrics over time helps you identify performance trends and quickly detect any drops in quality. By visualizing these metrics, you can better understand how changes to your system impact its effectiveness.\\n\\nCreating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. 
Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\nBest Practices for Observability\\n\\nDefine clear thresholds: Establish performance baselines and alert thresholds for each metric\\n\\nSegment evaluations: Break down results by query type, data source, or other relevant factors\\n\\nHistorical tracking: Maintain historical evaluation data to identify trends and regressions\\n\\nCorrelation analysis: Link evaluation metrics to user feedback and business outcomes\\n\\nRegular benchmarking: Periodically evaluate against fixed test sets to ensure consistency\\n\\nAlert on regressions: Implement automated alerts when metrics drop below thresholds\\n\\nContextualize metrics: Include example failures alongside aggregate metrics for better understanding\\n\\nBuilding a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:\\n\\nCapture evaluation results with Ragas\\n\\nIdentify patterns in failures and underperforming areas\\n\\nPrioritize improvements based on impact and effort\\n\\nImplement changes to your RAG components\\n\\nValidate improvements with focused re-evaluation\\n\\nMonitor continuously to catch regressions\\n\\nConclusion: From Evaluation to Action\\n\\nIntegrating Ragas with your frameworks and observability tools transforms evaluation from a point-in-time activity to a continuous improvement cycle. By making evaluation metrics visible, actionable, and integrated into your workflows, you create a foundation for systematic improvement of your LLM applications.\\n\\nThe most successful teams don\\'t just evaluate occasionally — they build evaluation into their development culture, making data-driven decisions based on objective metrics rather than subjective impressions.\\n\\nIn our final post, we\\'ll explore how to build effective feedback loops that translate evaluation insights into concrete improvements for your LLM applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability with Ragas — You are here Next up in the series: Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. 
Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. 
Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!']",
- "['title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true As Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in. What is Ragas? Ragas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems. At its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations? Why Evaluate LLM Applications? LLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable. Evaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application Key Features of Ragas 🎯 Specialized Metrics Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications: Faithfulness: Measures if the response is factually consistent with the retrieved context Context Relevancy: Evaluates if the retrieved information is relevant to the query Answer Relevancy: Assesses if the response addresses the user\\'s question Topic Adherence: Gauges how well multi-turn conversations stay on topic 🧪 Test Data Generation Creating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage. 🔗 Seamless Integrations Ragas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI Observability platforms - Phoenix - LangSmith - Langfuse 📊 Comprehensive Analysis Beyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements. 
Getting Started with Ragas Installing Ragas is straightforward: bash uv init && uv add ragas Here\\'s a simple example of evaluating a response using Ragas: ```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI Initialize the LLM, you are going to new OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Your evaluation data test_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor Create metric faithfulness = Faithfulness(llm=evaluator_llm) Calculate the score result = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ``` 💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas']",
- "Yes, Ragas supports integration with Langfuse, allowing you to monitor, track, and analyze evaluation results over time. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within Langfuse.\n\nFor more details, you can explore the following links:\n\n- [Integrations and Observability with Ragas](https://thedataguy.pro/blog/integrations-and-observability-with-ragas/)\n- [Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Evaluating RAG Systems with Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)",
- "Yes, Ragas works with observability platforms such as Langfuse.",
+ "How are Large Language Models integrated into modern applications, and why is their performance evaluation considered critical according to the context?",
+ "['| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', '| **Metric** | **Comprehensive RAG Evaluation** | **Content Quality Evaluation** | **Retrieval Quality Evaluation** |\\n|-----------------------------|----------------------------------|---------------------------------|-----------------------------------|\\n| **Faithfulness** | ✓ | ✓ | |\\n| **Answer Relevancy** | ✓ | ✓ | |\\n| **Context Recall** | ✓ | | ✓ |\\n| **Context Precision** | ✓ | | ✓ |\\n| **Harmfulness** | | ✓ | |', \"**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe'll explore each metric in detail, explaining when and how to use them effectively.\\n\\n**[Part 3: Evaluating RAG Systems](/blog/evaluating-rag-systems-with-ragas/)** \\nLearn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\nDiscover how to create high-quality test datasets that thoroughly exercise your application's capabilities.\\n\\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\nGo beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\nLearn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\", \"What excites me most is that my experience in enterprise software development gives me a unique perspective on AI implementation. I understand not just the algorithms and models, but also how to integrate them into robust, production-ready systems that deliver real value.\\n\\n## The Best of Both Worlds\\n\\nComing back to AI doesn't mean leaving behind everything I learned in web and enterprise development. Quite the opposite - I believe my background gives me a particular advantage in building AI systems that are:\"]",
+ "['---\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\"\\ndate: 2025-04-26T18:00:00-06:00\\nlayout: blog\\ndescription: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\"\\ncategories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"]\\ncoverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\"\\nreadingTime: 7\\npublished: true\\n---\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\n## What is Ragas?']",
+ "I don't know.",
+ "Large Language Models (LLMs) are becoming fundamental components of modern applications, such as question-answering systems, document retrieval tools, and conversational agents. Effectively evaluating their performance is considered increasingly critical to ensure reliable metrics for assessing how well these applications perform.",
"single_hop_specifc_query_synthesizer"
],
[
"1",
- "what Part 8: Building Feedback Loops do for LLM app devs?",
- "['title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true\\n\\nIn our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline.\\n\\nUnderstanding the Evaluation Workflow\\n\\nA typical Ragas evaluation workflow consists of four key steps:\\n\\nPrepare your data: Collect queries, contexts, responses, and reference answers\\n\\nSelect appropriate metrics: Choose metrics that align with what you want to evaluate\\n\\nRun the evaluation: Process your data through the selected metrics\\n\\nAnalyze the results: Interpret scores and identify areas for improvement\\n\\nLet\\'s walk through each step with practical examples.\\n\\nStep 1: Setting Up Your Environment\\n\\nFirst, ensure you have Ragas installed:\\n\\nbash uv add ragas\\n\\nNext, import the necessary components:\\n\\npython import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\\n\\nStep 2: Preparing Your Evaluation Data\\n\\nFor a RAG system evaluation, you\\'ll need:\\n\\nQuestions: User queries to your system\\n\\nContexts: Documents or chunks retrieved by your system\\n\\nResponses: Answers generated by your system\\n\\nGround truth (optional): Reference answers or documents for comparison\\n\\nHere\\'s how to organize this data:\\n\\n```python\\n\\nSample data\\n\\ndata = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] }\\n\\neval_data = pd.DataFrame(data)\\n\\nConvert to a format Ragas can use\\n\\nevaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset\\n\\n```\\n\\nStep 3: Selecting and Configuring Metrics\\n\\nRagas offers various metrics to evaluate different aspects of your system:\\n\\nCore RAG Metrics:\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nFactual Correctness: Assesses if the response is accurate and free from factual errors.\\n\\nResponse Relevancy: Evaluates if the response directly addresses the user query.\\n\\nContext Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth.\\n\\nNoise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context.\\n\\nLLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response.\\n\\nFor metrics that require an LLM (like faithfulness), you need to configure the LLM provider:\\n\\n```python\\n\\nConfigure LLM for evaluation\\n\\nfrom langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nDefine metrics to use\\n\\nmetrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```\\n\\nStep 4: Running the Evaluation\\n\\nNow, run the evaluation with your selected metrics:\\n\\n```python\\n\\nRun evaluation\\n\\nresults = evaluate( evaluation_dataset, metrics=metrics, llm=evaluator_llm # Required for LLM-based metrics )\\n\\nView results\\n\\nprint(results) ```\\n\\nOutput:\\n\\nValues will vary based on your data and LLM performance.\\n\\npython { \"faithfulness\": 1.0000, \"factual_correctness\": 0.6750, \"answer_relevancy\": 0.9897, \"context_entity_recall\": 0.8889, \"noise_sensitivity_relevant\": 0.1667, \"context_recall\": 0.5000 }\\n\\nStep 5: Interpreting Results\\n\\nRagas metrics typically return scores between 0 and 1, where higher is better:\\n\\nUnderstanding Score Ranges:\\n\\n0.8-1.0: Excellent performance\\n\\n0.6-0.8: Good performance\\n\\n0.4-0.6: Moderate performance, needs improvement\\n\\n0.4 or lower: Poor performance, requires significant attention\\n\\nAdvanced Use: Custom Evaluation for Specific Examples\\n\\nFor more detailed analysis of specific examples:\\n\\n```python from ragas import SingleTurnSample from ragas.metrics import AspectCritic\\n\\nDefine a specific test case\\n\\ntest_data = { \"user_input\": \"What are quantum computers?\", \"response\": \"Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.\", \"retrieved_contexts\": [\"Quantum computing is a type of computation that harnesses quantum mechanical phenomena.\"] }\\n\\nCreate a custom evaluation metric\\n\\ncustom_metric = AspectCritic( name=\"quantum_accuracy\", llm=llm, definition=\"Verify if the explanation of quantum computing is accurate and complete.\" )\\n\\nScore the sample\\n\\nsample = SingleTurnSample(**test_data) score = await custom_metric.single_turn_ascore(sample) print(f\"Quantum accuracy score: {score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 
02_Basic_Evaluation_Workflow_with_Ragas\\n\\nCommon Evaluation Patterns and Metrics\\n\\nBelow is a matrix mapping evaluation patterns to the metrics used, along with definitions for each metric:\\n\\nMetric Comprehensive RAG Evaluation Content Quality Evaluation Retrieval Quality Evaluation Faithfulness ✓ ✓ Answer Relevancy ✓ ✓ Context Recall ✓ ✓ Context Precision ✓ ✓ Harmfulness ✓ Coherence ✓ Context Relevancy ✓\\n\\nMetric Definitions\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nAnswer Relevancy: Assesses if the response addresses the question.\\n\\nContext Recall: Measures how well the retrieved context covers the information in the ground truth.\\n\\nContext Precision: Evaluates the proportion of relevant information in the retrieved context.\\n\\nHarmfulness: Evaluates if the response contains harmful or inappropriate content.\\n\\nCoherence: Measures the logical flow and clarity of the response.\\n\\nContext Relevancy: Evaluates if the retrieved context is relevant to the question.\\n\\nThis matrix provides a clear overview of which metrics to use for specific evaluation patterns and their respective definitions.\\n\\nBest Practices for Ragas Evaluation\\n\\nStart simple: Begin with core metrics before adding more specialized ones\\n\\nUse diverse test cases: Include a variety of questions, from simple to complex\\n\\nConsider edge cases: Test with queries that might challenge your system\\n\\nCompare versions: Track metrics across different versions of your application\\n\\nCombine with human evaluation: Use Ragas alongside human feedback for a comprehensive assessment\\n\\nConclusion\\n\\nSetting up a basic evaluation workflow with Ragas is straightforward yet powerful. By systematically evaluating your LLM applications, you gain objective insights into their performance and clear directions for improvement.\\n\\nIn our next post, we\\'ll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow — You are here Next up in the series: Part 3: Evaluating RAG Systems Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHave you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 8: Building Feedback Loops with Ragas\" date: 2025-05-04T00:00:00-06:00 layout: blog description: \"A research-driven guide to designing robust, actionable feedback loops for LLM and RAG systems using Ragas. Learn how to select metrics, set baselines, define thresholds, and incorporate user and human feedback for continuous improvement.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\", \"Data\"] coverImage: \"/images/building-feedback-loops.png\" readingTime: 10 published: true\\n\\nA high-performing LLM or RAG system is never static. The most successful teams treat evaluation as a continuous, iterative process—one that closes the loop between measurement, analysis, and improvement. 
In this post, we’ll design a research-backed feedback loop process using Ragas, focusing on actionable activities at each stage and strategies for integrating user and human feedback.\\n\\nDesigning the Feedback Loop: A Stepwise Process\\n\\nThe feedback loop process is a systematic approach to continuously improve your LLM or RAG system. It consists of seven key steps, each building on the previous one to create a sustainable cycle of evidence-driven progress.\\n\\nFeedback Loop Process\\n\\n1. Select the Right Metric\\n\\nPurpose: Identify metrics that best reflect your application’s goals and user needs.\\n\\nActivities: - Map business objectives to measurable outcomes (e.g., accuracy, faithfulness, relevancy). - Review available Ragas metrics and select those most aligned with your use case. - Periodically revisit metric selection as your product or user base evolves.\\n\\n2. Develop and Measure Baseline Metrics\\n\\nPurpose: Establish a reference point for current system performance.\\n\\nActivities: - Assemble a representative evaluation dataset. - Run your system and record metric scores for each example. - Document baseline results for all selected metrics. - Ensure the baseline dataset remains stable for future comparisons.\\n\\n3. Analyze and Define Acceptable Threshold Values\\n\\nPurpose: Set clear, actionable standards for what constitutes “good enough” performance.\\n\\nActivities: - Analyze baseline metric distributions (mean, variance, outliers). - Consult stakeholders to define minimum acceptable values for each metric. - Document thresholds and rationale for transparency. - Consider different thresholds for different segments (e.g., critical vs. non-critical queries).\\n\\n4. Evaluate and Select Improvement Areas\\n\\nPurpose: Identify where your system most often fails to meet thresholds and prioritize improvements.\\n\\nActivities: - Segment evaluation results by metric, query type, or user group. - Identify patterns or clusters of failure (e.g., certain topics, long queries). - Prioritize areas with the greatest impact on user experience or business goals. - Formulate hypotheses about root causes.\\n\\n5. Implement Improvements\\n\\nPurpose: Take targeted actions to address identified weaknesses.\\n\\nActivities: - Design and implement changes (e.g., prompt tuning, retrieval upgrades, model fine-tuning). - Document all interventions and their intended effects. - Ensure changes are isolated for clear attribution of impact.\\n\\n6. Record Metrics for History\\n\\nPurpose: Build a longitudinal record to track progress and avoid regressions.\\n\\nActivities: - After each improvement, re-evaluate on the same baseline dataset. - Log metric scores, system version, date, and description of changes. - Visualize trends over time to inform future decisions.\\n\\nMetric Record Log Schema Example:\\n\\nTimestamp System Version Metric Name Value Dataset Name Change Description 2025-05-04T12:00:00 v1.2.0 faithfulness 0.78 baseline_v1 Added re-ranking to retriever 2025-05-04T12:00:00 v1.2.0 answer_relevancy 0.81 baseline_v1 Added re-ranking to retriever ... ... ... ... ... ...\\n\\n7. Repeat: Analyze, Evaluate, Implement, Record\\n\\nPurpose: Establish a sustainable, iterative cycle of improvement.\\n\\nActivities: - Regularly revisit analysis as new data or feedback emerges. - Continuously refine thresholds and priorities. 
- Maintain a culture of evidence-based iteration.\\n\\nIntegrating User Feedback in Production\\n\\nPurpose\\n\\nUser feedback provides real-world validation and uncovers blind spots in automated metrics. Incorporating it closes the gap between technical evaluation and actual user satisfaction.\\n\\nStrategies\\n\\nIn-Product Feedback Widgets: Allow users to rate answers or flag issues directly in the interface.\\n\\nPassive Signals: Analyze user behavior (e.g., follow-up queries, abandonment) as implicit feedback.\\n\\nFeedback Sampling: Periodically sample user sessions for manual review.\\n\\nFeedback Aggregation: Aggregate and categorize feedback to identify recurring pain points.\\n\\nMetric Correlation: Analyze how user feedback correlates with automated metrics to calibrate thresholds.\\n\\nRecording User Feedback\\n\\nUser Feedback Log Schema Example:\\n\\nTimestamp User ID Query ID User Rating Feedback Text Metric Scores System Version 2025-05-04T13:00:00 12345 q_987 2 \"Answer was off-topic\" {faithfulness: 0.6, answer_relevancy: 0.5} v1.2.0 2025-05-04T13:00:00 67890 q_654 4 \"Good answer, but could be more concise\" {faithfulness: 0.8, answer_relevancy: 0.9} v1.2.0 ... ... ... ... ... ... ...\\n\\nIncluding Human Labelers in Evaluation\\n\\nPurpose\\n\\nHuman labelers provide high-quality, nuanced judgments that automated metrics may miss, especially for ambiguous or complex queries.\\n\\nStrategies\\n\\nPeriodic Human Review: Regularly sample evaluation outputs for human annotation.\\n\\nDisagreement Analysis: Focus human review on cases where user feedback and metrics disagree.\\n\\nLabeler Training: Provide clear guidelines and calibration sessions to ensure consistency.\\n\\nHybrid Scoring: Combine human and automated scores for a more holistic evaluation.\\n\\nContinuous Calibration: Use human labels to refine and validate automated metric thresholds.\\n\\nConclusion\\n\\nA robust feedback loop is the foundation of sustainable improvement for LLM and RAG systems. By systematically selecting metrics, measuring baselines, setting thresholds, and integrating both user and human feedback, you create a virtuous cycle of evidence-driven progress. The most effective teams treat evaluation as an ongoing process—one that is deeply connected to real user outcomes and grounded in transparent, repeatable measurement.\\n\\nThis is the eighth part of a series on Ragas, a research-driven evaluation framework for LLM and RAG systems. If you missed the previous parts, check them out below:\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops — You are here\\n\\nHave questions or want to share your feedback loop strategies? Connect with me on LinkedIn for discussion or collaboration!']",
- "[\"What's Coming in This Blog Series This introduction is just the beginning. In the upcoming posts, we'll dive deeper into all aspects of evaluating LLM applications with Ragas: Part 2: Basic Evaluation Workflow We'll explore each metric in detail, explaining when and how to use them effectively. Part 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance. Part 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application's capabilities. Part 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments. Part 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. Part 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows. Part 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications. Conclusion In a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications. Ready to Elevate Your LLM Applications? Start exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to reach out—we'd love to help!\"]",
- "Part 8: Building Feedback Loops with Ragas provides LLM app developers with a systematic approach to continuously improve their applications. It outlines a stepwise process for designing feedback loops that integrate user and human feedback into the evaluation process. Key activities include selecting appropriate metrics, establishing baseline performance, defining acceptable thresholds, identifying improvement areas, implementing changes, and recording metrics over time. This iterative cycle helps ensure that LLM applications evolve based on real user outcomes and maintain high performance.\n\nFor more details, you can explore the full article here: [Building Feedback Loops with Ragas](https://thedataguy.pro/blog/building-feedback-loops-with-ragas/). \n\nYou may also find these related parts useful:\n- [Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Part 2: Basic Evaluation Workflow with Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)\n- [Part 5: Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
- "Part 8: Building Feedback Loops show how to implement feedback loops that drive continuous improvement in LLM applications and how to turn evaluation insights into concrete improvements for LLM applications.",
+ "Howw does Ragas help evalute LLM applikations in the helthcare industree, and why is this importent for ensuring akuracy and reliabilitee?",
+ "[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback 
Loops](/blog/building-feedback-loops-with-ragas/)**\"]",
+ "[\"## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\"]",
+ "I don't know.",
+ "Ragas is an open-source evaluation framework designed for LLM applications, with strengths in Retrieval-Augmented Generation systems. It provides specialized metrics to address challenges unique to LLM-powered systems, such as determining if the application retrieves the right information and if responses are factually accurate and consistent with the retrieved context. Evaluating LLM applications is especially important in healthcare, where accuracy and reliability are critical, because LLMs can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. Proper evaluation with frameworks like Ragas is non-negotiable in healthcare to ensure the system's accuracy and reliability.",
"single_hop_specifc_query_synthesizer"
],
[
"2",
- "How does Ragas assist with Evaluation of RAG systems?",
- "['title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. 
Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. 
Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true\\n\\nIn our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline.\\n\\nUnderstanding the Evaluation Workflow\\n\\nA typical Ragas evaluation workflow consists of four key steps:\\n\\nPrepare your data: Collect queries, contexts, responses, and reference answers\\n\\nSelect appropriate metrics: Choose metrics that align with what you want to evaluate\\n\\nRun the evaluation: Process your data through the selected metrics\\n\\nAnalyze the results: Interpret scores and identify areas for improvement\\n\\nLet\\'s walk through each step with practical examples.\\n\\nStep 1: Setting Up Your Environment\\n\\nFirst, ensure you have Ragas installed:\\n\\nbash uv add ragas\\n\\nNext, import the necessary components:\\n\\npython import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\\n\\nStep 2: Preparing Your Evaluation Data\\n\\nFor a RAG system evaluation, you\\'ll need:\\n\\nQuestions: User queries to your system\\n\\nContexts: Documents or chunks retrieved by your system\\n\\nResponses: Answers generated by your system\\n\\nGround truth (optional): Reference answers or documents for comparison\\n\\nHere\\'s how to organize this data:\\n\\n```python\\n\\nSample data\\n\\ndata = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] }\\n\\neval_data = pd.DataFrame(data)\\n\\nConvert to a format Ragas can use\\n\\nevaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset\\n\\n```\\n\\nStep 3: Selecting and Configuring Metrics\\n\\nRagas offers various metrics to evaluate different aspects of your system:\\n\\nCore RAG Metrics:\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nFactual Correctness: Assesses if the response is accurate and free from factual errors.\\n\\nResponse Relevancy: Evaluates if the response directly addresses the user query.\\n\\nContext Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth.\\n\\nNoise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context.\\n\\nLLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response.\\n\\nFor metrics that require an LLM (like faithfulness), you need to configure the LLM provider:\\n\\n```python\\n\\nConfigure LLM for evaluation\\n\\nfrom langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nDefine metrics to use\\n\\nmetrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```\\n\\nStep 4: Running the Evaluation\\n\\nNow, run the evaluation with your selected metrics:\\n\\n```python\\n\\nRun evaluation\\n\\nresults = evaluate( evaluation_dataset, metrics=metrics, llm=evaluator_llm # Required for LLM-based metrics )\\n\\nView results\\n\\nprint(results) ```\\n\\nOutput:\\n\\nValues will vary based on your data and LLM performance.\\n\\npython { \"faithfulness\": 1.0000, \"factual_correctness\": 0.6750, \"answer_relevancy\": 0.9897, \"context_entity_recall\": 0.8889, \"noise_sensitivity_relevant\": 0.1667, \"context_recall\": 0.5000 }\\n\\nStep 5: Interpreting Results\\n\\nRagas metrics typically return scores between 0 and 1, where higher is better:\\n\\nUnderstanding Score Ranges:\\n\\n0.8-1.0: Excellent performance\\n\\n0.6-0.8: Good performance\\n\\n0.4-0.6: Moderate performance, needs improvement\\n\\n0.4 or lower: Poor performance, requires significant attention\\n\\nAdvanced Use: Custom Evaluation for Specific Examples\\n\\nFor more detailed analysis of specific examples:\\n\\n```python from ragas import SingleTurnSample from ragas.metrics import AspectCritic\\n\\nDefine a specific test case\\n\\ntest_data = { \"user_input\": \"What are quantum computers?\", \"response\": \"Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.\", \"retrieved_contexts\": [\"Quantum computing is a type of computation that harnesses quantum mechanical phenomena.\"] }\\n\\nCreate a custom evaluation metric\\n\\ncustom_metric = AspectCritic( name=\"quantum_accuracy\", llm=llm, definition=\"Verify if the explanation of quantum computing is accurate and complete.\" )\\n\\nScore the sample\\n\\nsample = SingleTurnSample(**test_data) score = await custom_metric.single_turn_ascore(sample) print(f\"Quantum accuracy score: {score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 
02_Basic_Evaluation_Workflow_with_Ragas\\n\\nCommon Evaluation Patterns and Metrics\\n\\nBelow is a matrix mapping evaluation patterns to the metrics used, along with definitions for each metric:\\n\\nMetric Comprehensive RAG Evaluation Content Quality Evaluation Retrieval Quality Evaluation Faithfulness ✓ ✓ Answer Relevancy ✓ ✓ Context Recall ✓ ✓ Context Precision ✓ ✓ Harmfulness ✓ Coherence ✓ Context Relevancy ✓\\n\\nMetric Definitions\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nAnswer Relevancy: Assesses if the response addresses the question.\\n\\nContext Recall: Measures how well the retrieved context covers the information in the ground truth.\\n\\nContext Precision: Evaluates the proportion of relevant information in the retrieved context.\\n\\nHarmfulness: Evaluates if the response contains harmful or inappropriate content.\\n\\nCoherence: Measures the logical flow and clarity of the response.\\n\\nContext Relevancy: Evaluates if the retrieved context is relevant to the question.\\n\\nThis matrix provides a clear overview of which metrics to use for specific evaluation patterns and their respective definitions.\\n\\nBest Practices for Ragas Evaluation\\n\\nStart simple: Begin with core metrics before adding more specialized ones\\n\\nUse diverse test cases: Include a variety of questions, from simple to complex\\n\\nConsider edge cases: Test with queries that might challenge your system\\n\\nCompare versions: Track metrics across different versions of your application\\n\\nCombine with human evaluation: Use Ragas alongside human feedback for a comprehensive assessment\\n\\nConclusion\\n\\nSetting up a basic evaluation workflow with Ragas is straightforward yet powerful. By systematically evaluating your LLM applications, you gain objective insights into their performance and clear directions for improvement.\\n\\nIn our next post, we\\'ll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow — You are here Next up in the series: Part 3: Evaluating RAG Systems Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHave you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!']",
- "['title: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and']",
- "Ragas assists with the evaluation of Retrieval-Augmented Generation (RAG) systems by providing specialized metrics that address the unique challenges of these multi-component systems. It focuses on three key dimensions of evaluation:\n\n1. **Retrieval Quality**: Metrics like Context Relevancy, Context Precision, and Context Recall assess how well the system retrieves relevant information.\n2. **Generation Quality**: Metrics such as Faithfulness and Answer Relevancy evaluate the quality of the responses generated from the retrieved information.\n3. **End-to-End Performance**: Metrics like Correctness measure how well the entire system meets user needs.\n\nRagas enables a systematic evaluation cycle, allowing developers to measure performance, analyze weaknesses, and implement improvements iteratively.\n\nFor more detailed information, you can explore the following links:\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Basic Evaluation Workflow With Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)",
- "Ragas helps generate robust test datasets for evaluating Retrieval-Augmented Generation systems, including document-based, domain-specific, and adversarial test generation techniques.",
+ "What specialized metrics does Ragas provide for evaluating LLM applications?",
+ "[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '---\\n*This is the eighth part of a series on Ragas, a research-driven evaluation framework for LLM and RAG systems. If you missed the previous parts, check them out below:*\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**Part 8: Building Feedback Loops — _You are here_**', \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback 
Loops](/blog/building-feedback-loops-with-ragas/)**\"]",
+ "[\"Evaluation serves several key purposes:\\n- **Quality assurance**: Identify and fix issues before they reach users\\n- **Performance tracking**: Monitor how changes impact system performance\\n- **Benchmarking**: Compare different approaches objectively\\n- **Continuous improvement**: Build feedback loops to enhance your application\\n\\n## Key Features of Ragas\\n\\n### 🎯 Specialized Metrics\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\n- **Faithfulness**: Measures if the response is factually consistent with the retrieved context\\n- **Context Relevancy**: Evaluates if the retrieved information is relevant to the query\\n- **Answer Relevancy**: Assesses if the response addresses the user's question\\n- **Topic Adherence**: Gauges how well multi-turn conversations stay on topic\"]",
+ "I don't know.",
+ "Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications, including faithfulness, context relevancy, answer relevancy, and topic adherence.",
"single_hop_specifc_query_synthesizer"
],
[
"3",
- "Wut is OpenAIEmbeddings used for?",
- "['title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'layout: blog title: Coming Back to AI Roots - My Professional Journey date: 2025-04-14T00:00:00-06:00 description: A personal reflection on my career journey from AI to web and enterprise software development, and why I\\'m returning to my original passion for artificial intelligence. categories: [\"AI\", \"Personal Journey\", \"Technology\"] coverVideo: \"/videos/back_to_future.mp4\" readingTime: 4 published: true\\n\\nHave you ever felt that life has a way of bringing you full circle? That\\'s exactly how I feel about my career trajectory. My name is Muhammad Afzaal, and I\\'d like to share the story of my professional journey - from my early fascination with artificial intelligence, through years of web and enterprise software development, and now back to where it all began.\\n\\nThe Early AI Days\\n\\nMy professional journey began with a deep fascination for artificial intelligence. As a student, I was captivated by the potential of machines that could learn and make decisions. This was well before the current AI boom - back when neural networks were still considered somewhat niche and the term \"deep learning\" wasn\\'t yet a household phrase.\\n\\nI spent countless hours immersed in neural networks, image processing, and computer vision. My early career was defined by research projects and small-scale AI implementations - including Urdu OCR systems and data extraction from paper-based forms in 2003-2004. I still have vivid memories of recruiting fellow students to handwrite text samples, then meticulously scanning, labeling, and training neural networks with this data. While modest by today\\'s standards, these projects represented glimpses into a future where machines could meaningfully augment human capabilities in ways that seemed almost magical at the time.\\n\\nThe Pivot to Web and Enterprise Development\\n\\nAs often happens in technology careers, opportunities led me in a different direction. The explosive growth of web technologies and enterprise systems created a high demand for developers with these skills, and I found myself gradually pivoting away from AI.\\n\\nFor several years, I immersed myself in the world of web and enterprise software development. I worked with various frameworks and technologies, built scalable systems, and helped businesses solve complex problems through software. This journey taught me invaluable lessons about software architecture, user experience, and delivering production-quality code that serves real business needs.\\n\\nWorking in enterprise software development exposed me to the challenges of building systems that not only function correctly but can also scale, evolve, and adapt to changing requirements. I learned the importance of clean code, thoughtful architecture, and considering the entire lifecycle of software products.\\n\\nWhy I\\'m Returning to AI\\n\\nWhile my time in web and enterprise development was rewarding, I\\'ve always felt a pull back toward artificial intelligence. 
The recent AI renaissance - with breakthroughs in large language models, generative AI, and machine learning at scale - has reignited my original passion.\\n\\nWe\\'re living in what may be the most exciting time in AI history. Models like GPT-4, Claude, and open-source alternatives are demonstrating capabilities that seemed like science fiction just a few years ago. The tools and frameworks available today make AI more accessible than ever before, and the potential applications span virtually every domain of human endeavor.\\n\\nWhat excites me most is that my experience in enterprise software development gives me a unique perspective on AI implementation. I understand not just the algorithms and models, but also how to integrate them into robust, production-ready systems that deliver real value.\\n\\nThe Best of Both Worlds\\n\\nComing back to AI doesn\\'t mean leaving behind everything I learned in web and enterprise development. Quite the opposite - I believe my background gives me a particular advantage in building AI systems that are:\\n\\nProduction-ready: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n\\nUser-focused: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n\\nIntegrated: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n\\nSimplified: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n\\nBusiness-oriented: I understand that AI isn\\'t just about the technology—it\\'s about solving real business problems and creating measurable value.\\n\\nPractical: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\nWhat\\'s Next\\n\\nAs I return to my AI roots, I\\'m excited to share this journey with you through this blog. In the coming months, I plan to write about:\\n\\nPractical applications of modern AI technologies\\n\\nHow to bridge the gap between AI research and production systems\\n\\nThe intersection of web technologies and AI\\n\\nEthical considerations in AI implementation\\n\\nTutorials and guides for developers looking to incorporate AI into their projects\\n\\nIf you\\'re interested in AI, software development, or the intersection of these fields, I hope you\\'ll join me on this journey. Whether you\\'re a seasoned AI practitioner, a web developer curious about machine learning, or simply interested in how technology is evolving, I believe there\\'s something here for you.\\n\\nHere\\'s to coming full circle, building on past experiences, and embracing the exciting future of AI!\\n\\nHave questions or topics you\\'d like me to cover? Feel free to reach out — we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. 
Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 7: Integrations and Observability with Ragas\" date: 2025-04-30T07:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/integrations-and-observability.png\" readingTime: 12 published: true\\n\\nPart 6: Integrations and Observability with Ragas\\n\\nIn our previous post, we explored how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications. Now, let\\'s discuss how to integrate Ragas into your broader LLM development ecosystem and establish observability practices that transform evaluation from a one-time exercise into a continuous improvement cycle.\\n\\nWhy Integrations and Observability Matter\\n\\nEvaluation is most powerful when it\\'s:\\n\\nIntegrated into your existing workflow and tools\\n\\nAutomated to run consistently with minimal friction\\n\\nObservable so insights are easily accessible and actionable\\n\\nContinuous rather than a one-time or sporadic effort\\n\\nLet\\'s explore how Ragas helps you achieve these goals through its extensive integration capabilities.\\n\\nFramework Integrations\\n\\nRagas seamlessly connects with popular LLM application frameworks, allowing you to evaluate systems built with your preferred tools.\\n\\nLangChain Integration\\n\\nFor LangChain-based applications, Ragas provides dedicated integration support. Here’s how you can integrate Ragas step by step:\\n\\nPrepare your documents: Load your source documents and split them into manageable chunks for retrieval.\\n\\nSet up vector storage: Embed the document chunks and store them in a vector database to enable efficient retrieval.\\n\\nConfigure the retriever and QA chain: Use LangChain components to create a retriever and a question-answering (QA) chain powered by your chosen language model.\\n\\nGenerate a test set: Use Ragas to automatically generate a set of test questions and answers from your documents, or supply your own.\\n\\nEvaluate retrieval and QA performance: Apply Ragas metrics to assess both the retriever and the full QA chain, measuring aspects like context relevancy, faithfulness, and answer quality.\\n\\nReview results: Analyze the evaluation outputs to identify strengths and areas for improvement in your RAG pipeline.\\n\\nThis integration allows you to continuously measure and improve the effectiveness of your retrieval and generation components within the LangChain framework.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nRagas supports integration with a variety of popular LLM and RAG frameworks beyond LangChain, including LlamaIndex and Haystack. These integrations enable seamless evaluation of retrieval and generation components within your preferred stack. 
If you need guidance or code examples for integrating Ragas with platforms such as LlamaIndex, Haystack, or others, support and tailored examples can be provided on demand to fit your specific workflow and requirements.\\n\\nObservability Platform Integrations\\n\\nBeyond framework integrations, Ragas connects with leading observability platforms to help you monitor, track, and analyze evaluation results over time.\\n\\nLangSmith Integration\\n\\nFor LangChain users, LangSmith provides comprehensive tracing and evaluation. To integrate Ragas evaluation with LangSmith, follow these steps:\\n\\nSet up your environment\\n\\nUpload dataset to LangSmith\\n\\nDefine your LLM or chain\\n\\nSelect Ragas metrics\\n\\nRun evaluation with LangSmith\\n\\nYou can now view detailed experiment results in your LangSmith project dashboard. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within LangSmith, leveraging Ragas metrics for deeper insights.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nOther Platform Integrations\\n\\nRagas can be integrated with a range of observability and monitoring platforms beyond LangSmith, such as Langfuse and others. If you need help connecting Ragas to platforms like Langfuse or have specific requirements for your observability stack, tailored support and examples are available to fit your workflow.\\n\\nBuilding Automated Evaluation Pipelines\\n\\nTo ensure evaluation is a continuous part of your development process, set up automated pipelines that run evaluations regularly and automatically.\\n\\nCI/CD Integration\\n\\nYou can incorporate Ragas into your CI/CD pipeline so that every code change is automatically evaluated. This helps catch regressions early and ensures your RAG system maintains high performance before merging new changes.\\n\\nScheduled Evaluations\\n\\nRegularly scheduled evaluations allow you to monitor your system’s performance over time. By running evaluations at set intervals, you can track trends, spot regressions, and ensure your system continues to meet quality standards.\\n\\nMonitoring Evaluation Metrics Over Time\\n\\nTracking evaluation metrics over time helps you identify performance trends and quickly detect any drops in quality. By visualizing these metrics, you can better understand how changes to your system impact its effectiveness.\\n\\nCreating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. 
Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\nBest Practices for Observability\\n\\nDefine clear thresholds: Establish performance baselines and alert thresholds for each metric\\n\\nSegment evaluations: Break down results by query type, data source, or other relevant factors\\n\\nHistorical tracking: Maintain historical evaluation data to identify trends and regressions\\n\\nCorrelation analysis: Link evaluation metrics to user feedback and business outcomes\\n\\nRegular benchmarking: Periodically evaluate against fixed test sets to ensure consistency\\n\\nAlert on regressions: Implement automated alerts when metrics drop below thresholds\\n\\nContextualize metrics: Include example failures alongside aggregate metrics for better understanding\\n\\nBuilding a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:\\n\\nCapture evaluation results with Ragas\\n\\nIdentify patterns in failures and underperforming areas\\n\\nPrioritize improvements based on impact and effort\\n\\nImplement changes to your RAG components\\n\\nValidate improvements with focused re-evaluation\\n\\nMonitor continuously to catch regressions\\n\\nConclusion: From Evaluation to Action\\n\\nIntegrating Ragas with your frameworks and observability tools transforms evaluation from a point-in-time activity to a continuous improvement cycle. By making evaluation metrics visible, actionable, and integrated into your workflows, you create a foundation for systematic improvement of your LLM applications.\\n\\nThe most successful teams don\\'t just evaluate occasionally — they build evaluation into their development culture, making data-driven decisions based on objective metrics rather than subjective impressions.\\n\\nIn our final post, we\\'ll explore how to build effective feedback loops that translate evaluation insights into concrete improvements for your LLM applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability with Ragas — You are here Next up in the series: Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "['How to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. 
Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation']",
+ "me wanna know how LangSmith work with Ragas, like is it for test data or what, and how it help me as LLM app builder, can you tell me all about LangSmith from this info?",
+ "[\"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\"]",
+ "[\"### 🧪 Test Data Generation\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n### 🔗 Seamless Integrations\\nRagas works with popular LLM frameworks and tools:\\n- [LangChain](https://www.langchain.com/)\\n- [LlamaIndex](https://www.llamaindex.ai/)\\n- [Haystack](https://haystack.deepset.ai/)\\n- [OpenAI](https://openai.com/)\\n\\nObservability platforms \\n- [Phoenix](https://phoenix.arize.com/)\\n- [LangSmith](https://python.langchain.com/docs/introduction/)\\n- [Langfuse](https://www.langfuse.com/)\\n\\n### 📊 Comprehensive Analysis\\nBeyond simple scores, Ragas provides detailed insights into your application's strengths and weaknesses, enabling targeted improvements.\\n\\n## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere's a simple example of evaluating a response using Ragas:\"]",
"I don't know.",
- "OpenAIEmbeddings is used as an embedding model in the synthetic data generation process for RAG evaluation, as shown when initializing the generator with LangchainEmbeddingsWrapper(OpenAIEmbeddings()).",
+ "LangSmith is listed as one of the observability platforms that Ragas works with. The context does not provide details about how LangSmith specifically functions, but it shows that Ragas integrates with LangSmith to support observability in LLM-powered systems.",
"single_hop_specifc_query_synthesizer"
],
[
"4",
- "Wht are the key steps in the Ragas evalution workflow for RAG systems, and wich specialized evalution metrics can be selected to asess system performance?",
- "['title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. 
Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. 
Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true\\n\\nIn our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline.\\n\\nUnderstanding the Evaluation Workflow\\n\\nA typical Ragas evaluation workflow consists of four key steps:\\n\\nPrepare your data: Collect queries, contexts, responses, and reference answers\\n\\nSelect appropriate metrics: Choose metrics that align with what you want to evaluate\\n\\nRun the evaluation: Process your data through the selected metrics\\n\\nAnalyze the results: Interpret scores and identify areas for improvement\\n\\nLet\\'s walk through each step with practical examples.\\n\\nStep 1: Setting Up Your Environment\\n\\nFirst, ensure you have Ragas installed:\\n\\nbash uv add ragas\\n\\nNext, import the necessary components:\\n\\npython import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\\n\\nStep 2: Preparing Your Evaluation Data\\n\\nFor a RAG system evaluation, you\\'ll need:\\n\\nQuestions: User queries to your system\\n\\nContexts: Documents or chunks retrieved by your system\\n\\nResponses: Answers generated by your system\\n\\nGround truth (optional): Reference answers or documents for comparison\\n\\nHere\\'s how to organize this data:\\n\\n```python\\n\\nSample data\\n\\ndata = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. 
Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] }\\n\\neval_data = pd.DataFrame(data)\\n\\nConvert to a format Ragas can use\\n\\nevaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset\\n\\n```\\n\\nStep 3: Selecting and Configuring Metrics\\n\\nRagas offers various metrics to evaluate different aspects of your system:\\n\\nCore RAG Metrics:\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nFactual Correctness: Assesses if the response is accurate and free from factual errors.\\n\\nResponse Relevancy: Evaluates if the response directly addresses the user query.\\n\\nContext Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth.\\n\\nNoise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context.\\n\\nLLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response.\\n\\nFor metrics that require an LLM (like faithfulness), you need to configure the LLM provider:\\n\\n```python\\n\\nConfigure LLM for evaluation\\n\\nfrom langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nDefine metrics to use\\n\\nmetrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```\\n\\nStep 4: Running the Evaluation\\n\\nNow, run the evaluation with your selected metrics:\\n\\n```python\\n\\nRun evaluation\\n\\nresults = evaluate( evaluation_dataset, metrics=metrics, llm=evaluator_llm # Required for LLM-based metrics )\\n\\nView results\\n\\nprint(results) ```\\n\\nOutput:\\n\\nValues will vary based on your data and LLM performance.\\n\\npython { \"faithfulness\": 1.0000, \"factual_correctness\": 0.6750, \"answer_relevancy\": 0.9897, \"context_entity_recall\": 0.8889, \"noise_sensitivity_relevant\": 0.1667, \"context_recall\": 0.5000 }\\n\\nStep 5: Interpreting Results\\n\\nRagas metrics typically return scores between 0 and 1, where higher is better:\\n\\nUnderstanding Score Ranges:\\n\\n0.8-1.0: Excellent performance\\n\\n0.6-0.8: Good performance\\n\\n0.4-0.6: Moderate 
performance, needs improvement\\n\\n0.4 or lower: Poor performance, requires significant attention\\n\\nAdvanced Use: Custom Evaluation for Specific Examples\\n\\nFor more detailed analysis of specific examples:\\n\\n```python from ragas import SingleTurnSample from ragas.metrics import AspectCritic\\n\\nDefine a specific test case\\n\\ntest_data = { \"user_input\": \"What are quantum computers?\", \"response\": \"Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.\", \"retrieved_contexts\": [\"Quantum computing is a type of computation that harnesses quantum mechanical phenomena.\"] }\\n\\nCreate a custom evaluation metric\\n\\ncustom_metric = AspectCritic( name=\"quantum_accuracy\", llm=llm, definition=\"Verify if the explanation of quantum computing is accurate and complete.\" )\\n\\nScore the sample\\n\\nsample = SingleTurnSample(**test_data) score = await custom_metric.single_turn_ascore(sample) print(f\"Quantum accuracy score: {score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 02_Basic_Evaluation_Workflow_with_Ragas\\n\\nCommon Evaluation Patterns and Metrics\\n\\nBelow is a matrix mapping evaluation patterns to the metrics used, along with definitions for each metric:\\n\\nMetric Comprehensive RAG Evaluation Content Quality Evaluation Retrieval Quality Evaluation Faithfulness ✓ ✓ Answer Relevancy ✓ ✓ Context Recall ✓ ✓ Context Precision ✓ ✓ Harmfulness ✓ Coherence ✓ Context Relevancy ✓\\n\\nMetric Definitions\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nAnswer Relevancy: Assesses if the response addresses the question.\\n\\nContext Recall: Measures how well the retrieved context covers the information in the ground truth.\\n\\nContext Precision: Evaluates the proportion of relevant information in the retrieved context.\\n\\nHarmfulness: Evaluates if the response contains harmful or inappropriate content.\\n\\nCoherence: Measures the logical flow and clarity of the response.\\n\\nContext Relevancy: Evaluates if the retrieved context is relevant to the question.\\n\\nThis matrix provides a clear overview of which metrics to use for specific evaluation patterns and their respective definitions.\\n\\nBest Practices for Ragas Evaluation\\n\\nStart simple: Begin with core metrics before adding more specialized ones\\n\\nUse diverse test cases: Include a variety of questions, from simple to complex\\n\\nConsider edge cases: Test with queries that might challenge your system\\n\\nCompare versions: Track metrics across different versions of your application\\n\\nCombine with human evaluation: Use Ragas alongside human feedback for a comprehensive assessment\\n\\nConclusion\\n\\nSetting up a basic evaluation workflow with Ragas is straightforward yet powerful. 
By systematically evaluating your LLM applications, you gain objective insights into their performance and clear directions for improvement.\\n\\nIn our next post, we\\'ll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow — You are here Next up in the series: Part 3: Evaluating RAG Systems Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHave you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. 
For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. 
In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "['<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true In our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline. Understanding the Evaluation Workflow A typical Ragas evaluation workflow consists of four key steps: Prepare your data: Collect queries, contexts, responses, and reference answers Select appropriate metrics: Choose metrics that align with what you want to evaluate Run the evaluation: Process your data through the selected metrics Analyze the results: Interpret scores and identify areas for improvement Let\\'s walk through each step with practical examples. Step 1: Setting Up Your Environment First, ensure you have Ragas installed: bash uv add ragas Next, import the necessary components: python import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity Step 2: Preparing Your Evaluation Data For a RAG system evaluation, you\\'ll need: Questions: User queries to your system Contexts: Documents or chunks retrieved by your system Responses: Answers generated by your system Ground truth (optional): Reference answers or documents for comparison Here\\'s how to organize this data: ```python Sample data data = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] } eval_data = pd.DataFrame(data) Convert to a format Ragas can use evaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset ``` Step 3: Selecting and Configuring Metrics Ragas offers various metrics to evaluate different aspects of your system: Core RAG Metrics: Faithfulness: Measures if the response is factually consistent with the provided context. Factual Correctness: Assesses if the response is accurate and free from factual errors. Response Relevancy: Evaluates if the response directly addresses the user query. Context Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth. Noise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context. LLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response. For metrics that require an LLM (like faithfulness), you need to configure the LLM provider: ```python Configure LLM for evaluation from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper Initialize the LLM, you are going to OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Define metrics to use metrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```', '<2-hop>\\n\\ntitle: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and']",
- "### Key Steps in the Ragas Evaluation Workflow for RAG Systems\n\n1. **Prepare Your Data**: Collect user queries, retrieved contexts, generated responses, and reference answers.\n2. **Select Appropriate Metrics**: Choose metrics that align with the evaluation goals.\n3. **Run the Evaluation**: Process the data through the selected metrics.\n4. **Analyze the Results**: Interpret the scores to identify areas for improvement.\n\n### Specialized Evaluation Metrics for RAG Systems\n\n- **Retrieval Quality Metrics**:\n - **Context Relevancy**: Measures the relevance of retrieved documents to the user's question.\n - **Context Precision**: Assesses how much of the retrieved context is useful for generating the answer.\n - **Context Recall**: Evaluates whether all necessary information from the ground truth answer is present in the retrieved context.\n\n- **Generation Quality Metrics**:\n - **Faithfulness**: Checks if the generated answer is factually consistent with the retrieved context.\n - **Answer Relevancy**: Measures if the generated answer directly addresses the user's question.\n\n- **End-to-End Performance Metrics**:\n - **Correctness**: Assesses factual alignment between the generated answer and a ground truth reference.\n\nFor more detailed information, you can explore the following links:\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Basic Evaluation Workflow With Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)",
- "The key steps in the Ragas evaluation workflow for RAG systems include preparing your data (collecting queries, contexts, responses, and reference answers), selecting appropriate metrics that align with your evaluation goals, running the evaluation by processing your data through the selected metrics, and analyzing the results to interpret scores and identify areas for improvement. Specialized evaluation metrics offered by Ragas include Faithfulness (measuring factual consistency with context), Factual Correctness (assessing accuracy and freedom from factual errors), Response Relevancy (evaluating if the response addresses the user query), Context Entity Recall (measuring how well the retrieved context captures relevant entities), Noise Sensitivity (assessing robustness to irrelevant context), and LLM Context Recall (evaluating how effectively the LLM uses the provided context to generate the response).",
- "multi_hop_abstract_query_synthesizer"
+ "How do I use the OPENAI API key when initializing an LLM for evaluation with Ragas?",
+ "[\"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n \"user_input\": \"What is the capital of France?\",\\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor', '## Implementing Agent Evaluation in Practice\\n\\nLet\\'s look at a practical example of evaluating an AI agent using these metrics:\\n\\n```python\\nfrom ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import MultiTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\n\\n# Initialize the LLM\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))']",
+ "['## Getting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\n```bash\\nuv init && uv add ragas\\n```\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python\\nfrom ragas.metrics import Faithfulness\\nfrom ragas.evaluation import EvaluationDataset\\nfrom ragas.dataset_schema import SingleTurnSample\\nfrom langchain_openai import ChatOpenAI\\nfrom ragas.llms import LangchainLLMWrapper\\nfrom langchain_openai import ChatOpenAI\\n\\n# Initialize the LLM, you are going to new OPENAI API key\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) \\n\\n# Your evaluation data\\ntest_data = {\\n \"user_input\": \"What is the capital of France?\",\\n \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"],\\n \"response\": \"The capital of France is Paris.\"\\n}\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor']",
+ "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you can follow this example:\n\n```python\nfrom langchain_openai import ChatOpenAI\nfrom ragas.llms import LangchainLLMWrapper\n\n# Initialize the LLM with your OPENAI API key\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\n```\n\nMake sure to replace `\"gpt-4o\"` with the appropriate model you wish to use. You will need to set your API key in your environment or configuration as required by the `ChatOpenAI` class.\n\nFor more details, you can refer to the following links:\n- [Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Part 5: Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
+ "To use the OPENAI API key when initializing an LLM for evaluation with Ragas, you need to provide your OPENAI API key during the initialization of the ChatOpenAI model, as shown in the example: evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")).",
+ "single_hop_specifc_query_synthesizer"
],
[
"5",
- "How does Ragas facilitate both test data generation and synthetic data generation for evaluating Retrieval-Augmented Generation (RAG) systems, and what are the key steps and tools involved in creating robust synthetic test datasets as described in the blog series?",
- "['title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. 
Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true\\n\\nIn our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications.\\n\\nWhy and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith.\\n\\nWhy Generate Synthetic Data?\\n\\nEarly Signal, Fast Iteration Real-world data is often scarce or expensive to label. 
Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\nControlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.\\n\\nBenchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts).\\n\\nHow to Generate Synthetic Data\\n\\n1. Prepare Your Source Data\\n\\nStart with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader.\\n\\n2. Build a Knowledge Graph\\n\\nUse RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. The default transformations RAGAS applies depend on the corpus length; here are some examples:\\n\\nProducing Summaries -> produces summaries of the documents\\n\\nExtracting Headlines -> finds the overall headline for the document\\n\\nTheme Extractor -> extracts broad themes about the documents\\n\\nIt then uses cosine similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries.\\n\\n3. Configure Query Synthesizers\\n\\nRAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents.\\n\\nBy mixing these, you get a diverse and challenging test set.\\n\\n4. Generate the Test Set\\n\\nWith your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts.\\n\\n5. Evaluate and Iterate\\n\\nLoad your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements.\\n\\nMinimal Example\\n\\nHere’s a high-level pseudocode outline (see the notebook for full details):\\n\\n````python\\n\\n1. Load documents\\n\\nfrom langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load()\\n\\n2. 
Generate data\\n\\nfrom ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings\\n\\nInitialize the generator with the LLM and embedding model\\n\\ngenerator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())\\n\\nCreate the test set generator\\n\\ngenerator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ````\\n\\ndataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation\\n\\nUnderstanding the Generated Dataset Columns\\n\\nThe synthetic dataset generated by Ragas typically includes the following columns:\\n\\nuser_input: The generated question or query that simulates what a real user might ask. This is the prompt your RAG system will attempt to answer.\\n\\nreference_contexts: A list of document snippets or passages that contain the information needed to answer the user_input. These serve as the ground truth retrieval targets.\\n\\nreference: The ideal answer to the user_input, based strictly on the reference_contexts. This is used as the gold standard for evaluating answer accuracy.\\n\\nsynthesizer_name: The name of the query synthesizer (e.g., SingleHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer) that generated the question. This helps track the type and complexity of each test case.\\n\\nThese columns enable comprehensive evaluation by linking each question to its supporting evidence and expected answer, while also providing insight into the diversity and difficulty of the generated queries.\\n\\nDeep Dive into Test Data Generation\\n\\nSo you have a collection of documents and want to create a robust evaluation dataset for your RAG system using Ragas. The TestsetGenerator\\'s generate_with_langchain_docs method is your starting point. But what exactly happens when you call it? Let\\'s peek under the hood.\\n\\nThe Goal: To take raw Langchain Document objects and transform them into a structured Ragas Testset containing diverse question-answer pairs grounded in those documents.\\n\\nThe Workflow:\\n\\nInput & Validation: The function receives your Langchain documents, the desired testset_size, and optional configurations for transformations and query types. It first checks if it has the necessary LLM and embedding models to proceed (either provided during TestsetGenerator initialization or passed directly to this method).\\n\\nSetting Up Transformations: This is a crucial step.\\n\\nUser-Provided: If you pass a specific transforms configuration, the generator uses that.\\n\\nDefault Transformations: If you don\\'t provide transforms, the generator calls ragas.testset.transforms.default_transforms. This sets up a standard pipeline to process your raw documents into a usable knowledge graph foundation. We\\'ll detail this below.\\n\\nDocument Conversion: Your Langchain Document objects are converted into Ragas\\' internal Node representation, specifically NodeType.DOCUMENT. 
Each node holds the page_content and metadata.\\n\\nInitial Knowledge Graph: A KnowledgeGraph object is created, initially containing just these document nodes.\\n\\nApplying Transformations: The core processing happens here using ragas.testset.transforms.apply_transforms. The chosen transforms (default or custom) are executed sequentially on the KnowledgeGraph. This modifies the graph by:\\n\\nAdding new nodes (e.g., chunks, questions, answers).\\n\\nAdding relationships between nodes (e.g., linking a question to the chunk it came from). The generator\\'s internal knowledge_graph attribute is updated with this processed graph.\\n\\nDelegation to generate(): Now that the foundational knowledge graph with basic Q&A pairs is built (thanks to transformations), generate_with_langchain_docs calls the main self.generate() method. This method handles the final step of creating the diverse test samples.\\n\\nSpotlight: Default Transformations (default_transforms)\\n\\nWhen you don\\'t specify custom transformations, Ragas applies a sensible default pipeline to prepare your documents:\\n\\nChunking (SentenceChunker): Breaks down your large documents into smaller, more manageable chunks (often sentences or groups of sentences). This is essential for focused retrieval and question generation.\\n\\nEmbedding: Generates vector embeddings for each chunk using the provided embedding model. These are needed for similarity-based operations.\\n\\nFiltering (SimilarityFilter, InformationFilter): Removes redundant chunks (those too similar to others) and potentially low-information chunks to clean up the knowledge base.\\n\\nBase Q&A Generation (QAGenerator): This is where the initial, simple question-answer pairs are created. The generator looks at individual (filtered) chunks and uses an LLM to formulate straightforward questions whose answers are directly present in that chunk.\\n\\nEssentially, the default transformations build a knowledge graph populated with embedded, filtered document chunks and corresponding simple, extractive question-answer pairs.\\n\\nSpotlight: Query Synthesizers (via self.generate() and default_query_distribution)\\n\\nThe self.generate() method, called by generate_with_langchain_docs, is responsible for taking the foundational graph and creating the final, potentially complex, test questions using Query Synthesizers (also referred to as \"evolutions\" or \"scenarios\").\\n\\nQuery Distribution: self.generate() uses a query_distribution parameter. If you don\\'t provide one, it calls ragas.testset.synthesizers.default_query_distribution.\\n\\nDefault Synthesizers: This default distribution defines a mix of different synthesizer types and the probability of using each one. Common defaults include:\\n\\nsimple: Takes the base Q&A pairs generated during transformation and potentially rephrases them slightly.\\n\\nreasoning: Creates questions requiring logical inference based on the context in the graph.\\n\\nmulti_context: Generates questions needing information synthesized from multiple different chunks/nodes in the graph.\\n\\nconditional: Creates questions with \"if/then\" clauses based on information in the graph.\\n\\nGeneration Process: self.generate() calculates how many questions of each type to create based on the testset_size and the distribution probabilities. 
It then uses an Executor to run the appropriate synthesizers, generating the final TestsetSample objects that make up your evaluation dataset.\\n\\nIn Summary:\\n\\ngenerate_with_langchain_docs orchestrates a two-phase process:\\n\\nTransformation Phase: Uses (typically default) transformations like chunking, filtering, and base Q&A generation to build a foundational knowledge graph from your documents.\\n\\nSynthesis Phase (via self.generate): Uses (typically default) query synthesizers/evolutions (simple, reasoning, multi_context, etc.) to create diverse and complex questions based on the information stored in the transformed knowledge graph.\\n\\nThis automated pipeline allows you to go from raw documents to a rich, multi-faceted evaluation dataset with minimal configuration.\\n\\nBest Practices for Test Data Generation\\n\\nStart small and iterate: Begin with a small test set to verify quality before scaling up\\n\\nDiversify document sources: Include different document types, styles, and domains\\n\\nBalance question types: Ensure coverage of simple, complex, and edge-case scenarios\\n\\nManual review: Sample-check generated questions for quality and relevance\\n\\nProgressive difficulty: Include both easy and challenging questions to identify performance thresholds\\n\\nDocument metadata: Retain information about test case generation for later analysis\\n\\nVersion control: Track test set versions alongside your application versions\\n\\nConclusion: Building a Test Data Generation Strategy\\n\\nTest data generation should be an integral part of your LLM application development cycle:\\n\\nInitial development: Generate broad test sets to identify general capabilities and limitations\\n\\nRefinement: Create targeted test sets for specific features or improvements\\n\\nRegression testing: Maintain benchmark test sets to ensure changes don\\'t break existing functionality\\n\\nContinuous improvement: Generate new test cases as your application evolves\\n\\nBy leveraging Ragas for automated test data generation, you can build comprehensive evaluation datasets that thoroughly exercise your LLM applications, leading to more robust, reliable systems.\\n\\nIn our next post, we\\'ll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation — You are here Next up in the series: Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. 
Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM; you will need an OpenAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 7: Integrations and Observability with Ragas\" date: 2025-04-30T07:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/integrations-and-observability.png\" readingTime: 12 published: true\\n\\nPart 6: Integrations and Observability with Ragas\\n\\nIn our previous post, we explored how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications. Now, let\\'s discuss how to integrate Ragas into your broader LLM development ecosystem and establish observability practices that transform evaluation from a one-time exercise into a continuous improvement cycle.\\n\\nWhy Integrations and Observability Matter\\n\\nEvaluation is most powerful when it\\'s:\\n\\nIntegrated into your existing workflow and tools\\n\\nAutomated to run consistently with minimal friction\\n\\nObservable so insights are easily accessible and actionable\\n\\nContinuous rather than a one-time or sporadic effort\\n\\nLet\\'s explore how Ragas helps you achieve these goals through its extensive integration capabilities.\\n\\nFramework Integrations\\n\\nRagas seamlessly connects with popular LLM application frameworks, allowing you to evaluate systems built with your preferred tools.\\n\\nLangChain Integration\\n\\nFor LangChain-based applications, Ragas provides dedicated integration support. Here’s how you can integrate Ragas step by step:\\n\\nPrepare your documents: Load your source documents and split them into manageable chunks for retrieval.\\n\\nSet up vector storage: Embed the document chunks and store them in a vector database to enable efficient retrieval.\\n\\nConfigure the retriever and QA chain: Use LangChain components to create a retriever and a question-answering (QA) chain powered by your chosen language model.\\n\\nGenerate a test set: Use Ragas to automatically generate a set of test questions and answers from your documents, or supply your own.\\n\\nEvaluate retrieval and QA performance: Apply Ragas metrics to assess both the retriever and the full QA chain, measuring aspects like context relevancy, faithfulness, and answer quality.\\n\\nReview results: Analyze the evaluation outputs to identify strengths and areas for improvement in your RAG pipeline.\\n\\nThis integration allows you to continuously measure and improve the effectiveness of your retrieval and generation components within the LangChain framework.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nRagas supports integration with a variety of popular LLM and RAG frameworks beyond LangChain, including LlamaIndex and Haystack. These integrations enable seamless evaluation of retrieval and generation components within your preferred stack. 
If you need guidance or code examples for integrating Ragas with platforms such as LlamaIndex, Haystack, or others, support and tailored examples can be provided on demand to fit your specific workflow and requirements.\\n\\nObservability Platform Integrations\\n\\nBeyond framework integrations, Ragas connects with leading observability platforms to help you monitor, track, and analyze evaluation results over time.\\n\\nLangSmith Integration\\n\\nFor LangChain users, LangSmith provides comprehensive tracing and evaluation. To integrate Ragas evaluation with LangSmith, follow these steps:\\n\\nSet up your environment\\n\\nUpload dataset to LangSmith\\n\\nDefine your LLM or chain\\n\\nSelect Ragas metrics\\n\\nRun evaluation with LangSmith\\n\\nYou can now view detailed experiment results in your LangSmith project dashboard. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within LangSmith, leveraging Ragas metrics for deeper insights.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nOther Platform Integrations\\n\\nRagas can be integrated with a range of observability and monitoring platforms beyond LangSmith, such as Langfuse and others. If you need help connecting Ragas to platforms like Langfuse or have specific requirements for your observability stack, tailored support and examples are available to fit your workflow.\\n\\nBuilding Automated Evaluation Pipelines\\n\\nTo ensure evaluation is a continuous part of your development process, set up automated pipelines that run evaluations regularly and automatically.\\n\\nCI/CD Integration\\n\\nYou can incorporate Ragas into your CI/CD pipeline so that every code change is automatically evaluated. This helps catch regressions early and ensures your RAG system maintains high performance before merging new changes.\\n\\nScheduled Evaluations\\n\\nRegularly scheduled evaluations allow you to monitor your system’s performance over time. By running evaluations at set intervals, you can track trends, spot regressions, and ensure your system continues to meet quality standards.\\n\\nMonitoring Evaluation Metrics Over Time\\n\\nTracking evaluation metrics over time helps you identify performance trends and quickly detect any drops in quality. By visualizing these metrics, you can better understand how changes to your system impact its effectiveness.\\n\\nCreating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. 
Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\nBest Practices for Observability\\n\\nDefine clear thresholds: Establish performance baselines and alert thresholds for each metric\\n\\nSegment evaluations: Break down results by query type, data source, or other relevant factors\\n\\nHistorical tracking: Maintain historical evaluation data to identify trends and regressions\\n\\nCorrelation analysis: Link evaluation metrics to user feedback and business outcomes\\n\\nRegular benchmarking: Periodically evaluate against fixed test sets to ensure consistency\\n\\nAlert on regressions: Implement automated alerts when metrics drop below thresholds\\n\\nContextualize metrics: Include example failures alongside aggregate metrics for better understanding\\n\\nBuilding a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:\\n\\nCapture evaluation results with Ragas\\n\\nIdentify patterns in failures and underperforming areas\\n\\nPrioritize improvements based on impact and effort\\n\\nImplement changes to your RAG components\\n\\nValidate improvements with focused re-evaluation\\n\\nMonitor continuously to catch regressions\\n\\nConclusion: From Evaluation to Action\\n\\nIntegrating Ragas with your frameworks and observability tools transforms evaluation from a point-in-time activity to a continuous improvement cycle. By making evaluation metrics visible, actionable, and integrated into your workflows, you create a foundation for systematic improvement of your LLM applications.\\n\\nThe most successful teams don\\'t just evaluate occasionally — they build evaluation into their development culture, making data-driven decisions based on objective metrics rather than subjective impressions.\\n\\nIn our final post, we\\'ll explore how to build effective feedback loops that translate evaluation insights into concrete improvements for your LLM applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability with Ragas — You are here Next up in the series: Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
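The CI/CD and regression-alerting practices described just above can be made concrete with a small gate script. The sketch below is not from the notebook: it assumes an `OPENAI_API_KEY` in the environment, uses one illustrative sample, and the threshold values are placeholders you would replace with your own baselines.

```python
# Minimal CI gate sketch: run a small fixed evaluation set through Ragas and
# fail the build if any aggregate score drops below a threshold.
# Assumptions: OPENAI_API_KEY is set; the sample row and thresholds are illustrative.
import sys

from langchain_openai import ChatOpenAI
from ragas import EvaluationDataset, evaluate
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness, LLMContextRecall

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

samples = [
    SingleTurnSample(
        user_input="What is the capital of France?",
        retrieved_contexts=["Paris is the capital and most populous city of France."],
        response="The capital of France is Paris.",
        reference="Paris is the capital of France.",
    )
]
dataset = EvaluationDataset(samples=samples)

results = evaluate(dataset, metrics=[Faithfulness(), LLMContextRecall()], llm=evaluator_llm)
scores = results.to_pandas()  # one column per metric, one row per sample

# Illustrative thresholds; tune these to your own baselines.
thresholds = {"faithfulness": 0.80, "context_recall": 0.70}
means = {name: float(scores[name].mean()) for name in thresholds}
failures = {name: mean for name, mean in means.items() if mean < thresholds[name]}

if failures:
    print(f"Evaluation gate failed: {failures}")
    sys.exit(1)
print("Evaluation gate passed.")
```

Run as a standalone script (or wrapped in a pytest test) in CI, this gives you the "alert on regressions" behavior described above.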
- "['<1-hop>\\n\\ntitle: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and', '<2-hop>\\n\\nHow to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. 
Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation', \"<3-hop>\\n\\nWhat's Coming in This Blog Series This introduction is just the beginning. In the upcoming posts, we'll dive deeper into all aspects of evaluating LLM applications with Ragas: Part 2: Basic Evaluation Workflow We'll explore each metric in detail, explaining when and how to use them effectively. Part 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance. Part 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application's capabilities. Part 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments. Part 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. Part 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows. Part 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications. Conclusion In a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications. Ready to Elevate Your LLM Applications? Start exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to reach out—we'd love to help!\"]",
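To make the synthesizer mix described above explicit, you can pass your own query distribution instead of relying on the default. A minimal sketch, assuming the Ragas 0.2-style API (in particular, that `generate_with_langchain_docs` accepts a `query_distribution` argument and that the synthesizers take an `llm` keyword); the weights and test set size are illustrative.

```python
# Sketch: explicitly weighting the three query synthesizers mentioned above.
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import (
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
    SingleHopSpecificQuerySynthesizer,
)

docs = DirectoryLoader("data/", glob="*.md").load()
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Each entry is (synthesizer, probability); the probabilities should sum to 1.
query_distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 0.5),
    (MultiHopAbstractQuerySynthesizer(llm=generator_llm), 0.25),
    (MultiHopSpecificQuerySynthesizer(llm=generator_llm), 0.25),
]

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=10,
    query_distribution=query_distribution,
)
```

Weighting single-hop questions more heavily keeps most of the set grounded in individual chunks while still exercising multi-hop reasoning.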
- "Ragas facilitates both test data generation and synthetic data generation for evaluating Retrieval-Augmented Generation (RAG) systems by providing a structured framework to create robust test datasets. Here are the key steps and tools involved in creating these datasets:\n\n### Key Steps in Creating Robust Synthetic Test Datasets\n\n1. **Prepare Your Source Data**: Start with relevant documents, which can be loaded into a document format using tools like LangChain’s DirectoryLoader.\n\n2. **Build a Knowledge Graph**: Use Ragas to convert documents into a knowledge graph that captures entities, relationships, and summaries. This is crucial for generating meaningful queries.\n\n3. **Configure Query Synthesizers**: Ragas offers various query synthesizers, such as:\n - **SingleHopSpecificQuerySynthesizer**: Generates direct, fact-based questions.\n - **MultiHopAbstractQuerySynthesizer**: Creates broader, multi-step reasoning questions.\n - **MultiHopSpecificQuerySynthesizer**: Focuses on questions that require connecting specific entities across documents.\n\n4. **Generate the Test Set**: Utilize Ragas’s `TestsetGenerator` to create a synthetic dataset that includes questions, reference answers, and supporting contexts.\n\n5. **Evaluate and Iterate**: Load the synthetic dataset into an evaluation platform like LangSmith, run your RAG pipeline against the test set, and use automated evaluators to identify strengths and weaknesses.\n\n### Tools Involved\n- **Ragas**: The primary framework for evaluation and data generation.\n- **LangChain**: A tool for loading and processing documents.\n- **LangSmith**: An evaluation platform for analyzing results.\n\nFor more detailed information, you can refer to the following blog posts:\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Generating Test Data With Ragas](https://thedataguy.pro/blog/generating-test-data-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)",
- "Ragas facilitates test data generation and synthetic data generation for evaluating Retrieval-Augmented Generation (RAG) systems by providing a structured workflow and specialized tools. According to the blog series, high-quality test datasets are essential for meaningful evaluation of LLM applications. Ragas enables the creation of robust test datasets by supporting document-based, domain-specific, and adversarial test generation techniques (<1-hop>). For synthetic data generation, Ragas allows developers to quickly create test sets that mimic real user queries and contexts, which is especially useful when real-world data is scarce or expensive to label. The process involves several key steps: preparing source documents, building a knowledge graph using Ragas (which captures entities, relationships, and summaries), and configuring query synthesizers such as SingleHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer, and MultiHopSpecificQuerySynthesizer to generate diverse and challenging questions. The TestsetGenerator in Ragas then creates a synthetic dataset containing questions, reference answers, and supporting contexts. This synthetic dataset can be loaded into evaluation platforms like LangSmith for automated assessment and iterative improvement of the RAG pipeline (<2-hop>). The blog series further outlines that these practices are part of a comprehensive approach to evaluating LLM applications, with future posts covering advanced evaluation techniques and feedback loops for continuous improvement (<3-hop>).",
+ "How does synthetic data generation contribute to the importance of data quantity in improving RAG evaluation, and what paradigm shift does this represent according to Peter Norvig's perspective?",
+ "[\"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '| **Metric** | **Comprehensive RAG Evaluation** | **Content Quality Evaluation** | **Retrieval Quality Evaluation** |\\n|-----------------------------|----------------------------------|---------------------------------|-----------------------------------|\\n| **Faithfulness** | ✓ | ✓ | |\\n| **Answer Relevancy** | ✓ | ✓ | |\\n| **Context Recall** | ✓ | | ✓ |\\n| **Context Precision** | ✓ | | ✓ |\\n| **Harmfulness** | | ✓ | |', '| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', \"**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe'll explore each metric in detail, explaining when and how to use them effectively.\\n\\n**[Part 3: Evaluating RAG Systems](/blog/evaluating-rag-systems-with-ragas/)** \\nLearn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\nDiscover how to create high-quality test datasets that thoroughly exercise your application's capabilities.\\n\\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\nGo beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\nLearn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\"]",
+ "['<1-hop>\\n\\n## Why and How to Generate Synthetic Data for RAG Evaluation\\n\\nIn the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, **synthetic data generation** is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like [RAGAS](https://github.com/explodinggradients/ragas) and [LangSmith](https://smith.langchain.com/).\\n\\n---\\n\\n### Why Generate Synthetic Data?\\n\\n1. **Early Signal, Fast Iteration** \\n Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production.\\n\\n2. **Controlled Complexity** \\n You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases.', '<2-hop>\\n\\n## The Origin of \"Data is King\"\\n\\nPeter Norvig famously stated, \"We don\\'t have better algorithms. We just have more data.\" This statement emerged during a time when Google\\'s approach to machine translation was yielding surprisingly effective results not through algorithmic innovations, but through the sheer volume of multilingual data they had amassed. \\n\\nThis perspective represented a paradigm shift. Prior to this, the field had largely focused on crafting ever more sophisticated algorithms, with the assumption that smarter code would yield better results. Norvig\\'s insight suggested something different: even relatively simple algorithms could outperform more sophisticated ones when trained on sufficiently large datasets.\\n\\n## The Business Imperative of Data Ownership']",
+ "I don't know.",
+ "Synthetic data generation enables rapid creation of test sets that mimic real user queries and contexts, allowing for early evaluation and iteration of RAG systems even when real-world data is scarce or expensive to label. This approach increases the quantity of data available for evaluation, supporting robust system development. According to Peter Norvig's perspective, as described in the context, the importance of data quantity represents a paradigm shift: rather than relying solely on more sophisticated algorithms, having more data—even with simpler algorithms—can lead to better results. Thus, synthetic data generation aligns with this shift by providing the large datasets necessary to improve system performance.",
"multi_hop_abstract_query_synthesizer"
],
[
"6",
- "Wht speshulized evalushun metrix does Ragas provied for LLMs, and how do you selekt and configure these metrix in a basic evalushun workflow?",
- "['title: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true\\n\\nIn our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline.\\n\\nUnderstanding the Evaluation Workflow\\n\\nA typical Ragas evaluation workflow consists of four key steps:\\n\\nPrepare your data: Collect queries, contexts, responses, and reference answers\\n\\nSelect appropriate metrics: Choose metrics that align with what you want to evaluate\\n\\nRun the evaluation: Process your data through the selected metrics\\n\\nAnalyze the results: Interpret scores and identify areas for improvement\\n\\nLet\\'s walk through each step with practical examples.\\n\\nStep 1: Setting Up Your Environment\\n\\nFirst, ensure you have Ragas installed:\\n\\nbash uv add ragas\\n\\nNext, import the necessary components:\\n\\npython import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\\n\\nStep 2: Preparing Your Evaluation Data\\n\\nFor a RAG system evaluation, you\\'ll need:\\n\\nQuestions: User queries to your system\\n\\nContexts: Documents or chunks retrieved by your system\\n\\nResponses: Answers generated by your system\\n\\nGround truth (optional): Reference answers or documents for comparison\\n\\nHere\\'s how to organize this data:\\n\\n```python\\n\\nSample data\\n\\ndata = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] }\\n\\neval_data = pd.DataFrame(data)\\n\\nConvert to a format Ragas can use\\n\\nevaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset\\n\\n```\\n\\nStep 3: Selecting and Configuring Metrics\\n\\nRagas offers various metrics to evaluate different aspects of your system:\\n\\nCore RAG Metrics:\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nFactual Correctness: Assesses if the response is accurate and free from factual errors.\\n\\nResponse Relevancy: Evaluates if the response directly addresses the user query.\\n\\nContext Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth.\\n\\nNoise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context.\\n\\nLLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response.\\n\\nFor metrics that require an LLM (like faithfulness), you need to configure the LLM provider:\\n\\n```python\\n\\nConfigure LLM for evaluation\\n\\nfrom langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nDefine metrics to use\\n\\nmetrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```\\n\\nStep 4: Running the Evaluation\\n\\nNow, run the evaluation with your selected metrics:\\n\\n```python\\n\\nRun evaluation\\n\\nresults = evaluate( evaluation_dataset, metrics=metrics, llm=evaluator_llm # Required for LLM-based metrics )\\n\\nView results\\n\\nprint(results) ```\\n\\nOutput:\\n\\nValues will vary based on your data and LLM performance.\\n\\npython { \"faithfulness\": 1.0000, \"factual_correctness\": 0.6750, \"answer_relevancy\": 0.9897, \"context_entity_recall\": 0.8889, \"noise_sensitivity_relevant\": 0.1667, \"context_recall\": 0.5000 }\\n\\nStep 5: Interpreting Results\\n\\nRagas metrics typically return scores between 0 and 1, where higher is better:\\n\\nUnderstanding Score Ranges:\\n\\n0.8-1.0: Excellent performance\\n\\n0.6-0.8: Good performance\\n\\n0.4-0.6: Moderate performance, needs improvement\\n\\n0.4 or lower: Poor performance, requires significant attention\\n\\nAdvanced Use: Custom Evaluation for Specific Examples\\n\\nFor more detailed analysis of specific examples:\\n\\n```python from ragas import SingleTurnSample from ragas.metrics import AspectCritic\\n\\nDefine a specific test case\\n\\ntest_data = { \"user_input\": \"What are quantum computers?\", \"response\": \"Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.\", \"retrieved_contexts\": [\"Quantum computing is a type of computation that harnesses quantum mechanical phenomena.\"] }\\n\\nCreate a custom evaluation metric\\n\\ncustom_metric = AspectCritic( name=\"quantum_accuracy\", llm=llm, definition=\"Verify if the explanation of quantum computing is accurate and complete.\" )\\n\\nScore the sample\\n\\nsample = SingleTurnSample(**test_data) score = await custom_metric.single_turn_ascore(sample) print(f\"Quantum accuracy score: {score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 
02_Basic_Evaluation_Workflow_with_Ragas\\n\\nCommon Evaluation Patterns and Metrics\\n\\nBelow is a matrix mapping evaluation patterns to the metrics used, along with definitions for each metric:\\n\\nMetric Comprehensive RAG Evaluation Content Quality Evaluation Retrieval Quality Evaluation Faithfulness ✓ ✓ Answer Relevancy ✓ ✓ Context Recall ✓ ✓ Context Precision ✓ ✓ Harmfulness ✓ Coherence ✓ Context Relevancy ✓\\n\\nMetric Definitions\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nAnswer Relevancy: Assesses if the response addresses the question.\\n\\nContext Recall: Measures how well the retrieved context covers the information in the ground truth.\\n\\nContext Precision: Evaluates the proportion of relevant information in the retrieved context.\\n\\nHarmfulness: Evaluates if the response contains harmful or inappropriate content.\\n\\nCoherence: Measures the logical flow and clarity of the response.\\n\\nContext Relevancy: Evaluates if the retrieved context is relevant to the question.\\n\\nThis matrix provides a clear overview of which metrics to use for specific evaluation patterns and their respective definitions.\\n\\nBest Practices for Ragas Evaluation\\n\\nStart simple: Begin with core metrics before adding more specialized ones\\n\\nUse diverse test cases: Include a variety of questions, from simple to complex\\n\\nConsider edge cases: Test with queries that might challenge your system\\n\\nCompare versions: Track metrics across different versions of your application\\n\\nCombine with human evaluation: Use Ragas alongside human feedback for a comprehensive assessment\\n\\nConclusion\\n\\nSetting up a basic evaluation workflow with Ragas is straightforward yet powerful. By systematically evaluating your LLM applications, you gain objective insights into their performance and clear directions for improvement.\\n\\nIn our next post, we\\'ll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow — You are here Next up in the series: Part 3: Evaluating RAG Systems Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHave you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. 
Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 7: Integrations and Observability with Ragas\" date: 2025-04-30T07:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/integrations-and-observability.png\" readingTime: 12 published: true\\n\\nPart 6: Integrations and Observability with Ragas\\n\\nIn our previous post, we explored how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications. Now, let\\'s discuss how to integrate Ragas into your broader LLM development ecosystem and establish observability practices that transform evaluation from a one-time exercise into a continuous improvement cycle.\\n\\nWhy Integrations and Observability Matter\\n\\nEvaluation is most powerful when it\\'s:\\n\\nIntegrated into your existing workflow and tools\\n\\nAutomated to run consistently with minimal friction\\n\\nObservable so insights are easily accessible and actionable\\n\\nContinuous rather than a one-time or sporadic effort\\n\\nLet\\'s explore how Ragas helps you achieve these goals through its extensive integration capabilities.\\n\\nFramework Integrations\\n\\nRagas seamlessly connects with popular LLM application frameworks, allowing you to evaluate systems built with your preferred tools.\\n\\nLangChain Integration\\n\\nFor LangChain-based applications, Ragas provides dedicated integration support. Here’s how you can integrate Ragas step by step:\\n\\nPrepare your documents: Load your source documents and split them into manageable chunks for retrieval.\\n\\nSet up vector storage: Embed the document chunks and store them in a vector database to enable efficient retrieval.\\n\\nConfigure the retriever and QA chain: Use LangChain components to create a retriever and a question-answering (QA) chain powered by your chosen language model.\\n\\nGenerate a test set: Use Ragas to automatically generate a set of test questions and answers from your documents, or supply your own.\\n\\nEvaluate retrieval and QA performance: Apply Ragas metrics to assess both the retriever and the full QA chain, measuring aspects like context relevancy, faithfulness, and answer quality.\\n\\nReview results: Analyze the evaluation outputs to identify strengths and areas for improvement in your RAG pipeline.\\n\\nThis integration allows you to continuously measure and improve the effectiveness of your retrieval and generation components within the LangChain framework.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nRagas supports integration with a variety of popular LLM and RAG frameworks beyond LangChain, including LlamaIndex and Haystack. These integrations enable seamless evaluation of retrieval and generation components within your preferred stack. 
If you need guidance or code examples for integrating Ragas with platforms such as LlamaIndex, Haystack, or others, support and tailored examples can be provided on demand to fit your specific workflow and requirements.\\n\\nObservability Platform Integrations\\n\\nBeyond framework integrations, Ragas connects with leading observability platforms to help you monitor, track, and analyze evaluation results over time.\\n\\nLangSmith Integration\\n\\nFor LangChain users, LangSmith provides comprehensive tracing and evaluation. To integrate Ragas evaluation with LangSmith, follow these steps:\\n\\nSet up your environment\\n\\nUpload dataset to LangSmith\\n\\nDefine your LLM or chain\\n\\nSelect Ragas metrics\\n\\nRun evaluation with LangSmith\\n\\nYou can now view detailed experiment results in your LangSmith project dashboard. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within LangSmith, leveraging Ragas metrics for deeper insights.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nOther Platform Integrations\\n\\nRagas can be integrated with a range of observability and monitoring platforms beyond LangSmith, such as Langfuse and others. If you need help connecting Ragas to platforms like Langfuse or have specific requirements for your observability stack, tailored support and examples are available to fit your workflow.\\n\\nBuilding Automated Evaluation Pipelines\\n\\nTo ensure evaluation is a continuous part of your development process, set up automated pipelines that run evaluations regularly and automatically.\\n\\nCI/CD Integration\\n\\nYou can incorporate Ragas into your CI/CD pipeline so that every code change is automatically evaluated. This helps catch regressions early and ensures your RAG system maintains high performance before merging new changes.\\n\\nScheduled Evaluations\\n\\nRegularly scheduled evaluations allow you to monitor your system’s performance over time. By running evaluations at set intervals, you can track trends, spot regressions, and ensure your system continues to meet quality standards.\\n\\nMonitoring Evaluation Metrics Over Time\\n\\nTracking evaluation metrics over time helps you identify performance trends and quickly detect any drops in quality. By visualizing these metrics, you can better understand how changes to your system impact its effectiveness.\\n\\nCreating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. 
Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\nBest Practices for Observability\\n\\nDefine clear thresholds: Establish performance baselines and alert thresholds for each metric\\n\\nSegment evaluations: Break down results by query type, data source, or other relevant factors\\n\\nHistorical tracking: Maintain historical evaluation data to identify trends and regressions\\n\\nCorrelation analysis: Link evaluation metrics to user feedback and business outcomes\\n\\nRegular benchmarking: Periodically evaluate against fixed test sets to ensure consistency\\n\\nAlert on regressions: Implement automated alerts when metrics drop below thresholds\\n\\nContextualize metrics: Include example failures alongside aggregate metrics for better understanding\\n\\nBuilding a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:\\n\\nCapture evaluation results with Ragas\\n\\nIdentify patterns in failures and underperforming areas\\n\\nPrioritize improvements based on impact and effort\\n\\nImplement changes to your RAG components\\n\\nValidate improvements with focused re-evaluation\\n\\nMonitor continuously to catch regressions\\n\\nConclusion: From Evaluation to Action\\n\\nIntegrating Ragas with your frameworks and observability tools transforms evaluation from a point-in-time activity to a continuous improvement cycle. By making evaluation metrics visible, actionable, and integrated into your workflows, you create a foundation for systematic improvement of your LLM applications.\\n\\nThe most successful teams don\\'t just evaluate occasionally — they build evaluation into their development culture, making data-driven decisions based on objective metrics rather than subjective impressions.\\n\\nIn our final post, we\\'ll explore how to build effective feedback loops that translate evaluation insights into concrete improvements for your LLM applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability with Ragas — You are here Next up in the series: Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
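The excerpt above walks through defining the custom `TechnicalAccuracy` metric, scoring a single sample, and then calling `evaluate` over a dataset (the snippet for that last step is garbled in the source). Below is a minimal, cleaned-up sketch of the dataset-level call, assuming an OpenAI API key is configured; it uses the built-in `Faithfulness` metric so it runs standalone, and the custom `TechnicalAccuracy` class from the excerpt could be appended to the metrics list once it is defined in scope.

```python
from langchain_openai import ChatOpenAI

from ragas import EvaluationDataset, SingleTurnSample, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness

# Wrap the evaluator LLM, as in the excerpt (requires an OpenAI API key).
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

# A tiny illustrative dataset of single-turn samples.
dataset = EvaluationDataset(
    samples=[
        SingleTurnSample(
            user_input="Write a function to calculate the factorial of a number in Python.",
            retrieved_contexts=[
                "A factorial of a number n is the product of all positive integers less than or equal to n."
            ],
            response="def factorial(n):\n    return 1 if n <= 1 else n * factorial(n - 1)",
        )
    ]
)

# Evaluate the whole dataset; TechnicalAccuracy(llm=evaluator_llm) from the excerpt
# could be added to this metrics list.
results = evaluate(dataset, metrics=[Faithfulness(llm=evaluator_llm)], llm=evaluator_llm)
print(results)
```

Passing `llm=` to `evaluate` supplies a default evaluator LLM for any metric that was not given one explicitly.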
- "['<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true In our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline. Understanding the Evaluation Workflow A typical Ragas evaluation workflow consists of four key steps: Prepare your data: Collect queries, contexts, responses, and reference answers Select appropriate metrics: Choose metrics that align with what you want to evaluate Run the evaluation: Process your data through the selected metrics Analyze the results: Interpret scores and identify areas for improvement Let\\'s walk through each step with practical examples. Step 1: Setting Up Your Environment First, ensure you have Ragas installed: bash uv add ragas Next, import the necessary components: python import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity Step 2: Preparing Your Evaluation Data For a RAG system evaluation, you\\'ll need: Questions: User queries to your system Contexts: Documents or chunks retrieved by your system Responses: Answers generated by your system Ground truth (optional): Reference answers or documents for comparison Here\\'s how to organize this data: ```python Sample data data = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] } eval_data = pd.DataFrame(data) Convert to a format Ragas can use evaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset ``` Step 3: Selecting and Configuring Metrics Ragas offers various metrics to evaluate different aspects of your system: Core RAG Metrics: Faithfulness: Measures if the response is factually consistent with the provided context. Factual Correctness: Assesses if the response is accurate and free from factual errors. Response Relevancy: Evaluates if the response directly addresses the user query. Context Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth. Noise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context. LLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response. For metrics that require an LLM (like faithfulness), you need to configure the LLM provider: ```python Configure LLM for evaluation from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper Initialize the LLM, you are going to OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Define metrics to use metrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```', '<2-hop>\\n\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true As Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in. What is Ragas? Ragas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems. At its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations? Why Evaluate LLM Applications? LLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable. 
Evaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application Key Features of Ragas 🎯 Specialized Metrics Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications: Faithfulness: Measures if the response is factually consistent with the retrieved context Context Relevancy: Evaluates if the retrieved information is relevant to the query Answer Relevancy: Assesses if the response addresses the user\\'s question Topic Adherence: Gauges how well multi-turn conversations stay on topic 🧪 Test Data Generation Creating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage. 🔗 Seamless Integrations Ragas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI Observability platforms - Phoenix - LangSmith - Langfuse 📊 Comprehensive Analysis Beyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements. Getting Started with Ragas Installing Ragas is straightforward: bash uv init && uv add ragas Here\\'s a simple example of evaluating a response using Ragas: ```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI Initialize the LLM, you are going to new OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Your evaluation data test_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor Create metric faithfulness = Faithfulness(llm=evaluator_llm) Calculate the score result = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ``` 💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas']",
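The <1-hop> workflow above prepares evaluation data as a pandas DataFrame, converts it with `EvaluationDataset.from_pandas`, and then configures LLM-backed metrics. A condensed sketch of those steps, assuming an OpenAI API key is available (the single-row dataset and the metric subset are illustrative):

```python
import pandas as pd
from langchain_openai import ChatOpenAI

from ragas import EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness, Faithfulness, LLMContextRecall, ResponseRelevancy

# Steps 1-2: evaluation data with queries, retrieved contexts, responses, and references.
eval_data = pd.DataFrame(
    {
        "user_input": ["What is the capital of France?"],
        "retrieved_contexts": [["Paris is the capital and most populous city of France."]],
        "response": ["The capital of France is Paris."],
        "reference": ["Paris is the capital of France."],
    }
)

# Convert to a format Ragas can use.
evaluation_dataset = EvaluationDataset.from_pandas(eval_data)

# Configure the evaluator LLM and the metrics that rely on it.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
metrics = [Faithfulness(), FactualCorrectness(), ResponseRelevancy(), LLMContextRecall()]
```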
- "Ragas provides several specialized evaluation metrics for LLMs, including:\n\n1. **Faithfulness**: Measures if the response is factually consistent with the provided context.\n2. **Factual Correctness**: Assesses if the response is accurate and free from factual errors.\n3. **Response Relevancy**: Evaluates if the response directly addresses the user query.\n4. **Context Entity Recall**: Measures how well the retrieved context captures relevant entities from the ground truth.\n5. **Noise Sensitivity**: Assesses the robustness of the response to irrelevant or noisy context.\n6. **LLM Context Recall**: Evaluates how effectively the LLM utilizes the provided context to generate the response.\n\n### Selecting and Configuring Metrics in a Basic Evaluation Workflow\n\n1. **Prepare Your Data**: Collect user queries, contexts, responses, and reference answers.\n2. **Select Appropriate Metrics**: Choose metrics that align with your evaluation goals.\n3. **Run the Evaluation**: Process your data through the selected metrics.\n4. **Analyze the Results**: Interpret scores and identify areas for improvement.\n\nTo configure the metrics, you can initialize them in your code as follows:\n\n```python\nfrom ragas.metrics import Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity, LLMContextRecall\n\nmetrics = [\n Faithfulness(),\n FactualCorrectness(),\n ResponseRelevancy(),\n ContextEntityRecall(),\n NoiseSensitivity(),\n LLMContextRecall()\n]\n```\n\nFor more detailed guidance, you can refer to the following resources:\n\n- [Basic Evaluation Workflow With Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Advanced Metrics And Customization With Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
- "Ragas provieds speshulized evalushun metrix for LLMs, such as Faithfulness, Factual Correctness, Response Relevancy, Context Entity Recall, Noise Sensitivity, and LLM Context Recall. These metrix are taylored to address the unique challeenges of LLM-powred systems, like ensuring responses are factually consistant with the context and relevunt to the user query. In a basic evalushun workflow, you selekt metrix that align with your evalushun goals, then configure them—sum metrix, like Faithfulness, require setting up an LLM provider (for example, using LangchainLLMWrapper with a model like gpt-4o). You then run your evalushun by processing your data through the selekted metrix to analyze results and identify improvemint areas.",
+ "How does Ragas support the evaluation of both LLM applications and AI agents, and what specialized metrics are introduced for evaluating AI agents?",
+ "[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**Part 6: Evaluating AI Agents — _You are here_** \\n*Next up in the series:* \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '---\\n*This is the eighth part of a series on Ragas, a research-driven evaluation framework for LLM and RAG systems. 
If you missed the previous parts, check them out below:*\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Metrics and Customization](/blog/advanced-metrics-and-customization-with-ragas/)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**Part 8: Building Feedback Loops — _You are here_**']",
+ "['<1-hop>\\n\\n# Create a sample\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\n# Create metric\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n# Calculate the score\\nresult = await faithfulness.single_turn_ascore(sample)\\nprint(f\"Faithfulness score: {result}\")\\n```\\n\\n> 💡 **Try it yourself:** \\n> Explore the hands-on notebook for this workflow: \\n> [01_Introduction_to_Ragas](https://github.com/mafzaal/intro-to-ragas/blob/master/01_Introduction_to_Ragas.ipynb)\\n\\n## What\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\nWe\\'ll explore each metric in detail, explaining when and how to use them effectively.', \"<2-hop>\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas)** \\n**Part 5: Advanced Evaluation Techniques — _You are here_** \\n*Next up in the series:* \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", '<3-hop>\\n\\n---\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\"\\ndate: 2025-04-28T06:00:00-06:00\\nlayout: blog\\ndescription: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\"\\ncategories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"]\\ncoverImage: \"/images/ai_agent_evaluation.png\" \\nreadingTime: 8\\npublished: true\\n---\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\n## The Challenge of Evaluating AI Agents']",
+ "Ragas is an evaluation framework designed to support both LLM (Large Language Model) applications and AI agents by providing a structured approach to assess their performance. It introduces specialized metrics tailored for evaluating AI agents, focusing on aspects beyond simple answer accuracy, such as contextual understanding and interaction quality.\n\nFor more detailed insights, you can explore the following parts of the Ragas series:\n\n- **[Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)**\n- **[Part 2: Basic Evaluation Workflow](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)**\n- **[Part 3: Evaluating RAG Systems with Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)**\n- **[Part 5: Advanced Metrics and Customization](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)**\n- **[Part 6: Evaluating AI Agents](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)**\n\nThese resources will provide a comprehensive understanding of how Ragas enhances evaluation processes for both LLMs and AI agents.",
+ "Ragas supports the evaluation of LLM applications by providing metrics such as faithfulness, which can be applied to single-turn samples as shown in the introductory workflow. For AI agents, Ragas introduces specialized metrics designed to assess more complex behaviors, including goal accuracy, tool call accuracy, and topic adherence. These metrics enable the evaluation of AI agents that perform multi-turn interactions, utilize tools, and work toward specific goals, thus addressing the advanced requirements of evaluating agent-based applications.",
"multi_hop_abstract_query_synthesizer"
],
[
"7",
- "Which specialized metrics does Ragas provide for evaluating Retrieval-Augmented Generation (RAG) systems, and how do these metrics address the unique evaluation challenges posed by the multi-component nature of RAG systems?",
- "['title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. 
Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. 
Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. 
If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 7: Integrations and Observability with Ragas\" date: 2025-04-30T07:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/integrations-and-observability.png\" readingTime: 12 published: true\\n\\nPart 6: Integrations and Observability with Ragas\\n\\nIn our previous post, we explored how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications. Now, let\\'s discuss how to integrate Ragas into your broader LLM development ecosystem and establish observability practices that transform evaluation from a one-time exercise into a continuous improvement cycle.\\n\\nWhy Integrations and Observability Matter\\n\\nEvaluation is most powerful when it\\'s:\\n\\nIntegrated into your existing workflow and tools\\n\\nAutomated to run consistently with minimal friction\\n\\nObservable so insights are easily accessible and actionable\\n\\nContinuous rather than a one-time or sporadic effort\\n\\nLet\\'s explore how Ragas helps you achieve these goals through its extensive integration capabilities.\\n\\nFramework Integrations\\n\\nRagas seamlessly connects with popular LLM application frameworks, allowing you to evaluate systems built with your preferred tools.\\n\\nLangChain Integration\\n\\nFor LangChain-based applications, Ragas provides dedicated integration support. Here’s how you can integrate Ragas step by step:\\n\\nPrepare your documents: Load your source documents and split them into manageable chunks for retrieval.\\n\\nSet up vector storage: Embed the document chunks and store them in a vector database to enable efficient retrieval.\\n\\nConfigure the retriever and QA chain: Use LangChain components to create a retriever and a question-answering (QA) chain powered by your chosen language model.\\n\\nGenerate a test set: Use Ragas to automatically generate a set of test questions and answers from your documents, or supply your own.\\n\\nEvaluate retrieval and QA performance: Apply Ragas metrics to assess both the retriever and the full QA chain, measuring aspects like context relevancy, faithfulness, and answer quality.\\n\\nReview results: Analyze the evaluation outputs to identify strengths and areas for improvement in your RAG pipeline.\\n\\nThis integration allows you to continuously measure and improve the effectiveness of your retrieval and generation components within the LangChain framework.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nRagas supports integration with a variety of popular LLM and RAG frameworks beyond LangChain, including LlamaIndex and Haystack. These integrations enable seamless evaluation of retrieval and generation components within your preferred stack. 
If you need guidance or code examples for integrating Ragas with platforms such as LlamaIndex, Haystack, or others, support and tailored examples can be provided on demand to fit your specific workflow and requirements.\\n\\nObservability Platform Integrations\\n\\nBeyond framework integrations, Ragas connects with leading observability platforms to help you monitor, track, and analyze evaluation results over time.\\n\\nLangSmith Integration\\n\\nFor LangChain users, LangSmith provides comprehensive tracing and evaluation. To integrate Ragas evaluation with LangSmith, follow these steps:\\n\\nSet up your environment\\n\\nUpload dataset to LangSmith\\n\\nDefine your LLM or chain\\n\\nSelect Ragas metrics\\n\\nRun evaluation with LangSmith\\n\\nYou can now view detailed experiment results in your LangSmith project dashboard. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within LangSmith, leveraging Ragas metrics for deeper insights.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nOther Platform Integrations\\n\\nRagas can be integrated with a range of observability and monitoring platforms beyond LangSmith, such as Langfuse and others. If you need help connecting Ragas to platforms like Langfuse or have specific requirements for your observability stack, tailored support and examples are available to fit your workflow.\\n\\nBuilding Automated Evaluation Pipelines\\n\\nTo ensure evaluation is a continuous part of your development process, set up automated pipelines that run evaluations regularly and automatically.\\n\\nCI/CD Integration\\n\\nYou can incorporate Ragas into your CI/CD pipeline so that every code change is automatically evaluated. This helps catch regressions early and ensures your RAG system maintains high performance before merging new changes.\\n\\nScheduled Evaluations\\n\\nRegularly scheduled evaluations allow you to monitor your system’s performance over time. By running evaluations at set intervals, you can track trends, spot regressions, and ensure your system continues to meet quality standards.\\n\\nMonitoring Evaluation Metrics Over Time\\n\\nTracking evaluation metrics over time helps you identify performance trends and quickly detect any drops in quality. By visualizing these metrics, you can better understand how changes to your system impact its effectiveness.\\n\\nCreating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. 
Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\nBest Practices for Observability\\n\\nDefine clear thresholds: Establish performance baselines and alert thresholds for each metric\\n\\nSegment evaluations: Break down results by query type, data source, or other relevant factors\\n\\nHistorical tracking: Maintain historical evaluation data to identify trends and regressions\\n\\nCorrelation analysis: Link evaluation metrics to user feedback and business outcomes\\n\\nRegular benchmarking: Periodically evaluate against fixed test sets to ensure consistency\\n\\nAlert on regressions: Implement automated alerts when metrics drop below thresholds\\n\\nContextualize metrics: Include example failures alongside aggregate metrics for better understanding\\n\\nBuilding a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:\\n\\nCapture evaluation results with Ragas\\n\\nIdentify patterns in failures and underperforming areas\\n\\nPrioritize improvements based on impact and effort\\n\\nImplement changes to your RAG components\\n\\nValidate improvements with focused re-evaluation\\n\\nMonitor continuously to catch regressions\\n\\nConclusion: From Evaluation to Action\\n\\nIntegrating Ragas with your frameworks and observability tools transforms evaluation from a point-in-time activity to a continuous improvement cycle. By making evaluation metrics visible, actionable, and integrated into your workflows, you create a foundation for systematic improvement of your LLM applications.\\n\\nThe most successful teams don\\'t just evaluate occasionally — they build evaluation into their development culture, making data-driven decisions based on objective metrics rather than subjective impressions.\\n\\nIn our final post, we\\'ll explore how to build effective feedback loops that translate evaluation insights into concrete improvements for your LLM applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability with Ragas — You are here Next up in the series: Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "['<1-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true In our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature. Understanding RAG Systems: More Than the Sum of Their Parts RAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information This dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content. The RAG Evaluation Triad Effective RAG evaluation requires examining three key dimensions: Retrieval Quality: How well does the system find relevant information? Generation Quality: How well does the system produce responses from retrieved information? End-to-End Performance: How well does the complete system satisfy user needs? Let\\'s explore how Ragas helps evaluate each dimension of RAG systems.', \"<2-hop>\\n\\nCore RAG Metrics in Ragas Ragas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance. Retrieval Quality Metrics 1. Context Relevancy Measures how relevant the retrieved documents are to the user's question. How it works: Takes the user's question (user_input) and the retrieved documents (retrieved_contexts). Uses an LLM to score relevance with two different prompts, averaging the results for robustness. Scores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant). Why it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step. 2. Context Precision Assesses how much of the retrieved context is actually useful for generating the answer. How it works: For each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response. Calculates Average Precision, rewarding systems that rank useful chunks higher. Variants: ContextUtilization: Uses the generated response instead of ground truth. Non-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity. Why it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included. 3. Context Recall Evaluates whether all necessary information from the ground truth answer is present in the retrieved context. How it works: Breaks down the reference answer into sentences. For each sentence, an LLM checks if it can be supported by the retrieved context. The score is the proportion of reference sentences attributed to the retrieved context. 
Variants: Non-LLM version: Compares reference and retrieved contexts using similarity and thresholds. Why it matters: High recall means your retriever finds all needed information; low recall means critical information is missing. Summary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information. Generation Quality Metrics 1. Faithfulness Checks if the generated answer is factually consistent with the retrieved context, addressing hallucination. How it works: Breaks the answer into simple statements. For each, an LLM checks if it can be inferred from the retrieved context. The score is the proportion of faithful statements. Alternative: FaithfulnesswithHHEM: Uses a specialized NLI model for verification. Why it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination. 2. Answer Relevancy Measures if the generated answer directly addresses the user's question. How it works: Asks an LLM to generate possible questions for the answer. Compares these to the original question using embedding similarity. Penalizes noncommittal answers. Why it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete. Summary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn't focus on the specific question. End-to-End Metrics 1. Correctness Assesses factual alignment between the generated answer and a ground truth reference. How it works: Breaks both the answer and reference into claims. Uses NLI to verify claims in both directions. Calculates precision, recall, or F1-score. Why it matters: High correctness means answers match the ground truth; low correctness signals factual errors. Key distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\", '<3-hop>\\n\\ntitle: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true As Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in. What is Ragas? Ragas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems. At its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? 
- How well does my application handle multi-turn conversations? Why Evaluate LLM Applications? LLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable. Evaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application Key Features of Ragas 🎯 Specialized Metrics Ragas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications: Faithfulness: Measures if the response is factually consistent with the retrieved context Context Relevancy: Evaluates if the retrieved information is relevant to the query Answer Relevancy: Assesses if the response addresses the user\\'s question Topic Adherence: Gauges how well multi-turn conversations stay on topic 🧪 Test Data Generation Creating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage. 🔗 Seamless Integrations Ragas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI Observability platforms - Phoenix - LangSmith - Langfuse 📊 Comprehensive Analysis Beyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements. Getting Started with Ragas Installing Ragas is straightforward: bash uv init && uv add ragas Here\\'s a simple example of evaluating a response using Ragas: ```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI Initialize the LLM, you are going to new OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Your evaluation data test_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor Create metric faithfulness = Faithfulness(llm=evaluator_llm) Calculate the score result = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ``` 💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas', \"<4-hop>\\n\\nWhat's Coming in This Blog Series This introduction is just the beginning. In the upcoming posts, we'll dive deeper into all aspects of evaluating LLM applications with Ragas: Part 2: Basic Evaluation Workflow We'll explore each metric in detail, explaining when and how to use them effectively. Part 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance. Part 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application's capabilities. 
Part 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments. Part 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. Part 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows. Part 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications. Conclusion In a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications. Ready to Elevate Your LLM Applications? Start exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you're facing specific evaluation hurdles, don't hesitate to reach out—we'd love to help!\"]",
- "Ragas provides several specialized metrics for evaluating Retrieval-Augmented Generation (RAG) systems, addressing the unique challenges posed by their multi-component nature. These metrics focus on three key dimensions: retrieval quality, generation quality, and end-to-end performance.\n\n### Specialized Metrics in Ragas\n\n1. **Retrieval Quality Metrics**:\n - **Context Relevancy**: Measures how relevant the retrieved documents are to the user's question.\n - **Context Precision**: Assesses how much of the retrieved context is useful for generating the answer.\n - **Context Recall**: Evaluates whether all necessary information from the ground truth answer is present in the retrieved context.\n\n2. **Generation Quality Metrics**:\n - **Faithfulness**: Checks if the generated answer is factually consistent with the retrieved context.\n - **Answer Relevancy**: Measures if the generated answer directly addresses the user's question.\n\n3. **End-to-End Performance Metrics**:\n - **Correctness**: Assesses factual alignment between the generated answer and a ground truth reference.\n\n### Addressing Unique Evaluation Challenges\nThe multi-component nature of RAG systems means that:\n- A system might retrieve relevant information but generate poor responses, or vice versa. Ragas metrics help identify weaknesses in either the retrieval or generation components.\n- The interaction between retrieval and generation must be evaluated, ensuring that the system not only retrieves relevant content but also generates coherent and accurate responses based on that content.\n\nBy focusing on these specialized metrics, Ragas enables a comprehensive evaluation of RAG systems, ensuring that both retrieval and generation components are effectively assessed.\n\nFor more detailed information, you can explore the following links:\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)",
- "Ragas provides specialized metrics for evaluating Retrieval-Augmented Generation (RAG) systems that address the unique challenges arising from their multi-component structure, which includes both retrieval and generation. The core metrics are divided into three key dimensions: Retrieval Quality, Generation Quality, and End-to-End Performance. For Retrieval Quality, Ragas offers Context Relevancy (measuring how relevant retrieved documents are to the user's question), Context Precision (assessing how much of the retrieved context is actually useful for generating the answer), and Context Recall (evaluating whether all necessary information from the ground truth answer is present in the retrieved context). For Generation Quality, Ragas includes Faithfulness (checking if the generated answer is factually consistent with the retrieved context) and Answer Relevancy (measuring if the generated answer directly addresses the user's question). For End-to-End Performance, the Correctness metric assesses factual alignment between the generated answer and a ground truth reference. These metrics collectively ensure that both the retrieval and generation components are evaluated individually and in combination, addressing the unique evaluation challenges of RAG systems.",
+ "How does Metric-Driven Development (MDD) utilize combined performance metrics to guide project outcomes, and what are some examples of such metrics in practice?",
+ "[\"Reward functions embody the core MDD idea: set a clear, measurable goal, and let it guide actions towards success.\\n\\n## Metric-Driven Development Across Industries: Real-World Examples\\n\\nMDD isn't just for software. Here's how different fields use it:\", '* **Team A** is stuck in debates. Should they focus on improving video load speed or making the recommendation engine more accurate? One engineer insists, \"Faster videos keep users from leaving!\" Another counters, \"But better recommendations are what make them subscribe!\" They argue based on gut feelings.\\n* **Team B** operates differently. They have a clear, agreed-upon goal: ***Improve the average \"Watch Time per User\" metric, while ensuring video buffering times stay below 2 seconds.*** They rapidly test ideas, measuring the impact of each change against this specific target.\\n\\nWhich team do you think will make faster, smarter progress?\\n\\n\\nTeam B has the edge because they\\'re using **Metric-Driven Development (MDD)**. This is a powerful strategy where teams unite around measurable goals to eliminate guesswork and make real strides. Let\\'s break down how it works, what makes a metric truly useful, and see how industries from healthcare to e-commerce use it to succeed.', '| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', 'MDD reminds us to track both:\\n* **Leading indicators** (like training loss, code coverage) to monitor progress during development.\\n* **Lagging indicators** (like user engagement, revenue, customer support tickets) to measure the actual impact.\\n\\n## The Takeaway: Use Metrics as Your Compass\\nMetric-Driven Development isn\\'t a complex theory reserved for tech giants. It\\'s a fundamental mindset applicable everywhere:\\n\\n* A local bakery might track *\"Daily Units Sold per Pastry Type\"* to optimize baking schedules.\\n* A city planner could use *\"Average Commute Time Reduction\"* to evaluate the success of new traffic light patterns.\\n* A project manager might measure progress through *\"Sprint Velocity\"* or *\"Percentage of On-Time Task Completions\"* rather than subjective assessments of how \"busy\" the team appears.\\n\\n\\nBy choosing metrics that are **measurable, actionable, aligned, simple, and robust**, you transform ambiguity into clarity and opinion into evidence.']",
+ "['<1-hop>\\n\\n## What Exactly is Metric-Driven Development?\\n\\nMetric-Driven Development (MDD) is a simple but effective framework where teams:\\n\\n1. **Define Clear, Measurable Goals:** Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\").\\n2. **Base Decisions on Data:** Rely on evidence and measurements, not just opinions or assumptions.\\n3. **Iterate and Learn Quickly:** Continuously measure the impact of changes to see what works and what doesn\\'t.\\n\\nThink of MDD as a **GPS for your project**. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently.\\n\\n## Why Teams Struggle Without Clear Metrics\\n\\nWithout a metric-driven approach, teams often fall into common traps:', '<2-hop>\\n\\n* **Metric:** A combined score, e.g., `Points Scored - (Time Taken * Penalty Factor)`.\\n * **Impact:** Trains AI opponents that are challenging but fair, balancing speed and skill.\\n* **Autonomous Vehicles: Safety & Comfort Score**\\n * **Metric:** Combination of factors like smooth acceleration/braking, lane adherence, and deductions for interventions or near-misses.\\n * **Impact:** Guides development towards vehicles that are not only safe but also provide a comfortable ride.']",
+ "Metric-Driven Development (MDD) utilizes combined performance metrics to provide clear, measurable goals that guide project outcomes. By focusing on specific metrics, teams can eliminate guesswork and make informed decisions based on data rather than opinions. \n\n### Key Aspects of MDD:\n- **Leading Indicators**: Metrics like training loss and code coverage help monitor progress during development.\n- **Lagging Indicators**: Metrics such as user engagement, revenue, and customer support tickets measure the actual impact of changes made.\n\n### Examples of Metrics in Practice:\n- A bakery might track **\"Daily Units Sold per Pastry Type\"** to optimize baking schedules.\n- A city planner could use **\"Average Commute Time Reduction\"** to evaluate the success of new traffic light patterns.\n- A project manager might measure progress through **\"Sprint Velocity\"** or **\"Percentage of On-Time Task Completions\"**.\n\nThese metrics transform ambiguity into clarity, allowing teams to focus on actionable outcomes.\n\nFor more information, you can explore the following links:\n- [Metric-Driven Development: Make Smarter Decisions, Faster](https://thedataguy.pro/blog/metric-driven-development/)",
+ "Metric-Driven Development (MDD) utilizes combined performance metrics by defining clear, measurable goals and basing decisions on data rather than assumptions. This approach ensures that teams receive real-time feedback and can iterate quickly to improve outcomes. Examples of combined performance metrics in practice include a score such as 'Points Scored - (Time Taken * Penalty Factor)' to train AI opponents that balance speed and skill, and a 'Safety & Comfort Score' for autonomous vehicles, which combines factors like smooth acceleration, lane adherence, and deductions for interventions or near-misses. These combined metrics help guide development towards achieving specific, balanced objectives.",
"multi_hop_abstract_query_synthesizer"
],
[
"8",
- "How does RAGAS facilitate metric-driven development in RAG system evaluation, and what specific metrics does it introduce to improve the assessment process?",
- "['title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. 
Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. 
Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? 
- How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. 
In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true\\n\\nIn our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. 
Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline.\\n\\nUnderstanding the Evaluation Workflow\\n\\nA typical Ragas evaluation workflow consists of four key steps:\\n\\nPrepare your data: Collect queries, contexts, responses, and reference answers\\n\\nSelect appropriate metrics: Choose metrics that align with what you want to evaluate\\n\\nRun the evaluation: Process your data through the selected metrics\\n\\nAnalyze the results: Interpret scores and identify areas for improvement\\n\\nLet\\'s walk through each step with practical examples.\\n\\nStep 1: Setting Up Your Environment\\n\\nFirst, ensure you have Ragas installed:\\n\\nbash uv add ragas\\n\\nNext, import the necessary components:\\n\\npython import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\\n\\nStep 2: Preparing Your Evaluation Data\\n\\nFor a RAG system evaluation, you\\'ll need:\\n\\nQuestions: User queries to your system\\n\\nContexts: Documents or chunks retrieved by your system\\n\\nResponses: Answers generated by your system\\n\\nGround truth (optional): Reference answers or documents for comparison\\n\\nHere\\'s how to organize this data:\\n\\n```python\\n\\nSample data\\n\\ndata = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] }\\n\\neval_data = pd.DataFrame(data)\\n\\nConvert to a format Ragas can use\\n\\nevaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset\\n\\n```\\n\\nStep 3: Selecting and Configuring Metrics\\n\\nRagas offers various metrics to evaluate different aspects of your system:\\n\\nCore RAG Metrics:\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nFactual Correctness: Assesses if the response is accurate and free from factual errors.\\n\\nResponse Relevancy: Evaluates if the response directly addresses the user query.\\n\\nContext Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth.\\n\\nNoise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context.\\n\\nLLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response.\\n\\nFor metrics that require an LLM (like faithfulness), you need to configure the LLM provider:\\n\\n```python\\n\\nConfigure LLM for evaluation\\n\\nfrom langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nDefine metrics to use\\n\\nmetrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```\\n\\nStep 4: Running the Evaluation\\n\\nNow, run the evaluation with your selected metrics:\\n\\n```python\\n\\nRun evaluation\\n\\nresults = evaluate( evaluation_dataset, metrics=metrics, llm=evaluator_llm # Required for LLM-based metrics )\\n\\nView results\\n\\nprint(results) ```\\n\\nOutput:\\n\\nValues will vary based on your data and LLM performance.\\n\\npython { \"faithfulness\": 1.0000, \"factual_correctness\": 0.6750, \"answer_relevancy\": 0.9897, \"context_entity_recall\": 0.8889, \"noise_sensitivity_relevant\": 0.1667, \"context_recall\": 0.5000 }\\n\\nStep 5: Interpreting Results\\n\\nRagas metrics typically return scores between 0 and 1, where higher is better:\\n\\nUnderstanding Score Ranges:\\n\\n0.8-1.0: Excellent performance\\n\\n0.6-0.8: Good performance\\n\\n0.4-0.6: Moderate performance, needs improvement\\n\\n0.4 or lower: Poor performance, requires significant attention\\n\\nAdvanced Use: Custom Evaluation for Specific Examples\\n\\nFor more detailed analysis of specific examples:\\n\\n```python from ragas import SingleTurnSample from ragas.metrics import AspectCritic\\n\\nDefine a specific test case\\n\\ntest_data = { \"user_input\": \"What are quantum computers?\", \"response\": \"Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.\", \"retrieved_contexts\": [\"Quantum computing is a type of computation that harnesses quantum mechanical phenomena.\"] }\\n\\nCreate a custom evaluation metric\\n\\ncustom_metric = AspectCritic( name=\"quantum_accuracy\", llm=llm, definition=\"Verify if the explanation of quantum computing is accurate and complete.\" )\\n\\nScore the sample\\n\\nsample = SingleTurnSample(**test_data) score = await custom_metric.single_turn_ascore(sample) print(f\"Quantum accuracy score: {score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 
02_Basic_Evaluation_Workflow_with_Ragas\\n\\nCommon Evaluation Patterns and Metrics\\n\\nBelow is a matrix mapping evaluation patterns to the metrics used, along with definitions for each metric:\\n\\nMetric Comprehensive RAG Evaluation Content Quality Evaluation Retrieval Quality Evaluation Faithfulness ✓ ✓ Answer Relevancy ✓ ✓ Context Recall ✓ ✓ Context Precision ✓ ✓ Harmfulness ✓ Coherence ✓ Context Relevancy ✓\\n\\nMetric Definitions\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nAnswer Relevancy: Assesses if the response addresses the question.\\n\\nContext Recall: Measures how well the retrieved context covers the information in the ground truth.\\n\\nContext Precision: Evaluates the proportion of relevant information in the retrieved context.\\n\\nHarmfulness: Evaluates if the response contains harmful or inappropriate content.\\n\\nCoherence: Measures the logical flow and clarity of the response.\\n\\nContext Relevancy: Evaluates if the retrieved context is relevant to the question.\\n\\nThis matrix provides a clear overview of which metrics to use for specific evaluation patterns and their respective definitions.\\n\\nBest Practices for Ragas Evaluation\\n\\nStart simple: Begin with core metrics before adding more specialized ones\\n\\nUse diverse test cases: Include a variety of questions, from simple to complex\\n\\nConsider edge cases: Test with queries that might challenge your system\\n\\nCompare versions: Track metrics across different versions of your application\\n\\nCombine with human evaluation: Use Ragas alongside human feedback for a comprehensive assessment\\n\\nConclusion\\n\\nSetting up a basic evaluation workflow with Ragas is straightforward yet powerful. By systematically evaluating your LLM applications, you gain objective insights into their performance and clear directions for improvement.\\n\\nIn our next post, we\\'ll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow — You are here Next up in the series: Part 3: Evaluating RAG Systems Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHave you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!']",
- "['<1-hop>\\n\\nHow to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. 
Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation', '<2-hop>\\n\\ntitle: \"Metric-Driven Development: Make Smarter Decisions, Faster\" date: 2025-05-05T00:00:00-06:00 layout: blog description: \"Your Team\\'s Secret Weapon for Cutting Through Noise and Driving Real Progress. Learn how to use clear metrics to eliminate guesswork and make faster, smarter progress in your projects.\" categories: [\"Development\", \"Productivity\", \"AI\", \"Management\"] coverImage: \"/images/metric-driven-development.png\" readingTime: 9 published: true In today\\'s data-driven world, success depends increasingly on our ability to measure the right things at the right time. Whether you\\'re developing AI systems, building web applications, or managing projects, having clear metrics guides your team toward meaningful progress while eliminating subjective debates. The Power of Metrics in AI Evaluation Recent advances in generative AI and large language models (LLMs) highlight the critical importance of proper evaluation frameworks. Projects like RAGAS (Retrieval Augmented Generation Assessment System) demonstrate how specialized metrics can transform vague goals into actionable insights. For example, when evaluating retrieval-augmented generation systems, generic metrics like BLEU or ROUGE scores often fail to capture what truly matters - the accuracy, relevance, and contextual understanding of the generated responses. RAGAS instead introduces metrics specifically designed for RAG systems: Faithfulness: Measures how well the generated answer aligns with the retrieved context Answer Relevancy: Evaluates whether the response correctly addresses the user\\'s query Context Relevancy: Assesses if the system retrieves information that\\'s actually needed Context Precision: Quantifies how efficiently the system uses retrieved information These targeted metrics provide clearer direction than general-purpose evaluations, allowing teams to make precise improvements where they matter most. Imagine two teams building a new feature for a streaming platform: Team A is stuck in debates. Should they focus on improving video load speed or making the recommendation engine more accurate? One engineer insists, \"Faster videos keep users from leaving!\" Another counters, \"But better recommendations are what make them subscribe!\" They argue based on gut feelings. Team B operates differently. They have a clear, agreed-upon goal: Improve the average \"Watch Time per User\" metric, while ensuring video buffering times stay below 2 seconds. They rapidly test ideas, measuring the impact of each change against this specific target. 
Which team do you think will make faster, smarter progress? Team B has the edge because they\\'re using Metric-Driven Development (MDD). This is a powerful strategy where teams unite around measurable goals to eliminate guesswork and make real strides. Let\\'s break down how it works, what makes a metric truly useful, and see how industries from healthcare to e-commerce use it to succeed. What Exactly is Metric-Driven Development? Metric-Driven Development (MDD) is a simple but effective framework where teams: Define Clear, Measurable Goals: Set specific numerical targets (e.g., \"Increase user sign-ups by 20% this quarter\"). Base Decisions on Data: Rely on evidence and measurements, not just opinions or assumptions. Iterate and Learn Quickly: Continuously measure the impact of changes to see what works and what doesn\\'t. Think of MDD as a GPS for your project. Without clear metrics, you\\'re driving in the fog, hoping you\\'re heading in the right direction. With MDD, you get real-time feedback, ensuring you\\'re moving towards your destination efficiently. Why Teams Struggle Without Clear Metrics Without a metric-driven approach, teams often fall into common traps: Chasing Too Many Goals: Trying to improve everything at once (\"We need higher accuracy and faster speed and lower costs!\") leads to scattered effort and slow progress. Endless Subjective Debates: Arguments arise that are hard to resolve with data (\"Is Model A\\'s slightly better performance worth the extra complexity?\"). Difficulty Measuring Progress: It\\'s hard to know if you\\'re actually improving (\"Are we doing better than last quarter? How can we be sure?\"). In machine learning (ML), this often happens when teams track various technical scores (like precision, recall, or F1 score – measures of model accuracy) without a single, unifying metric tied to the actual business outcome they want to achieve.']",
- "Ragas facilitates metric-driven development in RAG (Retrieval-Augmented Generation) system evaluation by providing a structured framework that includes specialized metrics tailored to assess the unique components of RAG systems. This allows developers to systematically evaluate and improve their applications based on quantifiable performance indicators.\n\n### Key Metrics Introduced by Ragas:\n1. **Retrieval Quality Metrics**:\n - **Context Relevancy**: Measures the relevance of retrieved documents to the user's question.\n - **Context Precision**: Assesses how much of the retrieved context is useful for generating the answer.\n - **Context Recall**: Evaluates whether all necessary information from the ground truth answer is present in the retrieved context.\n\n2. **Generation Quality Metrics**:\n - **Faithfulness**: Checks if the generated answer is factually consistent with the retrieved context.\n - **Answer Relevancy**: Measures if the generated answer directly addresses the user's question.\n\n3. **End-to-End Performance Metrics**:\n - **Correctness**: Assesses factual alignment between the generated answer and a ground truth reference.\n\nBy utilizing these metrics, Ragas enables developers to identify strengths and weaknesses in their systems, facilitating continuous improvement and alignment with user needs.\n\nFor more detailed information, you can explore the following links:\n- [Advanced Metrics And Customization With Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Basic Evaluation Workflow With Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)",
- "RAGAS facilitates metric-driven development in RAG system evaluation by providing tools to generate synthetic datasets and by introducing specialized metrics tailored for RAG systems. According to the context, RAGAS enables the creation of synthetic test sets that mimic real user queries and contexts, allowing teams to benchmark and compare system performance in a controlled and repeatable way. This supports rapid iteration and targeted improvements. In addition, RAGAS introduces specific metrics designed for RAG evaluation, such as Faithfulness (measuring alignment of generated answers with retrieved context), Answer Relevancy (evaluating if the response addresses the user’s query), Context Relevancy (assessing if the retrieved information is needed), and Context Precision (quantifying efficient use of retrieved information). These targeted metrics provide clearer direction than generic metrics, enabling teams to make precise, data-driven improvements and embodying the principles of metric-driven development.",
- "multi_hop_specific_query_synthesizer"
+ "How does Ragas provide specialized evaluation metrics for LLMs, and what steps are involved in creating a custom metric to assess technical accuracy in programming explanations?",
+ "[\"In our next post, we'll explore advanced metrics and customization techniques for specialized evaluation needs.\\n\\n---\\n\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**[Part 3: Evaluating RAG Systems with Ragas](/blog/evaluating-rag-systems-with-ragas/)** \\n**Part 4: Test Data Generation — _You are here_** \\n*Next up in the series:* \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"By extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we'll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\", \"In our next post, we'll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\n---\\n \\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**Part 2: Basic Evaluation Workflow — _You are here_** \\n*Next up in the series:* \\n**[Part 3: Evaluating RAG Systems](/blog/evaluating-rag-systems-with-ragas/)** \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\", \"In our next post, we'll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\n---\\n\\n**[Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications](/blog/introduction-to-ragas/)** \\n**[Part 2: Basic Evaluation Workflow](/blog/basic-evaluation-workflow-with-ragas/)** \\n**Part 3: Evaluating RAG Systems with Ragas — _You are here_** \\n*Next up in the series:* \\n**[Part 4: Test Data Generation](/blog/generating-test-data-with-ragas/)** \\n**[Part 5: Advanced Evaluation Techniques](/blog/advanced-metrics-and-customization-with-ragas)** \\n**[Part 6: Evaluating AI Agents](/blog/evaluating-ai-agents-with-ragas/)** \\n**[Part 7: Integrations and Observability](/blog/integrations-and-observability-with-ragas/)** \\n**[Part 8: Building Feedback Loops](/blog/building-feedback-loops-with-ragas/)**\"]",
+ "[\"<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https://docs.ragas.io/en/stable/) is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions:\\n- Is my application retrieving the right information?\\n- Are the responses factually accurate and consistent with the retrieved context?\\n- Does the system appropriately address the user's query?\\n- How well does my application handle multi-turn conversations?\\n\\n## Why Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\", '<2-hop>\\n\\n## Creating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python\\nfrom dataclasses import dataclass, field\\nfrom typing import Dict, Optional, Set\\nimport typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric\\nfrom ragas.prompt import PydanticPrompt\\nfrom ragas.metrics import MetricType, MetricOutputType\\nfrom pydantic import BaseModel\\n\\n# Define input/output models for the prompt\\nclass TechnicalAccuracyInput(BaseModel):\\n question: str\\n context: str\\n response: str\\n programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel):\\n score: float\\n feedback: str']",
+ "Ragas provides specialized evaluation metrics for LLMs by allowing users to extend its framework with custom metrics tailored to specific evaluation needs. This enables the measurement of aspects that are most relevant for particular applications, such as technical accuracy in programming explanations.\n\nTo create a custom metric for assessing technical accuracy, the following steps are typically involved:\n\n1. **Define the Evaluation Criteria**: Identify what constitutes technical accuracy in the context of programming explanations.\n2. **Develop the Metric**: Create a formula or algorithm that quantifies the defined criteria.\n3. **Integrate with Ragas**: Implement the custom metric within the Ragas framework to ensure it can be used in evaluations.\n4. **Test and Validate**: Run evaluations using the custom metric to ensure it accurately reflects the technical accuracy of programming explanations.\n\nFor more detailed information, you can explore the following links:\n\n- [Part 1: Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Part 2: Basic Evaluation Workflow](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)\n- [Part 3: Evaluating RAG Systems](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Part 5: Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Part 6: Evaluating AI Agents](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
+ "Ragas is an open-source evaluation framework specifically designed for LLM applications, offering specialized metrics that address challenges unique to LLM-powered systems, such as ensuring factual accuracy, consistency with retrieved context, and appropriate query handling. To create a custom metric for evaluating technical accuracy in programming explanations, Ragas allows developers to define input and output models (for example, using Pydantic BaseModel classes for technical accuracy input and output), and implement the metric logic using its extensible metric classes. This enables tailored evaluation beyond traditional NLP metrics, supporting the needs of high-stakes LLM applications.",
+ "multi_hop_abstract_query_synthesizer"
],
[
"9",
- "How does the use of Ragas facilitate the evaluation of Retrieval-Augmented Generation (RAG) systems by generating robust EvaluationDatasets, and what are some best practices for ensuring comprehensive evaluation of AI agents according to the provided context?",
- "['title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. 
Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. 
They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. 
In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. 
Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. \" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = 
LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions.\\n\\n```\\n\\nYou can also use the `evaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=evaluator_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. 
For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? 
If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 7: Integrations and Observability with Ragas\" date: 2025-04-30T07:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/integrations-and-observability.png\" readingTime: 12 published: true\\n\\nPart 6: Integrations and Observability with Ragas\\n\\nIn our previous post, we explored how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications. Now, let\\'s discuss how to integrate Ragas into your broader LLM development ecosystem and establish observability practices that transform evaluation from a one-time exercise into a continuous improvement cycle.\\n\\nWhy Integrations and Observability Matter\\n\\nEvaluation is most powerful when it\\'s:\\n\\nIntegrated into your existing workflow and tools\\n\\nAutomated to run consistently with minimal friction\\n\\nObservable so insights are easily accessible and actionable\\n\\nContinuous rather than a one-time or sporadic effort\\n\\nLet\\'s explore how Ragas helps you achieve these goals through its extensive integration capabilities.\\n\\nFramework Integrations\\n\\nRagas seamlessly connects with popular LLM application frameworks, allowing you to evaluate systems built with your preferred tools.\\n\\nLangChain Integration\\n\\nFor LangChain-based applications, Ragas provides dedicated integration support. Here’s how you can integrate Ragas step by step:\\n\\nPrepare your documents: Load your source documents and split them into manageable chunks for retrieval.\\n\\nSet up vector storage: Embed the document chunks and store them in a vector database to enable efficient retrieval.\\n\\nConfigure the retriever and QA chain: Use LangChain components to create a retriever and a question-answering (QA) chain powered by your chosen language model.\\n\\nGenerate a test set: Use Ragas to automatically generate a set of test questions and answers from your documents, or supply your own.\\n\\nEvaluate retrieval and QA performance: Apply Ragas metrics to assess both the retriever and the full QA chain, measuring aspects like context relevancy, faithfulness, and answer quality.\\n\\nReview results: Analyze the evaluation outputs to identify strengths and areas for improvement in your RAG pipeline.\\n\\nThis integration allows you to continuously measure and improve the effectiveness of your retrieval and generation components within the LangChain framework.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nRagas supports integration with a variety of popular LLM and RAG frameworks beyond LangChain, including LlamaIndex and Haystack. These integrations enable seamless evaluation of retrieval and generation components within your preferred stack. 
If you need guidance or code examples for integrating Ragas with platforms such as LlamaIndex, Haystack, or others, support and tailored examples can be provided on demand to fit your specific workflow and requirements.\\n\\nObservability Platform Integrations\\n\\nBeyond framework integrations, Ragas connects with leading observability platforms to help you monitor, track, and analyze evaluation results over time.\\n\\nLangSmith Integration\\n\\nFor LangChain users, LangSmith provides comprehensive tracing and evaluation. To integrate Ragas evaluation with LangSmith, follow these steps:\\n\\nSet up your environment\\n\\nUpload dataset to LangSmith\\n\\nDefine your LLM or chain\\n\\nSelect Ragas metrics\\n\\nRun evaluation with LangSmith\\n\\nYou can now view detailed experiment results in your LangSmith project dashboard. This integration enables you to trace, evaluate, and monitor your RAG pipeline performance directly within LangSmith, leveraging Ragas metrics for deeper insights.\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 07_Integrations_and_Observability\\n\\nOther Platform Integrations\\n\\nRagas can be integrated with a range of observability and monitoring platforms beyond LangSmith, such as Langfuse and others. If you need help connecting Ragas to platforms like Langfuse or have specific requirements for your observability stack, tailored support and examples are available to fit your workflow.\\n\\nBuilding Automated Evaluation Pipelines\\n\\nTo ensure evaluation is a continuous part of your development process, set up automated pipelines that run evaluations regularly and automatically.\\n\\nCI/CD Integration\\n\\nYou can incorporate Ragas into your CI/CD pipeline so that every code change is automatically evaluated. This helps catch regressions early and ensures your RAG system maintains high performance before merging new changes.\\n\\nScheduled Evaluations\\n\\nRegularly scheduled evaluations allow you to monitor your system’s performance over time. By running evaluations at set intervals, you can track trends, spot regressions, and ensure your system continues to meet quality standards.\\n\\nMonitoring Evaluation Metrics Over Time\\n\\nTracking evaluation metrics over time helps you identify performance trends and quickly detect any drops in quality. By visualizing these metrics, you can better understand how changes to your system impact its effectiveness.\\n\\nCreating Custom Dashboards\\n\\nBuilding custom dashboards gives you a comprehensive view of your evaluation results. 
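To make the LangSmith steps above more tangible, here is a hedged sketch that uploads a tiny dataset, wraps a Ragas metric as a custom LangSmith evaluator, and runs an experiment. The dataset name, target function, and metric choice are illustrative, and the LangSmith SDK surface (for example the `evaluate` helper and evaluator signature) may differ slightly between versions.

```python
from langchain_openai import ChatOpenAI
from langsmith import Client
from langsmith.evaluation import evaluate
from ragas import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness

client = Client()
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
faithfulness = Faithfulness(llm=evaluator_llm)

# 1. Upload a small dataset (questions plus reference answers) to LangSmith.
dataset = client.create_dataset(dataset_name="ragas-demo-eval")
client.create_examples(
    inputs=[{"question": "What is the capital of France?"}],
    outputs=[{"answer": "Paris is the capital of France."}],
    dataset_id=dataset.id,
)

# 2. The target under evaluation: replace this stub with your real RAG chain.
def rag_target(inputs: dict) -> dict:
    contexts = ["Paris is the capital and most populous city of France."]  # retriever output
    return {"answer": "The capital of France is Paris.", "contexts": contexts}

# 3. Wrap a Ragas metric as a custom LangSmith evaluator.
def ragas_faithfulness(run, example) -> dict:
    sample = SingleTurnSample(
        user_input=example.inputs["question"],
        response=run.outputs["answer"],
        retrieved_contexts=run.outputs["contexts"],
    )
    return {"key": "faithfulness", "score": faithfulness.single_turn_score(sample)}

# 4. Run the experiment; results show up in the LangSmith project dashboard.
evaluate(
    rag_target,
    data="ragas-demo-eval",
    evaluators=[ragas_faithfulness],
    experiment_prefix="ragas-faithfulness",
)
```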
Dashboards can display current performance, trends, and detailed breakdowns of recent evaluations, making it easier to monitor your system and identify areas for improvement.\\n\\nWith these practices, you can make evaluation an ongoing, automated, and visible part of your development workflow, leading to more reliable and robust RAG systems.\\n\\nBest Practices for Observability\\n\\nDefine clear thresholds: Establish performance baselines and alert thresholds for each metric\\n\\nSegment evaluations: Break down results by query type, data source, or other relevant factors\\n\\nHistorical tracking: Maintain historical evaluation data to identify trends and regressions\\n\\nCorrelation analysis: Link evaluation metrics to user feedback and business outcomes\\n\\nRegular benchmarking: Periodically evaluate against fixed test sets to ensure consistency\\n\\nAlert on regressions: Implement automated alerts when metrics drop below thresholds\\n\\nContextualize metrics: Include example failures alongside aggregate metrics for better understanding\\n\\nBuilding a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:\\n\\nCapture evaluation results with Ragas\\n\\nIdentify patterns in failures and underperforming areas\\n\\nPrioritize improvements based on impact and effort\\n\\nImplement changes to your RAG components\\n\\nValidate improvements with focused re-evaluation\\n\\nMonitor continuously to catch regressions\\n\\nConclusion: From Evaluation to Action\\n\\nIntegrating Ragas with your frameworks and observability tools transforms evaluation from a point-in-time activity to a continuous improvement cycle. By making evaluation metrics visible, actionable, and integrated into your workflows, you create a foundation for systematic improvement of your LLM applications.\\n\\nThe most successful teams don\\'t just evaluate occasionally — they build evaluation into their development culture, making data-driven decisions based on objective metrics rather than subjective impressions.\\n\\nIn our final post, we\\'ll explore how to build effective feedback loops that translate evaluation insights into concrete improvements for your LLM applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents Part 7: Integrations and Observability with Ragas — You are here Next up in the series: Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
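One simple way to operationalize the threshold and alerting practices above is a quality gate that runs in CI or on a schedule. The sketch below uses hypothetical thresholds and a hard-coded `scores` dictionary standing in for the output of a real `evaluate()` run; a non-zero exit code is enough for most CI systems to fail the build or trigger an alert.

```python
# A minimal CI/CD quality gate: compare each metric from a fixed Ragas test set
# against a baseline threshold and fail the job on regressions. Thresholds and
# the scores dictionary are placeholders for your own pipeline.
import sys

THRESHOLDS = {
    "faithfulness": 0.85,
    "answer_relevancy": 0.80,
    "context_recall": 0.70,
}

def check_regressions(scores: dict, thresholds: dict) -> list[str]:
    """Return human-readable failures for metrics below their threshold."""
    return [
        f"{name}: {scores.get(name, 0.0):.3f} < required {minimum:.2f}"
        for name, minimum in thresholds.items()
        if scores.get(name, 0.0) < minimum
    ]

if __name__ == "__main__":
    scores = {"faithfulness": 0.91, "answer_relevancy": 0.78, "context_recall": 0.74}  # from evaluate()
    failures = check_regressions(scores, THRESHOLDS)
    if failures:
        print("Evaluation regression detected:\n  " + "\n  ".join(failures))
        sys.exit(1)  # non-zero exit fails the CI job or fires an alert
    print("All evaluation metrics are above their thresholds.")
```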
- "['<1-hop>\\n\\ntitle: \"Part 4: Generating Test Data with Ragas\" date: 2025-04-27T16:00:00-06:00 layout: blog description: \"Discover how to generate robust test datasets for evaluating Retrieval-Augmented Generation systems using Ragas, including document-based, domain-specific, and adversarial test generation techniques.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"/images/generating_test_data.png\" readingTime: 14 published: true In our previous post, we explored how to comprehensively evaluate RAG systems using specialized metrics. However, even the best evaluation framework requires high-quality test data to yield meaningful insights. In this post, we\\'ll dive into how Ragas helps you generate robust test datasets for evaluating your LLM applications. Why and', '<2-hop>\\n\\nImplementing Agent Evaluation in Practice Let\\'s look at a practical example of evaluating an AI agent using these metrics: ```python from ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import MultiTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper Initialize the LLM evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) Example conversation with a travel booking agent test_data = { \"user_input\": [ {\"role\": \"user\", \"content\": \"I need to book a flight from New York to London next Friday\"}, {\"role\": \"assistant\", \"content\": \"I\\'d be happy to help you book a flight. Let me search for options...\", \"tool_calls\": [{\"name\": \"search_flights\", \"arguments\": {\"origin\": \"NYC\", \"destination\": \"LON\", \"date\": \"next Friday\"}}]}, {\"role\": \"tool\", \"name\": \"search_flights\", \"content\": \"Found 5 flights: Flight 1 (Delta, $750), Flight 2 (British Airways, $820)...\"}, {\"role\": \"assistant\", \"content\": \"I found several flights from New York to London next Friday. The cheapest option is Delta for $750. Would you like to book this one?\"}, {\"role\": \"user\", \"content\": \"Yes, please book the Delta flight\"}, {\"role\": \"assistant\", \"content\": \"I\\'ll book that for you now.\", \"tool_calls\": [{\"name\": \"book_flight\", \"arguments\": {\"flight_id\": \"delta_123\", \"price\": \"$750\"}}]}, {\"role\": \"tool\", \"name\": \"book_flight\", \"content\": \"Booking confirmed. Confirmation #: ABC123\"}, {\"role\": \"assistant\", \"content\": \"Great news! Your flight is confirmed. Your confirmation number is ABC123. The flight is scheduled for next Friday. 
Is there anything else you need help with?\"} ], \"reference_topics\": [\"travel\", \"flight booking\", \"schedules\", \"prices\"], \"reference_tool_calls\": [ {\"name\": \"search_flights\", \"args\": {\"origin\": \"NYC\", \"destination\": \"LON\", \"date\": \"next Friday\"}}, {\"name\": \"book_flight\", \"args\": {\"flight_id\": \"delta_123\", \"price\": \"$750\"}} ] } Create a sample sample = MultiTurnSample(**test_data) Initialize metrics goal_accuracy = AgentGoalAccuracyWithoutReference(llm=evaluator_llm) tool_accuracy = ToolCallAccuracy() topic_adherence = TopicAdherenceScore(llm=evaluator_llm) Calculate scores goal_score = await goal_accuracy.multi_turn_ascore(sample) tool_score = tool_accuracy.multi_turn_score(sample) topic_score = await topic_adherence.multi_turn_ascore(sample) print(f\"Goal Accuracy: {goal_score}\") print(f\"Tool Call Accuracy: {tool_score}\") print(f\"Topic Adherence: {topic_score}\") ``` 💡 Try it yourself: Explore the hands-on notebook for agent evaluation: 06_Evaluating_AI_Agents Advanced Agent Evaluation Techniques Combining Metrics for Comprehensive Evaluation For a complete assessment of agent capabilities, combine multiple metrics: ```python from ragas import evaluate results = evaluate( dataset, # Your dataset of agent conversations metrics=[ AgentGoalAccuracyWithoutReference(llm=evaluator_llm), ToolCallAccuracy(), TopicAdherence(llm=evaluator_llm) ] ) ``` Best Practices for Agent Evaluation Test scenario coverage: Include a diverse range of interaction scenarios Edge case handling: Test how agents handle unexpected inputs or failures Longitudinal evaluation: Track performance over time to identify regressions Human-in-the-loop validation: Periodically verify metric alignment with human judgments Continuous feedback loops: Use evaluation insights to guide agent improvements Conclusion Evaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas\\' agent_goal_accuracy, tool_call_accuracy, and topic_adherence provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries. By incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants. In our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows. Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents — You are here Next up in the series: Part 7: Integrations and Observability Part 8: Building Feedback Loops How are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "### Ragas and Evaluation of RAG Systems\n\nRagas facilitates the evaluation of Retrieval-Augmented Generation (RAG) systems by providing specialized metrics and tools to generate robust EvaluationDatasets. This includes assessing retrieval quality, generation quality, and end-to-end performance through metrics like context relevancy, context precision, and faithfulness. Ragas also automates the creation of comprehensive test datasets, ensuring thorough coverage of various evaluation scenarios.\n\n### Best Practices for Comprehensive Evaluation of AI Agents\n\n1. **Evaluate Components Independently**: Assess retrieval and generation separately to identify specific weaknesses.\n2. **Use Diverse Queries**: Include a variety of question types (factoid, explanatory, complex) to test the system's robustness.\n3. **Compare Against Baselines**: Test against simpler systems to establish performance benchmarks.\n4. **Perform Ablation Studies**: Experiment with different configurations (e.g., chunk sizes, retrieval models) to understand their impact.\n5. **Combine with Human Evaluation**: Use Ragas alongside human judgment for a more complete assessment.\n\nFor more detailed insights, you can explore the following resources:\n\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Advanced Metrics And Customization With Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)\n- [Integrations And Observability With Ragas](https://thedataguy.pro/blog/integrations-and-observability-with-ragas/)",
- "Ragas facilitates the evaluation of Retrieval-Augmented Generation (RAG) systems by enabling the generation of robust test datasets, which are essential for meaningful evaluation. According to the context, Ragas supports the creation of document-based, domain-specific, and adversarial test datasets, ensuring that LLM applications are tested under diverse and challenging scenarios. The EvaluationDataset class in Ragas allows for the structuring of multi-turn agent conversations, which can then be assessed using specialized metrics such as AgentGoalAccuracyWithoutReference, ToolCallAccuracy, and TopicAdherenceScore. These metrics provide insights into an agent's ability to complete tasks, use tools correctly, and adhere to designated topics. Best practices for comprehensive evaluation include covering a wide range of interaction scenarios, testing edge case handling, conducting longitudinal evaluations to track performance over time, incorporating human-in-the-loop validation to align metrics with human judgment, and establishing continuous feedback loops to guide agent improvements. By combining robust EvaluationDatasets with these best practices and metrics, Ragas enables a thorough and reliable evaluation process for AI agents in RAG systems.",
- "multi_hop_specific_query_synthesizer"
- ],
- [
- "10",
- "How can ChatOpenAI be integrated into a RAG evaluation pipeline for both synthetic data generation and advanced metric evaluation using Ragas, and what are the key steps involved in this process?",
- "['title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. 
\" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would need to handle exceptions,\\n\\n`` You can also use theevaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=myevaluator_llm_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics—such as adjusting thresholds or criteria—to better fit your application\\'s requirements. 
For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for 
LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. 
Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. 
Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 2: Basic Evaluation Workflow with Ragas\" date: 2025-04-26T19:00:00-06:00 layout: blog description: \"Learn how to set up a basic evaluation workflow for LLM applications using Ragas. This guide walks you through data preparation, metric selection, and result analysis.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1600132806370-bf17e65e942f?q=80&w=1988&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 8 published: true\\n\\nIn our previous post, we introduced Ragas as a powerful framework for evaluating LLM applications. 
Now, let\\'s dive into the practical aspects of setting up your first evaluation pipeline.\\n\\nUnderstanding the Evaluation Workflow\\n\\nA typical Ragas evaluation workflow consists of four key steps:\\n\\nPrepare your data: Collect queries, contexts, responses, and reference answers\\n\\nSelect appropriate metrics: Choose metrics that align with what you want to evaluate\\n\\nRun the evaluation: Process your data through the selected metrics\\n\\nAnalyze the results: Interpret scores and identify areas for improvement\\n\\nLet\\'s walk through each step with practical examples.\\n\\nStep 1: Setting Up Your Environment\\n\\nFirst, ensure you have Ragas installed:\\n\\nbash uv add ragas\\n\\nNext, import the necessary components:\\n\\npython import pandas as pd from ragas import EvaluationDataset from ragas import evaluate, RunConfig from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\\n\\nStep 2: Preparing Your Evaluation Data\\n\\nFor a RAG system evaluation, you\\'ll need:\\n\\nQuestions: User queries to your system\\n\\nContexts: Documents or chunks retrieved by your system\\n\\nResponses: Answers generated by your system\\n\\nGround truth (optional): Reference answers or documents for comparison\\n\\nHere\\'s how to organize this data:\\n\\n```python\\n\\nSample data\\n\\ndata = { \"user_input\": [ \"What are the main symptoms of COVID-19?\", \"How does machine learning differ from deep learning?\" ], \"retrieved_contexts\": [ [ \"Common symptoms of COVID-19 include fever, cough, and fatigue. Some patients also report loss of taste or smell, body aches, and difficulty breathing.\", \"COVID-19 is caused by the SARS-CoV-2 virus and spreads primarily through respiratory droplets.\" ], [ \"Machine learning is a subset of AI focused on algorithms that learn from data without being explicitly programmed.\", \"Deep learning is a specialized form of machine learning using neural networks with many layers (deep neural networks).\" ] ], \"response\": [ \"The main symptoms of COVID-19 include fever, cough, fatigue, and sometimes loss of taste or smell, body aches, and breathing difficulties.\", \"Machine learning is a subset of AI that focuses on algorithms learning from data, while deep learning is a specialized form of machine learning that uses deep neural networks with multiple layers.\" ], \"reference\": [ \"COVID-19 symptoms commonly include fever, dry cough, fatigue, loss of taste or smell, body aches, sore throat, and in severe cases, difficulty breathing.\", \"Machine learning is a branch of AI where systems learn from data, identify patterns, and make decisions with minimal human intervention. 
Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks) to analyze various factors of data.\" ] }\\n\\neval_data = pd.DataFrame(data)\\n\\nConvert to a format Ragas can use\\n\\nevaluation_dataset = EvaluationDataset.from_pandas(eval_data) evaluation_dataset\\n\\n```\\n\\nStep 3: Selecting and Configuring Metrics\\n\\nRagas offers various metrics to evaluate different aspects of your system:\\n\\nCore RAG Metrics:\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nFactual Correctness: Assesses if the response is accurate and free from factual errors.\\n\\nResponse Relevancy: Evaluates if the response directly addresses the user query.\\n\\nContext Entity Recall: Measures how well the retrieved context captures relevant entities from the ground truth.\\n\\nNoise Sensitivity: Assesses the robustness of the response to irrelevant or noisy context.\\n\\nLLM Context Recall: Evaluates how effectively the LLM utilizes the provided context to generate the response.\\n\\nFor metrics that require an LLM (like faithfulness), you need to configure the LLM provider:\\n\\n```python\\n\\nConfigure LLM for evaluation\\n\\nfrom langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nDefine metrics to use\\n\\nmetrics = [ Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity(), LLMContextRecall() ] ```\\n\\nStep 4: Running the Evaluation\\n\\nNow, run the evaluation with your selected metrics:\\n\\n```python\\n\\nRun evaluation\\n\\nresults = evaluate( evaluation_dataset, metrics=metrics, llm=evaluator_llm # Required for LLM-based metrics )\\n\\nView results\\n\\nprint(results) ```\\n\\nOutput:\\n\\nValues will vary based on your data and LLM performance.\\n\\npython { \"faithfulness\": 1.0000, \"factual_correctness\": 0.6750, \"answer_relevancy\": 0.9897, \"context_entity_recall\": 0.8889, \"noise_sensitivity_relevant\": 0.1667, \"context_recall\": 0.5000 }\\n\\nStep 5: Interpreting Results\\n\\nRagas metrics typically return scores between 0 and 1, where higher is better:\\n\\nUnderstanding Score Ranges:\\n\\n0.8-1.0: Excellent performance\\n\\n0.6-0.8: Good performance\\n\\n0.4-0.6: Moderate performance, needs improvement\\n\\n0.4 or lower: Poor performance, requires significant attention\\n\\nAdvanced Use: Custom Evaluation for Specific Examples\\n\\nFor more detailed analysis of specific examples:\\n\\n```python from ragas import SingleTurnSample from ragas.metrics import AspectCritic\\n\\nDefine a specific test case\\n\\ntest_data = { \"user_input\": \"What are quantum computers?\", \"response\": \"Quantum computers use quantum bits or qubits that can exist in multiple states simultaneously, unlike classical bits that can only be 0 or 1.\", \"retrieved_contexts\": [\"Quantum computing is a type of computation that harnesses quantum mechanical phenomena.\"] }\\n\\nCreate a custom evaluation metric\\n\\ncustom_metric = AspectCritic( name=\"quantum_accuracy\", llm=llm, definition=\"Verify if the explanation of quantum computing is accurate and complete.\" )\\n\\nScore the sample\\n\\nsample = SingleTurnSample(**test_data) score = await custom_metric.single_turn_ascore(sample) print(f\"Quantum accuracy score: {score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 
02_Basic_Evaluation_Workflow_with_Ragas\\n\\nCommon Evaluation Patterns and Metrics\\n\\nBelow is a matrix mapping evaluation patterns to the metrics used, along with definitions for each metric:\\n\\nMetric Comprehensive RAG Evaluation Content Quality Evaluation Retrieval Quality Evaluation Faithfulness ✓ ✓ Answer Relevancy ✓ ✓ Context Recall ✓ ✓ Context Precision ✓ ✓ Harmfulness ✓ Coherence ✓ Context Relevancy ✓\\n\\nMetric Definitions\\n\\nFaithfulness: Measures if the response is factually consistent with the provided context.\\n\\nAnswer Relevancy: Assesses if the response addresses the question.\\n\\nContext Recall: Measures how well the retrieved context covers the information in the ground truth.\\n\\nContext Precision: Evaluates the proportion of relevant information in the retrieved context.\\n\\nHarmfulness: Evaluates if the response contains harmful or inappropriate content.\\n\\nCoherence: Measures the logical flow and clarity of the response.\\n\\nContext Relevancy: Evaluates if the retrieved context is relevant to the question.\\n\\nThis matrix provides a clear overview of which metrics to use for specific evaluation patterns and their respective definitions.\\n\\nBest Practices for Ragas Evaluation\\n\\nStart simple: Begin with core metrics before adding more specialized ones\\n\\nUse diverse test cases: Include a variety of questions, from simple to complex\\n\\nConsider edge cases: Test with queries that might challenge your system\\n\\nCompare versions: Track metrics across different versions of your application\\n\\nCombine with human evaluation: Use Ragas alongside human feedback for a comprehensive assessment\\n\\nConclusion\\n\\nSetting up a basic evaluation workflow with Ragas is straightforward yet powerful. By systematically evaluating your LLM applications, you gain objective insights into their performance and clear directions for improvement.\\n\\nIn our next post, we\\'ll delve deeper into specialized evaluation techniques for RAG systems, exploring advanced metrics and evaluation strategies for retrieval-augmented generation applications.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow — You are here Next up in the series: Part 3: Evaluating RAG Systems Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHave you set up your first Ragas evaluation? What aspects of your LLM application are you most interested in measuring? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. 
Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. 
Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "['<1-hop>\\n\\nHow to Generate Synthetic Data for RAG Evaluation In the world of Retrieval-Augmented Generation (RAG) and LLM-powered applications, synthetic data generation is a game-changer for rapid iteration and robust evaluation. This blog post explains why synthetic data is essential, and how you can generate it for your own RAG pipelines—using modern tools like RAGAS and LangSmith. Why Generate Synthetic Data? Early Signal, Fast Iteration Real-world data is often scarce or expensive to label. Synthetic data lets you quickly create test sets that mimic real user queries and contexts, so you can evaluate your system’s performance before deploying to production. Controlled Complexity You can design synthetic datasets to cover edge cases, multi-hop reasoning, or specific knowledge domains—ensuring your RAG system is robust, not just good at the “easy” cases. Benchmarking and Comparison Synthetic test sets provide a repeatable, comparable way to measure improvements as you tweak your pipeline (e.g., changing chunk size, embeddings, or prompts). How to Generate Synthetic Data 1. Prepare Your Source Data Start with a set of documents relevant to your domain. For example, you might download and load HTML blog posts into a document format using tools like LangChain’s DirectoryLoader. 2. Build a Knowledge Graph Use RAGAS to convert your documents into a knowledge graph. This graph captures entities, relationships, and summaries, forming the backbone for generating meaningful queries. RAGAS applies default transformations are dependent on the corpus length, here are some examples: Producing Summaries -> produces summaries of the documents Extracting Headlines -> finding the overall headline for the document Theme Extractor -> extracts broad themes about the documents It then uses cosine-similarity and heuristics between the embeddings of the above transformations to construct relationships between the nodes. This is a crucial step, as the quality of your knowledge graph directly impacts the relevance and accuracy of the generated queries. 3. Configure Query Synthesizers RAGAS provides several query synthesizers: - SingleHopSpecificQuerySynthesizer: Generates direct, fact-based questions. - MultiHopAbstractQuerySynthesizer: Creates broader, multi-step reasoning questions. - MultiHopSpecificQuerySynthesizer: Focuses on questions that require connecting specific entities across documents. By mixing these, you get a diverse and challenging test set. 4. Generate the Test Set With your knowledge graph and query synthesizers, use RAGAS’s TestsetGenerator to create a synthetic dataset. This dataset will include questions, reference answers, and supporting contexts. 5. Evaluate and Iterate Load your synthetic dataset into an evaluation platform like LangSmith. Run your RAG pipeline against the test set, and use automated evaluators (for accuracy, helpfulness, style, etc.) to identify strengths and weaknesses. Tweak your pipeline and re-evaluate to drive improvements. Minimal Example Here’s a high-level pseudocode outline (see the notebook for full details): ````python 1. Load documents from langchain_community.document_loaders import DirectoryLoader path = \"data/\" loader = DirectoryLoader(path, glob=\"*.md\") docs = loader.load() 2. 
Generate data from ragas.testset import TestsetGenerator from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from langchain_openai import ChatOpenAI from langchain_openai import OpenAIEmbeddings Initialize the generator with the LLM and embedding model generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4.1\")) generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings()) Create the test set generator generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings) dataset = generator.generate_with_langchain_docs(docs, testset_size=10) ```` dataset will now contain a set of questions, answers, and contexts that you can use to evaluate your RAG system. 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 💡 Try it yourself: Explore the hands-on notebook for synthetic data generation: 04_Synthetic_Data_Generation', '<2-hop>\\n\\ntitle: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true In our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs. Beyond the Basics: Why Advanced Metrics Matter While Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements: Domain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements Custom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches Specialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics Business-specific KPIs: Aligning evaluation with business objectives requires customized metrics Let\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs. Understanding Ragas\\' Metric Architecture Before creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture: 1. Understand the Metric Base Classes All metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these: SingleTurnMetric: For metrics that evaluate a single question/response pair. MultiTurnMetric: For metrics that evaluate multi-turn conversations. MetricWithLLM: For metrics that require an LLM for evaluation. MetricWithEmbeddings: For metrics that use embeddings. You can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric). 
Each metric implements specific scoring methods depending on its type: _single_turn_ascore: For single-turn metrics _multi_turn_ascore: For multi-turn metrics Creating Your First Custom Metric Let\\'s create a custom metric that evaluates technical accuracy in programming explanations: ```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t from ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel Define input/output models for the prompt class TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\" class TechnicalAccuracyOutput(BaseModel): score: float feedback: str Define the prompt class TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. \" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ] Create the metric @dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\", } } ) output_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS evaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt) async def _single_turn_ascore(self, sample, callbacks) -> float: assert self.llm is not None, \"LLM must be set\" question = sample.user_input response = sample.response # Extract programming language from question if possible programming_language = \"python\" # Default languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"] for lang in languages: if lang in question.lower(): programming_language = lang break # Get the context context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\" # Prepare input for prompt prompt_input = TechnicalAccuracyInput( question=question, context=context, response=response, programming_language=programming_language ) # Generate evaluation evaluation = await self.evaluation_prompt.generate( data=prompt_input, llm=self.llm, callbacks=callbacks ) return evaluation.score ``` Using the Custom Metric To use the custom metric, simply include it in your evaluation pipeline: ```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper Initialize the LLM, you are going to OPENAI API key evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")) test_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", } Create a sample sample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\") Note: The above code is a simplified example. 
In a real-world scenario, you would also need to handle exceptions. You can also use the `evaluate` function to evaluate a dataset: ```python from ragas import evaluate results = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=evaluator_llm ) ``` 💡 Try it yourself: Explore the hands-on notebook for advanced metrics and customization: 05_Advanced_Metrics_and_Customization']",
- "To integrate ChatOpenAI into a RAG evaluation pipeline using Ragas for both synthetic data generation and advanced metric evaluation, follow these key steps:\n\n1. **Install Ragas**: Ensure Ragas is installed in your environment.\n ```bash\n uv init && uv add ragas\n ```\n\n2. **Set Up the LLM**: Initialize the ChatOpenAI model with your API key.\n ```python\n from langchain_openai import ChatOpenAI\n from ragas.llms import LangchainLLMWrapper\n\n evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\n ```\n\n3. **Generate Synthetic Data**: Create a dataset for evaluation, including user inputs, retrieved contexts, and expected responses.\n ```python\n import pandas as pd\n from ragas import EvaluationDataset\n\n data = {\n \"user_input\": [\"What is the capital of France?\"],\n \"retrieved_contexts\": [[\"Paris is the capital of France.\"]],\n \"response\": [\"The capital of France is Paris.\"],\n \"reference\": [\"Paris is the capital of France.\"]\n }\n eval_data = pd.DataFrame(data)\n evaluation_dataset = EvaluationDataset.from_pandas(eval_data)\n ```\n\n4. **Select Metrics**: Choose appropriate metrics for evaluation, such as Faithfulness, Factual Correctness, and Response Relevancy.\n ```python\n from ragas.metrics import Faithfulness, FactualCorrectness, ResponseRelevancy\n\n metrics = [Faithfulness(llm=evaluator_llm), FactualCorrectness(), ResponseRelevancy()]\n ```\n\n5. **Run the Evaluation**: Process the evaluation dataset through the selected metrics.\n ```python\n from ragas import evaluate\n\n results = evaluate(evaluation_dataset, metrics=metrics, llm=evaluator_llm)\n print(results)\n ```\n\n6. **Analyze Results**: Interpret the scores returned by the metrics to identify areas for improvement.\n\nFor more detailed guidance, you can refer to the following resources:\n- [Introduction to Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Basic Evaluation Workflow with Ragas](https://thedataguy.pro/blog/basic-evaluation-workflow-with-ragas/)\n- [Advanced Metrics and Customization with Ragas](https://thedataguy.pro/blog/advanced-metrics-and-customization-with-ragas/)",
- "ChatOpenAI can be integrated into a RAG evaluation pipeline for both synthetic data generation and advanced metric evaluation using Ragas by following a series of steps outlined in the provided context. First, for synthetic data generation, you initialize the generator with the LLM and embedding model by wrapping ChatOpenAI (for example, with LangchainLLMWrapper using ChatOpenAI(model=\"gpt-4.1\")) and OpenAIEmbeddings. This setup is used with Ragas’s TestsetGenerator to create a synthetic dataset containing questions, answers, and contexts for RAG evaluation. Second, for advanced metric evaluation, ChatOpenAI can be used as the underlying LLM in custom metrics within Ragas. For instance, when creating a custom metric such as TechnicalAccuracy, you initialize the evaluator LLM with LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\")), and use it to score responses based on criteria like syntax correctness and algorithmic accuracy. The key steps are: (1) loading and preparing source documents, (2) initializing the LLM and embedding models with ChatOpenAI, (3) generating synthetic datasets with Ragas, and (4) evaluating the datasets using both built-in and custom metrics powered by ChatOpenAI as the LLM. This approach enables robust, repeatable evaluation and supports both standard and domain-specific assessment needs.",
- "multi_hop_specific_query_synthesizer"
- ],
- [
- "11",
- "How does Ragas facilitate the comprehensive evaluation of RAG systems by addressing both their retrieval and generation components, and how does this approach differ when evaluating more complex AI agents that use tools and pursue specific goals?",
- "['title: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true\\n\\nIn our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature.\\n\\nUnderstanding RAG Systems: More Than the Sum of Their Parts\\n\\nRAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information\\n\\nThis dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content.\\n\\nThe RAG Evaluation Triad\\n\\nEffective RAG evaluation requires examining three key dimensions:\\n\\nRetrieval Quality: How well does the system find relevant information?\\n\\nGeneration Quality: How well does the system produce responses from retrieved information?\\n\\nEnd-to-End Performance: How well does the complete system satisfy user needs?\\n\\nLet\\'s explore how Ragas helps evaluate each dimension of RAG systems.\\n\\nCore RAG Metrics in Ragas\\n\\nRagas provides specialized metrics to assess RAG systems across retrieval, generation, and end-to-end performance.\\n\\nRetrieval Quality Metrics\\n\\n1. Context Relevancy\\n\\nMeasures how relevant the retrieved documents are to the user\\'s question.\\n\\nHow it works:\\n\\nTakes the user\\'s question (user_input) and the retrieved documents (retrieved_contexts).\\n\\nUses an LLM to score relevance with two different prompts, averaging the results for robustness.\\n\\nScores are normalized between 0.0 (irrelevant) and 1.0 (fully relevant).\\n\\nWhy it matters: Low scores indicate your retriever is pulling in unrelated or noisy documents. Monitoring this helps you improve the retrieval step.\\n\\n2. Context Precision\\n\\nAssesses how much of the retrieved context is actually useful for generating the answer.\\n\\nHow it works:\\n\\nFor each retrieved chunk, an LLM judges if it was necessary for the answer, using the ground truth (reference) or the generated response.\\n\\nCalculates Average Precision, rewarding systems that rank useful chunks higher.\\n\\nVariants:\\n\\nContextUtilization: Uses the generated response instead of ground truth.\\n\\nNon-LLM version: Compares retrieved chunks to ideal reference contexts using string similarity.\\n\\nWhy it matters: High precision means your retriever is efficient; low precision means too much irrelevant information is included.\\n\\n3. 
Context Recall\\n\\nEvaluates whether all necessary information from the ground truth answer is present in the retrieved context.\\n\\nHow it works:\\n\\nBreaks down the reference answer into sentences.\\n\\nFor each sentence, an LLM checks if it can be supported by the retrieved context.\\n\\nThe score is the proportion of reference sentences attributed to the retrieved context.\\n\\nVariants:\\n\\nNon-LLM version: Compares reference and retrieved contexts using similarity and thresholds.\\n\\nWhy it matters: High recall means your retriever finds all needed information; low recall means critical information is missing.\\n\\nSummary: - Low context relevancy: Retriever needs better query understanding or semantic matching. - Low context precision: Retriever includes unnecessary information. - Low context recall: Retriever misses critical information.\\n\\nGeneration Quality Metrics\\n\\n1. Faithfulness\\n\\nChecks if the generated answer is factually consistent with the retrieved context, addressing hallucination.\\n\\nHow it works:\\n\\nBreaks the answer into simple statements.\\n\\nFor each, an LLM checks if it can be inferred from the retrieved context.\\n\\nThe score is the proportion of faithful statements.\\n\\nAlternative:\\n\\nFaithfulnesswithHHEM: Uses a specialized NLI model for verification.\\n\\nWhy it matters: High faithfulness means answers are grounded in context; low faithfulness signals hallucination.\\n\\n2. Answer Relevancy\\n\\nMeasures if the generated answer directly addresses the user\\'s question.\\n\\nHow it works:\\n\\nAsks an LLM to generate possible questions for the answer.\\n\\nCompares these to the original question using embedding similarity.\\n\\nPenalizes noncommittal answers.\\n\\nWhy it matters: High relevancy means answers are on-topic; low relevancy means answers are off-topic or incomplete.\\n\\nSummary: - Low faithfulness: Generator adds facts not supported by context. - Low answer relevancy: Generator doesn\\'t focus on the specific question.\\n\\nEnd-to-End Metrics\\n\\n1. Correctness\\n\\nAssesses factual alignment between the generated answer and a ground truth reference.\\n\\nHow it works:\\n\\nBreaks both the answer and reference into claims.\\n\\nUses NLI to verify claims in both directions.\\n\\nCalculates precision, recall, or F1-score.\\n\\nWhy it matters: High correctness means answers match the ground truth; low correctness signals factual errors.\\n\\nKey distinction: - Faithfulness: Compares answer to retrieved context. - FactualCorrectness: Compares answer to ground truth.\\n\\nCommon RAG Evaluation Patterns\\n\\n1. High Retrieval, Low Generation Scores\\n\\nDiagnosis: Good retrieval, poor use of information.\\n\\nFixes: Improve prompts, use better generation models, or verify responses post-generation.\\n\\n2. Low Retrieval, High Generation Scores\\n\\nDiagnosis: Good generation, inadequate information.\\n\\nFixes: Enhance indexing, retrieval algorithms, or expand the knowledge base.\\n\\n3. 
Low Context Precision, High Faithfulness\\n\\nDiagnosis: Retrieves too much, but generates reliably.\\n\\nFixes: Filter passages, optimize chunk size, or use re-ranking.\\n\\nBest Practices for RAG Evaluation\\n\\nEvaluate components independently: Assess retrieval and generation separately.\\n\\nUse diverse queries: Include factoid, explanatory, and complex questions.\\n\\nCompare against baselines: Test against simpler systems.\\n\\nPerform ablation studies: Try variations like different chunk sizes or retrieval models.\\n\\nCombine with human evaluation: Use Ragas with human judgment for a complete view.\\n\\nConclusion: The Iterative RAG Evaluation Cycle\\n\\nEffective RAG development is iterative:\\n\\nEvaluate: Measure performance.\\n\\nAnalyze: Identify weaknesses.\\n\\nImprove: Apply targeted enhancements.\\n\\nRe-evaluate: Measure the impact of changes.\\n\\nThe Iterative RAG Evaluation Cycle\\n\\nBy using Ragas to implement this cycle, you can systematically improve your RAG system\\'s performance across all dimensions.\\n\\nIn our next post, we\\'ll explore how to generate high-quality test datasets for comprehensive RAG evaluation, addressing the common challenge of limited test data.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas — You are here Next up in the series: Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications\" date: 2025-04-26T18:00:00-06:00 layout: blog description: \"Explore the essential evaluation framework for LLM applications with Ragas. Learn how to assess performance, ensure accuracy, and improve reliability in Retrieval-Augmented Generation systems.\" categories: [\"AI\", \"RAG\", \"Evaluation\",\"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1593642634367-d91a135587b5?q=80&w=1770&auto=format&fit=crop&ixlib=rb-4.0.3\" readingTime: 7 published: true\\n\\nAs Large Language Models (LLMs) become fundamental components of modern applications, effectively evaluating their performance becomes increasingly critical. Whether you\\'re building a question-answering system, a document retrieval tool, or a conversational agent, you need reliable metrics to assess how well your application performs. This is where Ragas steps in.\\n\\nWhat is Ragas?\\n\\nRagas is an open-source evaluation framework specifically designed for LLM applications, with particular strengths in Retrieval-Augmented Generation (RAG) systems. Unlike traditional NLP evaluation methods, Ragas provides specialized metrics that address the unique challenges of LLM-powered systems.\\n\\nAt its core, Ragas helps answer crucial questions: - Is my application retrieving the right information? - Are the responses factually accurate and consistent with the retrieved context? - Does the system appropriately address the user\\'s query? - How well does my application handle multi-turn conversations?\\n\\nWhy Evaluate LLM Applications?\\n\\nLLMs are powerful but imperfect. 
They can hallucinate facts, misinterpret queries, or generate convincing but incorrect responses. For applications where accuracy and reliability matter—like healthcare, finance, or education—proper evaluation is non-negotiable.\\n\\nEvaluation serves several key purposes: - Quality assurance: Identify and fix issues before they reach users - Performance tracking: Monitor how changes impact system performance - Benchmarking: Compare different approaches objectively - Continuous improvement: Build feedback loops to enhance your application\\n\\nKey Features of Ragas\\n\\n🎯 Specialized Metrics\\n\\nRagas offers both LLM-based and computational metrics tailored to evaluate different aspects of LLM applications:\\n\\nFaithfulness: Measures if the response is factually consistent with the retrieved context\\n\\nContext Relevancy: Evaluates if the retrieved information is relevant to the query\\n\\nAnswer Relevancy: Assesses if the response addresses the user\\'s question\\n\\nTopic Adherence: Gauges how well multi-turn conversations stay on topic\\n\\n🧪 Test Data Generation\\n\\nCreating high-quality test data is often a bottleneck in evaluation. Ragas helps you generate comprehensive test datasets automatically, saving time and ensuring thorough coverage.\\n\\n🔗 Seamless Integrations\\n\\nRagas works with popular LLM frameworks and tools: - LangChain - LlamaIndex - Haystack - OpenAI\\n\\nObservability platforms - Phoenix - LangSmith - Langfuse\\n\\n📊 Comprehensive Analysis\\n\\nBeyond simple scores, Ragas provides detailed insights into your application\\'s strengths and weaknesses, enabling targeted improvements.\\n\\nGetting Started with Ragas\\n\\nInstalling Ragas is straightforward:\\n\\nbash uv init && uv add ragas\\n\\nHere\\'s a simple example of evaluating a response using Ragas:\\n\\n```python from ragas.metrics import Faithfulness from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import SingleTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper from langchain_openai import ChatOpenAI\\n\\nInitialize the LLM, you are going to new OPENAI API key\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nYour evaluation data\\n\\ntest_data = { \"user_input\": \"What is the capital of France?\", \"retrieved_contexts\": [\"Paris is the capital and most populous city of France.\"], \"response\": \"The capital of France is Paris.\" }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor\\n\\nCreate metric\\n\\nfaithfulness = Faithfulness(llm=evaluator_llm)\\n\\nCalculate the score\\n\\nresult = await faithfulness.single_turn_ascore(sample) print(f\"Faithfulness score: {result}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for this workflow: 01_Introduction_to_Ragas\\n\\nWhat\\'s Coming in This Blog Series\\n\\nThis introduction is just the beginning. 
In the upcoming posts, we\\'ll dive deeper into all aspects of evaluating LLM applications with Ragas:\\n\\nPart 2: Basic Evaluation Workflow We\\'ll explore each metric in detail, explaining when and how to use them effectively.\\n\\nPart 3: Evaluating RAG Systems Learn specialized techniques for evaluating retrieval-augmented generation systems, including context precision, recall, and relevance.\\n\\nPart 4: Test Data Generation Discover how to create high-quality test datasets that thoroughly exercise your application\\'s capabilities.\\n\\nPart 5: Advanced Evaluation Techniques Go beyond basic metrics with custom evaluations, multi-aspect analysis, and domain-specific assessments.\\n\\nPart 6: Evaluating AI Agents Learn how to evaluate complex AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nPart 7: Integrations and Observability Connect Ragas with your existing tools and platforms for streamlined evaluation workflows.\\n\\nPart 8: Building Feedback Loops Learn how to implement feedback loops that drive continuous improvement in your LLM applications. Transform evaluation insights into concrete improvements for your LLM applications.\\n\\nConclusion\\n\\nIn a world increasingly powered by LLMs, robust evaluation is the difference between reliable applications and unpredictable ones. Ragas provides the tools you need to confidently assess and improve your LLM applications.\\n\\nReady to Elevate Your LLM Applications?\\n\\nStart exploring Ragas today by visiting the official documentation. Share your thoughts, challenges, or success stories. If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!', 'title: \"Part 5: Advanced Metrics and Customization with Ragas\" date: 2025-04-28T05:00:00-06:00 layout: blog description: \"Explore advanced metrics and customization techniques in Ragas for evaluating LLM applications, including creating custom metrics, domain-specific evaluation, composite scoring, and best practices for building a comprehensive evaluation ecosystem.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\",\"Data\"] coverImage: \"https://plus.unsplash.com/premium_photo-1661368994107-43200954c524?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 9 published: true\\n\\nIn our previous post, we explored how to generate comprehensive test datasets for evaluating LLM applications. Now, let\\'s dive into one of Ragas\\' most powerful capabilities: advanced metrics and custom evaluation approaches that address specialized evaluation needs.\\n\\nBeyond the Basics: Why Advanced Metrics Matter\\n\\nWhile Ragas\\' core metrics cover fundamental evaluation aspects, real-world applications often have unique requirements:\\n\\nDomain-specific quality criteria: Legal, medical, or financial applications have specialized accuracy requirements\\n\\nCustom interaction patterns: Applications with unique conversation flows need tailored evaluation approaches\\n\\nSpecialized capabilities: Features like reasoning, code generation, or structured output demand purpose-built metrics\\n\\nBusiness-specific KPIs: Aligning evaluation with business objectives requires customized metrics\\n\\nLet\\'s explore how to extend Ragas\\' capabilities to meet these specialized needs.\\n\\nUnderstanding Ragas\\' Metric Architecture\\n\\nBefore creating custom metrics, it\\'s helpful to understand Ragas\\' metric architecture:\\n\\n1. 
Understand the Metric Base Classes\\n\\nAll metrics in Ragas inherit from the abstract Metric class (see metrics/base.py). For most use cases, you’ll extend one of these:\\n\\nSingleTurnMetric: For metrics that evaluate a single question/response pair.\\n\\nMultiTurnMetric: For metrics that evaluate multi-turn conversations.\\n\\nMetricWithLLM: For metrics that require an LLM for evaluation.\\n\\nMetricWithEmbeddings: For metrics that use embeddings.\\n\\nYou can mix these as needed (e.g., MetricWithLLM, SingleTurnMetric).\\n\\nEach metric implements specific scoring methods depending on its type:\\n\\n_single_turn_ascore: For single-turn metrics\\n\\n_multi_turn_ascore: For multi-turn metrics\\n\\nCreating Your First Custom Metric\\n\\nLet\\'s create a custom metric that evaluates technical accuracy in programming explanations:\\n\\n```python from dataclasses import dataclass, field from typing import Dict, Optional, Set import typing as t\\n\\nfrom ragas.metrics.base import MetricWithLLM, SingleTurnMetric from ragas.prompt import PydanticPrompt from ragas.metrics import MetricType, MetricOutputType from pydantic import BaseModel\\n\\nDefine input/output models for the prompt\\n\\nclass TechnicalAccuracyInput(BaseModel): question: str context: str response: str programming_language: str = \"python\"\\n\\nclass TechnicalAccuracyOutput(BaseModel): score: float feedback: str\\n\\nDefine the prompt\\n\\nclass TechnicalAccuracyPrompt(PydanticPrompt[TechnicalAccuracyInput, TechnicalAccuracyOutput]): instruction: str = ( \"Evaluate the technical accuracy of the response to a programming question. \" \"Consider syntax correctness, algorithmic accuracy, and best practices.\" ) input_model = TechnicalAccuracyInput output_model = TechnicalAccuracyOutput examples = [ # Add examples here ]\\n\\nCreate the metric\\n\\n@dataclass class TechnicalAccuracy(MetricWithLLM, SingleTurnMetric): name: str = \"technical_accuracy\" _required_columns: Dict[MetricType, Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { \"user_input\", \"response\",\\n\\n }\\n }\\n)\\noutput_type: Optional[MetricOutputType] = MetricOutputType.CONTINUOUS\\nevaluation_prompt: PydanticPrompt = field(default_factory=TechnicalAccuracyPrompt)\\n\\nasync def _single_turn_ascore(self, sample, callbacks) -> float:\\n assert self.llm is not None, \"LLM must be set\"\\n\\n question = sample.user_input\\n response = sample.response\\n # Extract programming language from question if possible\\n programming_language = \"python\" # Default\\n languages = [\"python\", \"javascript\", \"java\", \"c++\", \"rust\", \"go\"]\\n for lang in languages:\\n if lang in question.lower():\\n programming_language = lang\\n break\\n\\n # Get the context\\n context = \"\\\\n\".join(sample.retrieved_contexts) if sample.retrieved_contexts else \"\"\\n\\n # Prepare input for prompt\\n prompt_input = TechnicalAccuracyInput(\\n question=question,\\n context=context,\\n response=response,\\n programming_language=programming_language\\n )\\n\\n # Generate evaluation\\n evaluation = await self.evaluation_prompt.generate(\\n data=prompt_input, llm=self.llm, callbacks=callbacks\\n )\\n\\n return evaluation.score\\n\\n```\\n\\nUsing the Custom Metric\\n\\nTo use the custom metric, simply include it in your evaluation pipeline:\\n\\n```python from langchain_openai import ChatOpenAI from ragas import SingleTurnSample from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM, you are going to OPENAI API key\\n\\nevaluator_llm = 
LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\ntest_data = { \"user_input\": \"Write a function to calculate the factorial of a number in Python.\", \"retrieved_contexts\": [\"Python is a programming language.\", \"A factorial of a number n is the product of all positive integers less than or equal to n.\"], \"response\": \"def factorial(n):\\\\n if n == 0:\\\\n return 1\\\\n else:\\\\n return n * factorial(n-1)\", }\\n\\nCreate a sample\\n\\nsample = SingleTurnSample(**test_data) # Unpack the dictionary into the constructor technical_accuracy = TechnicalAccuracy(llm=evaluator_llm) score = await technical_accuracy.single_turn_ascore(sample) print(f\"Technical Accuracy Score: {score}\")\\n\\nNote: The above code is a simplified example. In a real-world scenario, you would also need to handle exceptions.\\n\\nYou can also use the `evaluate` function to evaluate a dataset:\\n\\n```python from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of samples metrics=[TechnicalAccuracy(), ...], llm=evaluator_llm ) ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for advanced metrics and customization: 05_Advanced_Metrics_and_Customization\\n\\nCustomizing Metrics for Your Application\\n\\nYou can further refine your evaluation by customizing existing metrics, such as adjusting thresholds or criteria, to better fit your application\\'s requirements. For multi-turn conversations, you might configure metrics like topic adherence to emphasize specific aspects, such as precision or recall, based on your evaluation objectives.\\n\\nIn specialized domains like healthcare or legal, it\\'s crucial to design custom metrics that capture domain-specific accuracy and compliance needs. For complex applications, consider combining several metrics into composite scores to represent multiple quality dimensions.\\n\\nWhen assessing capabilities like code generation or structured outputs, develop metrics that evaluate execution correctness or schema compliance. 
For advanced scenarios, you can build metric pipelines that orchestrate several metrics and aggregate their results using strategies like weighted averages or minimum scores.\\n\\nBy thoughtfully customizing and combining metrics, you can achieve a comprehensive and meaningful evaluation framework tailored to your unique use case.\\n\\nBest Practices for Custom Metric Development\\n\\nSingle Responsibility: Each metric should evaluate one specific aspect\\n\\nClear Definition: Define precisely what your metric measures\\n\\nBounded Output: Scores should be normalized, typically in [0,1]\\n\\nReproducibility: Minimize randomness in evaluation\\n\\nDocumentation: Document criteria, prompt design, and interpretation guidelines\\n\\nTest with Examples: Verify metric behavior on clear-cut examples\\n\\nHuman Correlation: Validate that metrics correlate with human judgment\\n\\nStandardizing Custom Metrics\\n\\nTo ensure consistency across custom metrics, consider the following best practices:\\n\\nDefine a clear, human-readable description for each metric.\\n\\nProvide interpretation guidelines to help users understand score meanings.\\n\\nInclude metadata such as metric name, required columns, and output type.\\n\\nUse a standardized interface or base class for all custom metrics.\\n\\nImplementation Patterns for Advanced Metrics\\n\\nWhen developing advanced metrics like topic adherence:\\n\\nDesign multi-step evaluation workflows for complex tasks.\\n\\nUse specialized prompts for different sub-tasks within the metric.\\n\\nAllow configurable scoring modes (e.g., precision, recall, F1).\\n\\nSupport conversational context for multi-turn evaluations.\\n\\nDebugging Custom Metrics\\n\\nEffective debugging strategies include:\\n\\nImplementing a debug mode to capture prompt inputs, outputs, and intermediate results.\\n\\nLogging detailed evaluation steps for easier troubleshooting.\\n\\nReviewing final scores alongside intermediate calculations to identify issues.\\n\\nConclusion: Building an Evaluation Ecosystem\\n\\nCustom metrics allow you to build a comprehensive evaluation ecosystem tailored to your application\\'s specific needs:\\n\\nBaseline metrics: Start with Ragas\\' core metrics for fundamental quality aspects\\n\\nDomain adaptation: Add specialized metrics for your application domain\\n\\nFeature-specific metrics: Develop metrics for unique features of your system\\n\\nBusiness alignment: Create metrics that reflect specific business KPIs and requirements\\n\\nBy extending Ragas with custom metrics, you can create evaluation frameworks that precisely measure what matters most for your LLM applications, leading to more meaningful improvements and better user experiences.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Evaluation Techniques — You are here Next up in the series: Part 6: Evaluating AI Agents Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow have you implemented feedback loops in your LLM applications? What improvement strategies have been most effective for your use cases? 
If you’re facing specific evaluation hurdles, don’t hesitate to reach out—we’d love to help!', 'title: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\" date: 2025-04-28T06:00:00-06:00 layout: blog description: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\" categories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"] coverImage: \"/images/ai_agent_evaluation.png\" readingTime: 8 published: true\\n\\nIn our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals.\\n\\nThe Challenge of Evaluating AI Agents\\n\\nUnlike traditional RAG systems, AI agents present unique evaluation challenges:\\n\\nMulti-turn interactions: Agents maintain context across multiple exchanges\\n\\nTool usage: Agents call external tools and APIs to accomplish tasks\\n\\nGoal-oriented behavior: Success means achieving the user\\'s ultimate objective\\n\\nBoundaries and constraints: Agents must operate within defined topic boundaries\\n\\nStandard metrics like faithfulness or answer relevancy don\\'t fully capture these dimensions. Let\\'s explore three specialized metrics Ragas provides for agent evaluation.\\n\\nEvaluating AI Agents: Beyond Simple Answers with Ragas\\n\\n1. Goal Accuracy (agent_goal_accuracy)\\n\\nWhat it measures: Did the agent successfully achieve the user\\'s ultimate objective over the course of the interaction?\\n\\nHow it works: This metric analyzes the entire agent workflow (user inputs, AI responses, tool calls). * It uses an LLM (InferGoalOutcomePrompt) to identify the user_goal and the end_state (what actually happened). * It then compares the end_state to either: * A provided reference outcome (AgentGoalAccuracyWithReference). * The inferred user_goal (AgentGoalAccuracyWithoutReference). * An LLM (CompareOutcomePrompt) determines if the achieved outcome matches the desired one, resulting in a binary score (1 for success, 0 for failure).\\n\\nWhy it\\'s important: For task-oriented agents (like booking systems or assistants), success isn\\'t about individual responses but about completing the overall task correctly. This metric directly measures that end-to-end success.\\n\\n2. Tool Call Accuracy (tool_call_accuracy)\\n\\nWhat it measures: Did the agent use the correct tools, in the right order, and with the right arguments?\\n\\nHow it works: This metric compares the sequence and details of tool calls made by the agent against a reference_tool_calls list. * It checks if the sequence of tool names called by the agent aligns with the reference sequence (is_sequence_aligned). * For each matching tool call, it compares the arguments provided by the agent to the reference arguments, often using a sub-metric like ExactMatch (_get_arg_score). * The final score reflects both the sequence alignment and the argument correctness.\\n\\nWhy it\\'s important: Many agents rely on external tools (APIs, databases, etc.). Incorrect tool usage (wrong tool, bad parameters) leads to task failure. This metric pinpoints issues in the agent\\'s interaction with its tools.\\n\\n3. 
Topic Adherence (topic_adherence)\\n\\nWhat it measures: Did the agent stick to the allowed topics and appropriately handle requests about restricted topics?\\n\\nHow it works: This metric evaluates conversations against a list of reference_topics. * It extracts the topics discussed in the user\\'s input (TopicExtractionPrompt). * It checks if the agent refused to answer questions related to specific topics (TopicRefusedPrompt). * It classifies whether the discussed topics fall within the allowed reference_topics (TopicClassificationPrompt). * Based on these classifications and refusals, it calculates a score (Precision, Recall, or F1) indicating how well the agent adhered to the topic constraints.\\n\\nWhy it\\'s important: Ensures agents stay focused, avoid generating content on forbidden subjects (safety, policy), and handle out-of-scope requests gracefully.\\n\\nImplementing Agent Evaluation in Practice\\n\\nLet\\'s look at a practical example of evaluating an AI agent using these metrics:\\n\\n```python from ragas.metrics import AgentGoalAccuracyWithoutReference, ToolCallAccuracy, TopicAdherenceScore from ragas.evaluation import EvaluationDataset from ragas.dataset_schema import MultiTurnSample from langchain_openai import ChatOpenAI from ragas.llms import LangchainLLMWrapper\\n\\nInitialize the LLM\\n\\nevaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\\n\\nExample conversation with a travel booking agent\\n\\ntest_data = { \"user_input\": [ {\"role\": \"user\", \"content\": \"I need to book a flight from New York to London next Friday\"}, {\"role\": \"assistant\", \"content\": \"I\\'d be happy to help you book a flight. Let me search for options...\", \"tool_calls\": [{\"name\": \"search_flights\", \"arguments\": {\"origin\": \"NYC\", \"destination\": \"LON\", \"date\": \"next Friday\"}}]}, {\"role\": \"tool\", \"name\": \"search_flights\", \"content\": \"Found 5 flights: Flight 1 (Delta, $750), Flight 2 (British Airways, $820)...\"}, {\"role\": \"assistant\", \"content\": \"I found several flights from New York to London next Friday. The cheapest option is Delta for $750. Would you like to book this one?\"}, {\"role\": \"user\", \"content\": \"Yes, please book the Delta flight\"}, {\"role\": \"assistant\", \"content\": \"I\\'ll book that for you now.\", \"tool_calls\": [{\"name\": \"book_flight\", \"arguments\": {\"flight_id\": \"delta_123\", \"price\": \"$750\"}}]}, {\"role\": \"tool\", \"name\": \"book_flight\", \"content\": \"Booking confirmed. Confirmation #: ABC123\"}, {\"role\": \"assistant\", \"content\": \"Great news! Your flight is confirmed. Your confirmation number is ABC123. The flight is scheduled for next Friday. 
Is there anything else you need help with?\"} ], \"reference_topics\": [\"travel\", \"flight booking\", \"schedules\", \"prices\"], \"reference_tool_calls\": [ {\"name\": \"search_flights\", \"args\": {\"origin\": \"NYC\", \"destination\": \"LON\", \"date\": \"next Friday\"}}, {\"name\": \"book_flight\", \"args\": {\"flight_id\": \"delta_123\", \"price\": \"$750\"}} ] }\\n\\nCreate a sample\\n\\nsample = MultiTurnSample(**test_data)\\n\\nInitialize metrics\\n\\ngoal_accuracy = AgentGoalAccuracyWithoutReference(llm=evaluator_llm) tool_accuracy = ToolCallAccuracy() topic_adherence = TopicAdherenceScore(llm=evaluator_llm)\\n\\nCalculate scores\\n\\ngoal_score = await goal_accuracy.multi_turn_ascore(sample) tool_score = tool_accuracy.multi_turn_score(sample) topic_score = await topic_adherence.multi_turn_ascore(sample)\\n\\nprint(f\"Goal Accuracy: {goal_score}\") print(f\"Tool Call Accuracy: {tool_score}\") print(f\"Topic Adherence: {topic_score}\") ```\\n\\n💡 Try it yourself: Explore the hands-on notebook for agent evaluation: 06_Evaluating_AI_Agents\\n\\nAdvanced Agent Evaluation Techniques\\n\\nCombining Metrics for Comprehensive Evaluation\\n\\nFor a complete assessment of agent capabilities, combine multiple metrics:\\n\\n```python from ragas import evaluate\\n\\nresults = evaluate( dataset, # Your dataset of agent conversations metrics=[ AgentGoalAccuracyWithoutReference(llm=evaluator_llm), ToolCallAccuracy(), TopicAdherence(llm=evaluator_llm) ] ) ```\\n\\nBest Practices for Agent Evaluation\\n\\nTest scenario coverage: Include a diverse range of interaction scenarios\\n\\nEdge case handling: Test how agents handle unexpected inputs or failures\\n\\nLongitudinal evaluation: Track performance over time to identify regressions\\n\\nHuman-in-the-loop validation: Periodically verify metric alignment with human judgments\\n\\nContinuous feedback loops: Use evaluation insights to guide agent improvements\\n\\nConclusion\\n\\nEvaluating AI agents requires specialized metrics that go beyond traditional RAG evaluation. Ragas\\' agent_goal_accuracy, tool_call_accuracy, and topic_adherence provide crucial insights into whether an agent can successfully complete tasks, use tools correctly, and stay within designated boundaries.\\n\\nBy incorporating these metrics into your evaluation pipeline, you can build more reliable and effective AI agents that truly deliver on the promise of helpful, goal-oriented AI assistants.\\n\\nIn our next post, we\\'ll explore how to integrate Ragas with popular frameworks and observability tools for seamless evaluation workflows.\\n\\nPart 1: Introduction to Ragas: The Essential Evaluation Framework for LLM Applications Part 2: Basic Evaluation Workflow Part 3: Evaluating RAG Systems with Ragas Part 4: Test Data Generation Part 5: Advanced Metrics and Customization Part 6: Evaluating AI Agents — You are here Next up in the series: Part 7: Integrations and Observability Part 8: Building Feedback Loops\\n\\nHow are you evaluating your AI agents? What challenges have you encountered in measuring agent performance? If you\\'re facing specific evaluation hurdles, don\\'t hesitate to reach out—we\\'d love to help!']",
- "['<1-hop>\\n\\ntitle: \"Part 6: Evaluating AI Agents: Beyond Simple Answers with Ragas\" date: 2025-04-28T06:00:00-06:00 layout: blog description: \"Learn how to evaluate complex AI agents using Ragas\\' specialized metrics for goal accuracy, tool call accuracy, and topic adherence to build more reliable and effective agent-based applications.\" categories: [\"AI\", \"Agents\", \"Evaluation\", \"Ragas\", \"LLM\"] coverImage: \"/images/ai_agent_evaluation.png\" readingTime: 8 published: true In our previous posts, we\\'ve explored how Ragas evaluates RAG systems and enables custom metrics for specialized applications. As LLMs evolve beyond simple question-answering to become powerful AI agents, evaluation needs have grown more sophisticated too. In this post, we\\'ll explore Ragas\\' specialized metrics for evaluating AI agents that engage in multi-turn interactions, use tools, and work toward specific goals. ', '<2-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Systems with Ragas\" date: 2025-04-26T20:00:00-06:00 layout: blog description: \"Learn specialized techniques for comprehensive evaluation of Retrieval-Augmented Generation systems using Ragas, including metrics for retrieval quality, generation quality, and end-to-end performance.\" categories: [\"AI\", \"RAG\", \"Evaluation\", \"Ragas\"] coverImage: \"https://images.unsplash.com/photo-1743796055664-3473eedab36e?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D\" readingTime: 14 published: true In our previous post, we covered the fundamentals of setting up evaluation workflows with Ragas. Now, let\\'s focus specifically on evaluating Retrieval-Augmented Generation (RAG) systems, which present unique evaluation challenges due to their multi-component nature. Understanding RAG Systems: More Than the Sum of Their Parts RAG systems combine two critical capabilities: 1. Retrieval: Finding relevant information from a knowledge base 2. Generation: Creating coherent, accurate responses based on retrieved information This dual nature means evaluation must address both components while also assessing their interaction. A system might retrieve perfect information but generate poor responses, or generate excellent prose from irrelevant retrieved content. The RAG Evaluation Triad Effective RAG evaluation requires examining three key dimensions: Retrieval Quality: How well does the system find relevant information? Generation Quality: How well does the system produce responses from retrieved information? End-to-End Performance: How well does the complete system satisfy user needs? Let\\'s explore how Ragas helps evaluate each dimension of RAG systems.']",
- "Ragas facilitates the comprehensive evaluation of Retrieval-Augmented Generation (RAG) systems by focusing on three key dimensions: **Retrieval Quality**, **Generation Quality**, and **End-to-End Performance**. It employs specialized metrics to assess how well the system retrieves relevant information, generates coherent responses, and meets user needs. For instance, metrics like Context Relevancy, Context Precision, and Faithfulness help evaluate the effectiveness of both the retrieval and generation components.\n\nIn contrast, evaluating more complex AI agents involves additional considerations due to their multi-turn interactions, tool usage, and goal-oriented behavior. Ragas introduces specialized metrics such as **Goal Accuracy**, **Tool Call Accuracy**, and **Topic Adherence** to assess whether agents successfully achieve user objectives, use tools correctly, and adhere to topic constraints. This approach reflects the complexity of AI agents, which requires a more nuanced evaluation compared to traditional RAG systems.\n\nFor more detailed insights, you can explore the following resources:\n- [Evaluating Rag Systems With Ragas](https://thedataguy.pro/blog/evaluating-rag-systems-with-ragas/)\n- [Introduction To Ragas](https://thedataguy.pro/blog/introduction-to-ragas/)\n- [Evaluating AI Agents With Ragas](https://thedataguy.pro/blog/evaluating-ai-agents-with-ragas/)",
- "Ragas facilitates the comprehensive evaluation of RAG systems by providing specialized metrics that assess both the retrieval and generation components. For RAG systems, Ragas evaluates retrieval quality (how well relevant information is found), generation quality (how accurately and coherently responses are produced from retrieved information), and end-to-end performance (how well the system satisfies user needs). This approach ensures that both the individual components and their interaction are thoroughly assessed. When evaluating more complex AI agents, Ragas extends its evaluation with additional specialized metrics for goal accuracy, tool call accuracy, and topic adherence, reflecting the increased sophistication required for agents that engage in multi-turn interactions, use tools, and work toward specific goals.",
- "multi_hop_specific_query_synthesizer"
+ "How do observability best practices contribute to building production-ready AI systems?",
+ "[\"What excites me most is that my experience in enterprise software development gives me a unique perspective on AI implementation. I understand not just the algorithms and models, but also how to integrate them into robust, production-ready systems that deliver real value.\\n\\n## The Best of Both Worlds\\n\\nComing back to AI doesn't mean leaving behind everything I learned in web and enterprise development. Quite the opposite - I believe my background gives me a particular advantage in building AI systems that are:\", '| **Harmfulness** | | ✓ | |\\n| **Coherence** | | ✓ | |\\n| **Context Relevancy** | | | ✓ |', \"- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\", '| **Metric** | **Comprehensive RAG Evaluation** | **Content Quality Evaluation** | **Retrieval Quality Evaluation** |\\n|-----------------------------|----------------------------------|---------------------------------|-----------------------------------|\\n| **Faithfulness** | ✓ | ✓ | |\\n| **Answer Relevancy** | ✓ | ✓ | |\\n| **Context Recall** | ✓ | | ✓ |\\n| **Context Precision** | ✓ | | ✓ |\\n| **Harmfulness** | | ✓ | |']",
+ "['<1-hop>\\n\\n## Best Practices for Observability\\n\\n1. **Define clear thresholds**: Establish performance baselines and alert thresholds for each metric\\n2. **Segment evaluations**: Break down results by query type, data source, or other relevant factors\\n3. **Historical tracking**: Maintain historical evaluation data to identify trends and regressions\\n4. **Correlation analysis**: Link evaluation metrics to user feedback and business outcomes\\n5. **Regular benchmarking**: Periodically evaluate against fixed test sets to ensure consistency\\n6. **Alert on regressions**: Implement automated alerts when metrics drop below thresholds\\n7. **Contextualize metrics**: Include example failures alongside aggregate metrics for better understanding\\n\\n## Building a Feedback Loop\\n\\nThe ultimate goal of evaluation is to drive improvements. Establish a feedback loop:', \"<2-hop>\\n\\n- **Production-ready**: Understanding software engineering best practices helps create AI systems that can operate reliably at scale.\\n- **User-focused**: Experience with UX principles ensures AI solutions are designed with actual human users in mind.\\n- **Integrated**: Knowledge of enterprise systems makes it easier to connect AI capabilities with existing business processes.\\n- **Simplified**: My experience in streamlining complex business processes helps me identify where AI can have the greatest impact through intelligent automation.\\n- **Business-oriented**: I understand that AI isn't just about the technology—it's about solving real business problems and creating measurable value.\\n- **Practical**: I focus on practical applications that deliver immediate benefits rather than getting caught up in theoretical possibilities.\\n\\n## What's Next\\n\\nAs I return to my AI roots, I'm excited to share this journey with you through this blog. In the coming months, I plan to write about:\"]",
+ "I don't know.",
+ "Observability best practices, such as defining clear thresholds, segmenting evaluations, maintaining historical tracking, and alerting on regressions, ensure that AI systems are continuously monitored and improved. These practices are essential for creating production-ready AI systems that can operate reliably at scale, as they enable consistent performance evaluation and rapid response to issues.",
+ "multi_hop_abstract_query_synthesizer"
]
],
"shape": {
"columns": 6,
- "rows": 12
+ "rows": 10
}
},
"text/html": [
@@ -710,203 +804,173 @@
" \n",
" \n",
" 0 | \n",
- " Does Ragas support integration with Langfuse? | \n",
- " [title: \"Part 1: Introduction to Ragas: The Es... | \n",
- " [title: \"Part 1: Introduction to Ragas: The Es... | \n",
- " Yes, Ragas supports integration with Langfuse,... | \n",
- " Yes, Ragas works with observability platforms ... | \n",
+ " How are Large Language Models integrated into ... | \n",
+ " [| **Harmfulness** | ... | \n",
+ " [---\\ntitle: \"Part 1: Introduction to Ragas: T... | \n",
+ " I don't know. | \n",
+ " Large Language Models (LLMs) are becoming fund... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 1 | \n",
- " what Part 8: Building Feedback Loops do for LL... | \n",
- " [title: \"Part 5: Advanced Metrics and Customiz... | \n",
- " [What's Coming in This Blog Series This introd... | \n",
- " Part 8: Building Feedback Loops with Ragas pro... | \n",
- " Part 8: Building Feedback Loops show how to im... | \n",
+ " Howw does Ragas help evalute LLM applikations ... | \n",
+ " [In our next post, we'll explore advanced metr... | \n",
+ " [## What is Ragas?\\n\\n[Ragas](https://docs.rag... | \n",
+ " I don't know. | \n",
+ " Ragas is an open-source evaluation framework d... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 2 | \n",
- " How does Ragas assist with Evaluation of RAG s... | \n",
- " [title: \"Part 3: Evaluating RAG Systems with R... | \n",
- " [title: \"Part 4: Generating Test Data with Rag... | \n",
- " Ragas assists with the evaluation of Retrieval... | \n",
- " Ragas helps generate robust test datasets for ... | \n",
+ " What specialized metrics does Ragas provide fo... | \n",
+ " [In our next post, we'll explore advanced metr... | \n",
+ " [Evaluation serves several key purposes:\\n- **... | \n",
+ " I don't know. | \n",
+ " Ragas offers both LLM-based and computational ... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 3 | \n",
- " Wut is OpenAIEmbeddings used for? | \n",
- " [title: \"Part 5: Advanced Metrics and Customiz... | \n",
- " [How to Generate Synthetic Data for RAG Evalua... | \n",
+ " me wanna know how LangSmith work with Ragas, l... | \n",
+ " [In our next post, we'll explore how to integr... | \n",
+ " [### 🧪 Test Data Generation\\nCreating high-qua... | \n",
" I don't know. | \n",
- " OpenAIEmbeddings is used as an embedding model... | \n",
+ " LangSmith is listed as one of the observabilit... | \n",
" single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 4 | \n",
- " Wht are the key steps in the Ragas evalution w... | \n",
- " [title: \"Part 3: Evaluating RAG Systems with R... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... | \n",
- " ### Key Steps in the Ragas Evaluation Workflow... | \n",
- " The key steps in the Ragas evaluation workflow... | \n",
- " multi_hop_abstract_query_synthesizer | \n",
+ " How do I use the OPENAI API key when initializ... | \n",
+ " [In our next post, we'll explore how to integr... | \n",
+ " [## Getting Started with Ragas\\n\\nInstalling R... | \n",
+ " To use the OPENAI API key when initializing an... | \n",
+ " To use the OPENAI API key when initializing an... | \n",
+ " single_hop_specifc_query_synthesizer | \n",
"
\n",
" \n",
" 5 | \n",
- " How does Ragas facilitate both test data gener... | \n",
- " [title: \"Part 3: Evaluating RAG Systems with R... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... | \n",
- " Ragas facilitates both test data generation an... | \n",
- " Ragas facilitates test data generation and syn... | \n",
+ " How does synthetic data generation contribute ... | \n",
+ " [In our next post, we'll explore how to genera... | \n",
+ " [<1-hop>\\n\\n## Why and How to Generate Synthet... | \n",
+ " I don't know. | \n",
+ " Synthetic data generation enables rapid creati... | \n",
" multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 6 | \n",
- " Wht speshulized evalushun metrix does Ragas pr... | \n",
- " [title: \"Part 2: Basic Evaluation Workflow wit... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... | \n",
- " Ragas provides several specialized evaluation ... | \n",
- " Ragas provieds speshulized evalushun metrix fo... | \n",
+ " How does Ragas support the evaluation of both ... | \n",
+ " [In our next post, we'll explore advanced metr... | \n",
+ " [<1-hop>\\n\\n# Create a sample\\nsample = Single... | \n",
+ " Ragas is an evaluation framework designed to s... | \n",
+ " Ragas supports the evaluation of LLM applicati... | \n",
" multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 7 | \n",
- " Which specialized metrics does Ragas provide f... | \n",
- " [title: \"Part 3: Evaluating RAG Systems with R... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Sys... | \n",
- " Ragas provides several specialized metrics for... | \n",
- " Ragas provides specialized metrics for evaluat... | \n",
+ " How does Metric-Driven Development (MDD) utili... | \n",
+ " [Reward functions embody the core MDD idea: se... | \n",
+ " [<1-hop>\\n\\n## What Exactly is Metric-Driven D... | \n",
+ " Metric-Driven Development (MDD) utilizes combi... | \n",
+ " Metric-Driven Development (MDD) utilizes combi... | \n",
" multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 8 | \n",
- " How does RAGAS facilitate metric-driven develo... | \n",
- " [title: \"Part 5: Advanced Metrics and Customiz... | \n",
- " [<1-hop>\\n\\nHow to Generate Synthetic Data for... | \n",
- " Ragas facilitates metric-driven development in... | \n",
- " RAGAS facilitates metric-driven development in... | \n",
- " multi_hop_specific_query_synthesizer | \n",
+ " How does Ragas provide specialized evaluation ... | \n",
+ " [In our next post, we'll explore advanced metr... | \n",
+ " [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https... | \n",
+ " Ragas provides specialized evaluation metrics ... | \n",
+ " Ragas is an open-source evaluation framework s... | \n",
+ " multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
" 9 | \n",
- " How does the use of Ragas facilitate the evalu... | \n",
- " [title: \"Part 3: Evaluating RAG Systems with R... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... | \n",
- " ### Ragas and Evaluation of RAG Systems\\n\\nRag... | \n",
- " Ragas facilitates the evaluation of Retrieval-... | \n",
- " multi_hop_specific_query_synthesizer | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " How can ChatOpenAI be integrated into a RAG ev... | \n",
- " [title: \"Part 5: Advanced Metrics and Customiz... | \n",
- " [<1-hop>\\n\\nHow to Generate Synthetic Data for... | \n",
- " To integrate ChatOpenAI into a RAG evaluation ... | \n",
- " ChatOpenAI can be integrated into a RAG evalua... | \n",
- " multi_hop_specific_query_synthesizer | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " How does Ragas facilitate the comprehensive ev... | \n",
- " [title: \"Part 3: Evaluating RAG Systems with R... | \n",
- " [<1-hop>\\n\\ntitle: \"Part 6: Evaluating AI Agen... | \n",
- " Ragas facilitates the comprehensive evaluation... | \n",
- " Ragas facilitates the comprehensive evaluation... | \n",
- " multi_hop_specific_query_synthesizer | \n",
+ " How do observability best practices contribute... | \n",
+ " [What excites me most is that my experience in... | \n",
+ " [<1-hop>\\n\\n## Best Practices for Observabilit... | \n",
+ " I don't know. | \n",
+ " Observability best practices, such as defining... | \n",
+ " multi_hop_abstract_query_synthesizer | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " user_input \\\n",
- "0 Does Ragas support integration with Langfuse? \n",
- "1 what Part 8: Building Feedback Loops do for LL... \n",
- "2 How does Ragas assist with Evaluation of RAG s... \n",
- "3 Wut is OpenAIEmbeddings used for? \n",
- "4 Wht are the key steps in the Ragas evalution w... \n",
- "5 How does Ragas facilitate both test data gener... \n",
- "6 Wht speshulized evalushun metrix does Ragas pr... \n",
- "7 Which specialized metrics does Ragas provide f... \n",
- "8 How does RAGAS facilitate metric-driven develo... \n",
- "9 How does the use of Ragas facilitate the evalu... \n",
- "10 How can ChatOpenAI be integrated into a RAG ev... \n",
- "11 How does Ragas facilitate the comprehensive ev... \n",
+ " user_input \\\n",
+ "0 How are Large Language Models integrated into ... \n",
+ "1 Howw does Ragas help evalute LLM applikations ... \n",
+ "2 What specialized metrics does Ragas provide fo... \n",
+ "3 me wanna know how LangSmith work with Ragas, l... \n",
+ "4 How do I use the OPENAI API key when initializ... \n",
+ "5 How does synthetic data generation contribute ... \n",
+ "6 How does Ragas support the evaluation of both ... \n",
+ "7 How does Metric-Driven Development (MDD) utili... \n",
+ "8 How does Ragas provide specialized evaluation ... \n",
+ "9 How do observability best practices contribute... \n",
"\n",
- " retrieved_contexts \\\n",
- "0 [title: \"Part 1: Introduction to Ragas: The Es... \n",
- "1 [title: \"Part 5: Advanced Metrics and Customiz... \n",
- "2 [title: \"Part 3: Evaluating RAG Systems with R... \n",
- "3 [title: \"Part 5: Advanced Metrics and Customiz... \n",
- "4 [title: \"Part 3: Evaluating RAG Systems with R... \n",
- "5 [title: \"Part 3: Evaluating RAG Systems with R... \n",
- "6 [title: \"Part 2: Basic Evaluation Workflow wit... \n",
- "7 [title: \"Part 3: Evaluating RAG Systems with R... \n",
- "8 [title: \"Part 5: Advanced Metrics and Customiz... \n",
- "9 [title: \"Part 3: Evaluating RAG Systems with R... \n",
- "10 [title: \"Part 5: Advanced Metrics and Customiz... \n",
- "11 [title: \"Part 3: Evaluating RAG Systems with R... \n",
+ " retrieved_contexts \\\n",
+ "0 [| **Harmfulness** | ... \n",
+ "1 [In our next post, we'll explore advanced metr... \n",
+ "2 [In our next post, we'll explore advanced metr... \n",
+ "3 [In our next post, we'll explore how to integr... \n",
+ "4 [In our next post, we'll explore how to integr... \n",
+ "5 [In our next post, we'll explore how to genera... \n",
+ "6 [In our next post, we'll explore advanced metr... \n",
+ "7 [Reward functions embody the core MDD idea: se... \n",
+ "8 [In our next post, we'll explore advanced metr... \n",
+ "9 [What excites me most is that my experience in... \n",
"\n",
- " reference_contexts \\\n",
- "0 [title: \"Part 1: Introduction to Ragas: The Es... \n",
- "1 [What's Coming in This Blog Series This introd... \n",
- "2 [title: \"Part 4: Generating Test Data with Rag... \n",
- "3 [How to Generate Synthetic Data for RAG Evalua... \n",
- "4 [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... \n",
- "5 [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... \n",
- "6 [<1-hop>\\n\\ntitle: \"Part 2: Basic Evaluation W... \n",
- "7 [<1-hop>\\n\\ntitle: \"Part 3: Evaluating RAG Sys... \n",
- "8 [<1-hop>\\n\\nHow to Generate Synthetic Data for... \n",
- "9 [<1-hop>\\n\\ntitle: \"Part 4: Generating Test Da... \n",
- "10 [<1-hop>\\n\\nHow to Generate Synthetic Data for... \n",
- "11 [<1-hop>\\n\\ntitle: \"Part 6: Evaluating AI Agen... \n",
+ " reference_contexts \\\n",
+ "0 [---\\ntitle: \"Part 1: Introduction to Ragas: T... \n",
+ "1 [## What is Ragas?\\n\\n[Ragas](https://docs.rag... \n",
+ "2 [Evaluation serves several key purposes:\\n- **... \n",
+ "3 [### 🧪 Test Data Generation\\nCreating high-qua... \n",
+ "4 [## Getting Started with Ragas\\n\\nInstalling R... \n",
+ "5 [<1-hop>\\n\\n## Why and How to Generate Synthet... \n",
+ "6 [<1-hop>\\n\\n# Create a sample\\nsample = Single... \n",
+ "7 [<1-hop>\\n\\n## What Exactly is Metric-Driven D... \n",
+ "8 [<1-hop>\\n\\n## What is Ragas?\\n\\n[Ragas](https... \n",
+ "9 [<1-hop>\\n\\n## Best Practices for Observabilit... \n",
"\n",
- " response \\\n",
- "0 Yes, Ragas supports integration with Langfuse,... \n",
- "1 Part 8: Building Feedback Loops with Ragas pro... \n",
- "2 Ragas assists with the evaluation of Retrieval... \n",
- "3 I don't know. \n",
- "4 ### Key Steps in the Ragas Evaluation Workflow... \n",
- "5 Ragas facilitates both test data generation an... \n",
- "6 Ragas provides several specialized evaluation ... \n",
- "7 Ragas provides several specialized metrics for... \n",
- "8 Ragas facilitates metric-driven development in... \n",
- "9 ### Ragas and Evaluation of RAG Systems\\n\\nRag... \n",
- "10 To integrate ChatOpenAI into a RAG evaluation ... \n",
- "11 Ragas facilitates the comprehensive evaluation... \n",
+ " response \\\n",
+ "0 I don't know. \n",
+ "1 I don't know. \n",
+ "2 I don't know. \n",
+ "3 I don't know. \n",
+ "4 To use the OPENAI API key when initializing an... \n",
+ "5 I don't know. \n",
+ "6 Ragas is an evaluation framework designed to s... \n",
+ "7 Metric-Driven Development (MDD) utilizes combi... \n",
+ "8 Ragas provides specialized evaluation metrics ... \n",
+ "9 I don't know. \n",
"\n",
- " reference \\\n",
- "0 Yes, Ragas works with observability platforms ... \n",
- "1 Part 8: Building Feedback Loops show how to im... \n",
- "2 Ragas helps generate robust test datasets for ... \n",
- "3 OpenAIEmbeddings is used as an embedding model... \n",
- "4 The key steps in the Ragas evaluation workflow... \n",
- "5 Ragas facilitates test data generation and syn... \n",
- "6 Ragas provieds speshulized evalushun metrix fo... \n",
- "7 Ragas provides specialized metrics for evaluat... \n",
- "8 RAGAS facilitates metric-driven development in... \n",
- "9 Ragas facilitates the evaluation of Retrieval-... \n",
- "10 ChatOpenAI can be integrated into a RAG evalua... \n",
- "11 Ragas facilitates the comprehensive evaluation... \n",
+ " reference \\\n",
+ "0 Large Language Models (LLMs) are becoming fund... \n",
+ "1 Ragas is an open-source evaluation framework d... \n",
+ "2 Ragas offers both LLM-based and computational ... \n",
+ "3 LangSmith is listed as one of the observabilit... \n",
+ "4 To use the OPENAI API key when initializing an... \n",
+ "5 Synthetic data generation enables rapid creati... \n",
+ "6 Ragas supports the evaluation of LLM applicati... \n",
+ "7 Metric-Driven Development (MDD) utilizes combi... \n",
+ "8 Ragas is an open-source evaluation framework s... \n",
+ "9 Observability best practices, such as defining... \n",
"\n",
- " synthesizer_name \n",
- "0 single_hop_specifc_query_synthesizer \n",
- "1 single_hop_specifc_query_synthesizer \n",
- "2 single_hop_specifc_query_synthesizer \n",
- "3 single_hop_specifc_query_synthesizer \n",
- "4 multi_hop_abstract_query_synthesizer \n",
- "5 multi_hop_abstract_query_synthesizer \n",
- "6 multi_hop_abstract_query_synthesizer \n",
- "7 multi_hop_abstract_query_synthesizer \n",
- "8 multi_hop_specific_query_synthesizer \n",
- "9 multi_hop_specific_query_synthesizer \n",
- "10 multi_hop_specific_query_synthesizer \n",
- "11 multi_hop_specific_query_synthesizer "
+ " synthesizer_name \n",
+ "0 single_hop_specifc_query_synthesizer \n",
+ "1 single_hop_specifc_query_synthesizer \n",
+ "2 single_hop_specifc_query_synthesizer \n",
+ "3 single_hop_specifc_query_synthesizer \n",
+ "4 single_hop_specifc_query_synthesizer \n",
+ "5 multi_hop_abstract_query_synthesizer \n",
+ "6 multi_hop_abstract_query_synthesizer \n",
+ "7 multi_hop_abstract_query_synthesizer \n",
+ "8 multi_hop_abstract_query_synthesizer \n",
+ "9 multi_hop_abstract_query_synthesizer "
]
},
- "execution_count": 46,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -918,7 +982,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "f5d50d7b",
"metadata": {},
"outputs": [],
@@ -928,43 +992,40 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 12,
"id": "fb7d4a45",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8960aa00f8a94925938bb108e127cf12",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Evaluating: 0%| | 0/60 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"name": "stderr",
"output_type": "stream",
"text": [
- "Evaluating: 28%|██▊ | 20/72 [06:08<15:53, 18.34s/it]Exception raised in Job[1]: TimeoutError()\n",
- "Evaluating: 29%|██▉ | 21/72 [06:29<16:21, 19.25s/it]Exception raised in Job[7]: TimeoutError()\n",
- "Exception raised in Job[6]: TimeoutError()\n",
- "Exception raised in Job[10]: TimeoutError()\n",
- "Exception raised in Job[4]: TimeoutError()\n",
- "Exception raised in Job[13]: TimeoutError()\n",
- "Evaluating: 38%|███▊ | 27/72 [06:30<04:09, 5.56s/it]Exception raised in Job[16]: TimeoutError()\n",
- "Evaluating: 39%|███▉ | 28/72 [06:32<03:39, 4.98s/it]Exception raised in Job[18]: TimeoutError()\n",
- "Evaluating: 40%|████ | 29/72 [06:33<03:06, 4.34s/it]Exception raised in Job[22]: TimeoutError()\n",
- "Evaluating: 43%|████▎ | 31/72 [07:01<05:18, 7.77s/it]Exception raised in Job[24]: TimeoutError()\n",
- "Evaluating: 44%|████▍ | 32/72 [07:01<04:00, 6.00s/it]Exception raised in Job[25]: TimeoutError()\n",
- "Evaluating: 51%|█████▏ | 37/72 [07:47<05:36, 9.60s/it]Exception raised in Job[28]: TimeoutError()\n",
- "Evaluating: 56%|█████▌ | 40/72 [10:11<15:29, 29.06s/it]Exception raised in Job[30]: TimeoutError()\n",
- "Evaluating: 60%|█████▉ | 43/72 [10:53<09:33, 19.76s/it]Exception raised in Job[31]: TimeoutError()\n",
- "Evaluating: 61%|██████ | 44/72 [11:40<13:01, 27.90s/it]Exception raised in Job[34]: TimeoutError()\n",
- "Evaluating: 62%|██████▎ | 45/72 [12:37<16:28, 36.62s/it]Exception raised in Job[35]: TimeoutError()\n",
- "Evaluating: 64%|██████▍ | 46/72 [12:37<11:11, 25.83s/it]Exception raised in Job[36]: TimeoutError()\n",
- "Evaluating: 65%|██████▌ | 47/72 [12:59<10:12, 24.50s/it]Exception raised in Job[37]: TimeoutError()\n",
- "Exception raised in Job[40]: TimeoutError()\n",
- "Exception raised in Job[42]: TimeoutError()\n",
- "Evaluating: 69%|██████▉ | 50/72 [13:00<04:01, 10.98s/it]Exception raised in Job[43]: TimeoutError()\n",
- "Evaluating: 72%|███████▏ | 52/72 [13:06<02:40, 8.03s/it]Exception raised in Job[46]: TimeoutError()\n",
- "Evaluating: 76%|███████▋ | 55/72 [13:34<02:04, 7.30s/it]Exception raised in Job[48]: TimeoutError()\n",
- "Evaluating: 79%|███████▉ | 57/72 [13:40<01:20, 5.35s/it]Exception raised in Job[49]: TimeoutError()\n",
- "Evaluating: 82%|████████▏ | 59/72 [13:51<01:17, 5.99s/it]Exception raised in Job[52]: TimeoutError()\n",
- "Evaluating: 88%|████████▊ | 63/72 [16:26<04:02, 26.94s/it]Exception raised in Job[54]: TimeoutError()\n",
- "Evaluating: 89%|████████▉ | 64/72 [16:35<02:51, 21.44s/it]Exception raised in Job[55]: TimeoutError()\n",
- "Evaluating: 99%|█████████▊| 71/72 [18:50<00:20, 20.68s/it]Exception raised in Job[64]: TimeoutError()\n",
- "Evaluating: 100%|██████████| 72/72 [19:28<00:00, 16.24s/it]\n"
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29843, Requested 1360. Please try again in 2.406s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29165, Requested 1477. Please try again in 1.284s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 30000, Requested 1733. Please try again in 3.466s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 30000, Requested 1837. Please try again in 3.674s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29338, Requested 1730. Please try again in 2.136s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29225, Requested 2003. Please try again in 2.456s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[34]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29778, Requested 1498. Please try again in 2.552s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29207, Requested 1958. Please try again in 2.33s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29319, Requested 2005. Please try again in 2.648s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[40]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29964, Requested 1557. Please try again in 3.042s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29893, Requested 2010. Please try again in 3.805s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
+ "Exception raised in Job[43]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-XyIrJZx8AUgCzADN6GcfWGO6 on tokens per min (TPM): Limit 30000, Used 29579, Requested 1876. Please try again in 2.91s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
]
}
],
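The stderr stream in this hunk shows the evaluation run repeatedly hitting the 30,000 tokens-per-minute limit for gpt-4.1, so several jobs fail with `RateLimitError` rather than completing. A minimal sketch of one way to throttle the run with Ragas' `RunConfig`, assuming the `evaluation_dataset`, `metrics`, and `evaluator_llm` objects defined earlier in the notebook (the names are illustrative and the numbers are starting points, not recommendations):

```python
# Sketch: throttle the Ragas evaluation to stay under a tight TPM limit.
from ragas import evaluate
from ragas.run_config import RunConfig

run_config = RunConfig(
    max_workers=4,    # default is 16; fewer parallel calls means fewer 429s
    max_retries=10,   # retry rate-limited calls instead of dropping the job
    max_wait=60,      # cap the exponential backoff at 60 seconds
    timeout=180,      # per-call timeout in seconds
)

result = evaluate(
    dataset=evaluation_dataset,  # assumed: the EvaluationDataset built above
    metrics=metrics,             # assumed: the metric list used in this cell
    llm=evaluator_llm,           # assumed: the evaluator LLM wrapper
    run_config=run_config,
)
```

Trading parallelism for fewer failed jobs also makes the aggregate scores in the next cell less noisy, since samples whose metric calls fail otherwise end up as NaN.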
@@ -974,17 +1035,17 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 13,
"id": "74aab82a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness(mode=f1)': 0.4225, 'answer_relevancy': 0.8779, 'context_entity_recall': 0.1056, 'noise_sensitivity(mode=relevant)': 0.4647}"
+ "{'context_recall': 0.1905, 'faithfulness': 0.8545, 'factual_correctness(mode=f1)': 0.2490, 'answer_relevancy': 0.3892, 'context_entity_recall': 0.1503, 'noise_sensitivity(mode=relevant)': 0.2540}"
]
},
- "execution_count": 49,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
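The summary above is the default repr of the `EvaluationResult` returned by `evaluate()`, and its keys correspond one-to-one to the metric objects passed in. A brief sketch of how a result like this is typically produced and then inspected per sample (variable names are assumptions; the metric classes are inferred from the keys shown):

```python
# Sketch: a metric list matching the summary keys above, plus per-row inspection.
from ragas import evaluate
from ragas.metrics import (
    LLMContextRecall,      # -> 'context_recall'
    Faithfulness,          # -> 'faithfulness'
    FactualCorrectness,    # -> 'factual_correctness(mode=f1)'
    ResponseRelevancy,     # -> 'answer_relevancy'
    ContextEntityRecall,   # -> 'context_entity_recall'
    NoiseSensitivity,      # -> 'noise_sensitivity(mode=relevant)'
)

result = evaluate(
    dataset=evaluation_dataset,  # assumed: the dataset evaluated above
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,           # assumed: the evaluator LLM wrapper
)

# The repr prints aggregate scores; per-sample scores are easier to debug as a DataFrame.
results_df = result.to_pandas()
print(results_df[["user_input", "faithfulness", "answer_relevancy"]].head())
```

Per-row scores make it easier to check, for instance, whether the "I don't know." responses visible in the dataset above are what is pulling answer_relevancy down, rather than the retriever itself.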
@@ -995,15 +1056,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "11e71a82",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"id": "49fa29f2",
"metadata": {},
"outputs": [],