Elron committed on
Commit 2537530 · verified · 1 Parent(s): 03d14e5

Upload folder using huggingface_hub

inference.py CHANGED
@@ -2098,6 +2098,7 @@ class RITSInferenceEngine(
         "deepseek-ai/DeepSeek-V3": "deepseek-v3-h200",
         "meta-llama/Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
         "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
+        "mistralai/Mistral-Small-3.1-24B-Instruct-2503": "mistral-small-3-1-24b-2503",
     }
 
     def get_default_headers(self):
@@ -3522,8 +3523,10 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-2-90b-vision-instruct": "meta-llama/llama-3-2-90b-vision-instruct",
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
             "llama-guard-3-11b-vision": "meta-llama/llama-guard-3-11b-vision",
-            "mistral-large-instruct": "mistralai/mistral-large",
             "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7b-instruct-v01",
+            "mistral-small-instruct": "mistralai/mistral-small-3-1-24b-instruct-2503",
+            "mistral-medium-instruct": "mistralai/mistral-medium-2505",
+            "mistral-large-instruct": "mistralai/mistral-large",
         },
         "together-ai": { # checked from https://www.together.ai/models
             "llama-3-8b-instruct": "together_ai/meta-llama/Llama-3-8b-chat-hf",
@@ -3583,12 +3586,14 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
             "llama-4-scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
             "llama-4-maverick": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
+            "mistral-small-instruct": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
             "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
            "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
            "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7B-instruct-v0.1",
            "deepseek-v3": "deepseek-ai/DeepSeek-V3",
            "granite-guardian-3-2-3b-a800m": "ibm-granite/granite-guardian-3.2-3b-a800m",
            "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b",
+            "phi-4": "microsoft/phi-4",
         },
         "open-ai": {
             "o1-mini": "o1-mini",
llm_as_judge_constants.py CHANGED
@@ -1032,6 +1032,126 @@ class DirectCriteriaCatalogEnum(Enum):
         bigger_is_better=False,
     )
 
+    STEP_BY_STEP_REASONING_BAD_GRAMMAR = get_yes_no_criteria(
+        name="step_by_step_reasoning_bad_grammar",
+        description="Does this step contain any faulty, unconventional, or controversial grammar usage? In other words, does the language in this step sound unnatural?",
+        prediction_field="step",
+        context_fields=[],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_NON_FACTUAL = get_yes_no_criteria(
+        name="step_by_step_reasoning_non_factual",
+        description="Does this step contain any information that contradicts the context while still largely talking about the same concepts? (Ex. Characteristics of named objects are wrong, named entities changed.)",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_NON_COHERENT = get_yes_no_criteria(
+        name="step_by_step_reasoning_non_coherent",
+        description="Does this step contain any logical deduction errors (Ie, makes a conclusion contradictory to previously stated clauses, including clauses within this step itself; makes a conclusion while not having enough support to make the conclusion)",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_BAD_FINAL_ANSWER = get_yes_no_criteria(
+        name="step_by_step_reasoning_bad_final_answer",
+        description="Does this step contain a final step with an incorrect final answer? (If an explicit 'yes/no' is not provided, an exact match of the correct answer with respect to the question in the context must be given.)",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_HALLUCINATION = get_yes_no_criteria(
+        name="step_by_step_reasoning_hallucination",
+        description="Does this step contain any information not provided in the problem statement that is irrelevant or wrong?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_REDUNDANCY = get_yes_no_criteria(
+        name="step_by_step_reasoning_redundancy",
+        description="Does this step contain any information not required to answer the question asked despite being factual and consistent with the context?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_REPETITION = get_yes_no_criteria(
+        name="step_by_step_reasoning_repetition",
+        description="Does this step contain any information, possibly paraphrased, already mentioned in previous step (and thus could be dropped without impacting correctness)?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_COMMONSENSE = get_yes_no_criteria(
+        name="step_by_step_reasoning_commonsense",
+        description="Does this step contain any errors in relation to general knowledge about the world (i.e. how to compute velocity, how many inches in one foot, etc) not explicitly provided in the context?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_ARITHMETIC = get_yes_no_criteria(
+        name="step_by_step_reasoning_arithmetic",
+        description="Does this step contain any math equation errors? Note that you should consider only current step in isolation, rather than issues propagated from prior steps.",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
     REFERENCE_DOCUMENT_FAITHFULNESS = CriteriaWithOptions(
         name="reference_document_faithfulness",
         description="Is the response faithful according to reference document?",
@@ -1412,12 +1532,12 @@ class DirectCriteriaCatalogEnum(Enum):
     LOGICAL_VALIDITY_OF_REASONING = CriteriaWithOptions(
         name="logical_validity_of_reasoning",
         description=(
-            "Assess whether the model's reasoning is logically valid when solving problems "
+            "Assess whether the model reasoning is logically valid when solving problems "
             "in propositional logic. The reasoning should follow correct logical principles "
             "and lead to a valid conclusion based on the given premises."
         ),
-        prediction_field="reasoning",
-        context_fields=[],
+        prediction_field="model reasoning",
+        context_fields=["problem statement", "statements"],
         options=[
             CriteriaOption(
                 name="Yes",
@@ -1433,6 +1553,54 @@ class DirectCriteriaCatalogEnum(Enum):
             "No": 0.0,
         },
     )
+    TRANSLATION_QUALITY = CriteriaWithOptions(
+        name="translation_quality",
+        description=(
+            "On a scale from 0 to 6, is the translation of the source text accurate, fluent, comprehensible and free of errors?\n"
+            """Accuracy: How well does the translation convey the original meaning and content of the source text?
+            Fluency: How natural and idiomatic is the translation in terms of grammar, syntax, and phrasing?
+            Comprehensibility: How easily can the translation be understood by a native speaker of the target language?
+            Errors: Are there any errors in grammar, vocabulary, punctuation, or formatting that affect the overall quality of the translation?"""
+        ),
+        prediction_field="translation",
+        context_fields=[
+            "source language",
+            "target language",
+            "source text",
+            "reference translation",
+        ],
+        options=[
+            CriteriaOption(
+                name="0",
+                description="Nonsense/No meaning preserved: Nearly all information is lost between the translation and the source text. Grammar is irrelevant.",
+            ),
+            CriteriaOption(
+                name="1",
+                description="Minimal Meaning Preserved: Only isolated fragments of meaning are retained. The translation is largely incoherent and fails to convey the main ideas. Grammar is poor or broken.",
+            ),
+            CriteriaOption(
+                name="2",
+                description="Some Meaning Preserved: The translation preserves some of the meaning of the source text but misses significant parts. The narrative is hard to follow due to fundamental errors. Grammar may be poor.",
+            ),
+            CriteriaOption(
+                name="3",
+                description="Moderate Meaning Preserved: The core message is partially conveyed, but there are frequent issues with grammar, fluency, or comprehension that impact the overall readability and accuracy.",
+            ),
+            CriteriaOption(
+                name="4",
+                description="Most Meaning Preserved and Few Grammar Mistakes: The translation retains most of the meaning of the source text. It may have some grammar mistakes or minor contextual inconsistencies.",
+            ),
+            CriteriaOption(
+                name="5",
+                description="Nearly Perfect: The translation is highly accurate and mostly fluent. Only very minor grammar or phrasing issues are present, and they do not hinder understanding.",
+            ),
+            CriteriaOption(
+                name="6",
+                description="Perfect Meaning and Grammar: The meaning of the translation is completely consistent with the source text and the surrounding context (if applicable). The grammar is also correct.",
+            ),
+        ],
+        option_map={str(i): i / 6 for i in range(7)},
+    )
 
 
 DIRECT_CRITERIA = [c.value for c in DirectCriteriaCatalogEnum]
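
As a quick sanity check on the new TRANSLATION_QUALITY criterion, its option_map comprehension spreads the seven labels evenly over [0, 1]; the expansion below is just an illustration of what that dict evaluates to:

# option_map={str(i): i / 6 for i in range(7)} expands to (rounded):
# {"0": 0.0, "1": 0.167, "2": 0.333, "3": 0.5, "4": 0.667, "5": 0.833, "6": 1.0}
option_map = {str(i): i / 6 for i in range(7)}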
operators.py CHANGED
@@ -537,7 +537,9 @@ class InstanceFieldOperator(InstanceOperator):
             old_value = self.get_default
 
         with error_context(
-            self, field=from_field, action="Process Field", value=old_value
+            self,
+            field=from_field,
+            action="Process Field",
         ):
             if self.process_every_value:
                 new_value = [
struct_data_operators.py CHANGED
@@ -798,7 +798,9 @@ class ToolCallPostProcessor(FieldOperator):
         result = json.loads(value, strict=False)
         if isoftype(result, List[ToolCall]):
             if len(result) > 1:
-                UnitxtWarning(f"More than one tool returned from model: {result}")
+                UnitxtWarning(f"More than one tool call returned from model: {result}")
+                return self.failure_value
+            if len(result) == 0:
                 return self.failure_value
             return result[0]
         if not isoftype(result, ToolCall):
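
The fixed branch now rejects both an empty list and a list with several tool calls. An illustrative stand-alone version of that logic (not the actual unitxt class; failure_value stands in for the operator's configured fallback, and the further type validation is omitted):

import json
from typing import Any

def postprocess_tool_call(value: str, failure_value: Any = None) -> Any:
    # Mirrors the rules visible in the hunk above.
    result = json.loads(value, strict=False)
    if isinstance(result, list):
        if len(result) > 1:   # several tool calls: unitxt warns, then fails
            return failure_value
        if len(result) == 0:  # no tool call at all: fail as well
            return failure_value
        return result[0]      # exactly one tool call: unwrap it
    return result             # single object: type checks omitted in this sketch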
text2sql_utils.py CHANGED
@@ -4,7 +4,6 @@ import hashlib
 import json
 import os
 import re
-import sqlite3
 import time
 from abc import ABC, abstractmethod
 from collections import Counter
@@ -21,6 +20,12 @@ from requests.exceptions import ConnectionError, ReadTimeout
 from .logging_utils import get_logger
 from .types import SQLDatabase
 
+try:
+    import sqlite3
+except ImportError:
+    sqlite3 = None
+
+
 logger = get_logger()
 
 # Check if caching is enabled via environment variable
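
With sqlite3 now imported optionally, code paths that actually need it presumably have to check for None first. A small sketch of that guard (the function name is hypothetical, not part of this diff):

def open_sqlite_connection(path: str):
    # Fail loudly only when local SQLite execution is actually requested.
    if sqlite3 is None:
        raise ImportError("sqlite3 is not available in this Python installation")
    return sqlite3.connect(path)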
version.py CHANGED
@@ -1 +1 @@
-version = "1.26.3"
+version = "1.26.4"