Elron committed on
Commit 2537530 · verified · 1 Parent(s): 03d14e5

Upload folder using huggingface_hub

inference.py CHANGED
@@ -2098,6 +2098,7 @@ class RITSInferenceEngine(
         "deepseek-ai/DeepSeek-V3": "deepseek-v3-h200",
         "meta-llama/Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
         "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
+        "mistralai/Mistral-Small-3.1-24B-Instruct-2503": "mistral-small-3-1-24b-2503",
     }
 
     def get_default_headers(self):
@@ -3522,8 +3523,10 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-2-90b-vision-instruct": "meta-llama/llama-3-2-90b-vision-instruct",
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
             "llama-guard-3-11b-vision": "meta-llama/llama-guard-3-11b-vision",
-            "mistral-large-instruct": "mistralai/mistral-large",
             "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7b-instruct-v01",
+            "mistral-small-instruct": "mistralai/mistral-small-3-1-24b-instruct-2503",
+            "mistral-medium-instruct": "mistralai/mistral-medium-2505",
+            "mistral-large-instruct": "mistralai/mistral-large",
         },
         "together-ai": { # checked from https://www.together.ai/models
             "llama-3-8b-instruct": "together_ai/meta-llama/Llama-3-8b-chat-hf",
@@ -3583,12 +3586,14 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
             "llama-4-scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
             "llama-4-maverick": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
+            "mistral-small-instruct": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
             "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
            "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
            "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7B-instruct-v0.1",
            "deepseek-v3": "deepseek-ai/DeepSeek-V3",
            "granite-guardian-3-2-3b-a800m": "ibm-granite/granite-guardian-3.2-3b-a800m",
            "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b",
+            "phi-4": "microsoft/phi-4",
         },
         "open-ai": {
             "o1-mini": "o1-mini",
llm_as_judge_constants.py CHANGED
@@ -1032,6 +1032,126 @@ class DirectCriteriaCatalogEnum(Enum):
         bigger_is_better=False,
     )
 
+    STEP_BY_STEP_REASONING_BAD_GRAMMAR = get_yes_no_criteria(
+        name="step_by_step_reasoning_bad_grammar",
+        description="Does this step contain any faulty, unconventional, or controversial grammar usage? In other words, does the language in this step sound unnatural?",
+        prediction_field="step",
+        context_fields=[],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_NON_FACTUAL = get_yes_no_criteria(
+        name="step_by_step_reasoning_non_factual",
+        description="Does this step contain any information that contradicts the context while still largely talking about the same concepts? (Ex. Characteristics of named objects are wrong, named entities changed.)",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_NON_COHERENT = get_yes_no_criteria(
+        name="step_by_step_reasoning_non_coherent",
+        description="Does this step contain any logical deduction errors (Ie, makes a conclusion contradictory to previously stated clauses, including clauses within this step itself; makes a conclusion while not having enough support to make the conclusion)",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_BAD_FINAL_ANSWER = get_yes_no_criteria(
+        name="step_by_step_reasoning_bad_final_answer",
+        description="Does this step contain a final step with an incorrect final answer? (If an explicit 'yes/no' is not provided, an exact match of the correct answer with respect to the question in the context must be given.)",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_HALLUCINATION = get_yes_no_criteria(
+        name="step_by_step_reasoning_hallucination",
+        description="Does this step contain any information not provided in the problem statement that is irrelevant or wrong?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_REDUNDANCY = get_yes_no_criteria(
+        name="step_by_step_reasoning_redundancy",
+        description="Does this step contain any information not required to answer the question asked despite being factual and consistent with the context?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_REPETITION = get_yes_no_criteria(
+        name="step_by_step_reasoning_repetition",
+        description="Does this step contain any information, possibly paraphrased, already mentioned in previous step (and thus could be dropped without impacting correctness)?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_COMMONSENSE = get_yes_no_criteria(
+        name="step_by_step_reasoning_commonsense",
+        description="Does this step contain any errors in relation to general knowledge about the world (i.e. how to compute velocity, how many inches in one foot, etc) not explicitly provided in the context?",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
+    STEP_BY_STEP_REASONING_ARITHMETIC = get_yes_no_criteria(
+        name="step_by_step_reasoning_arithmetic",
+        description="Does this step contain any math equation errors? Note that you should consider only current step in isolation, rather than issues propagated from prior steps.",
+        prediction_field="step",
+        context_fields=[
+            "question",
+            "premise",
+            "hypothesis",
+            "model reasoning",
+            "correct answer",
+        ],
+        bigger_is_better=False,
+    )
+
     REFERENCE_DOCUMENT_FAITHFULNESS = CriteriaWithOptions(
         name="reference_document_faithfulness",
         description="Is the response faithful according to reference document?",
@@ -1412,12 +1532,12 @@ class DirectCriteriaCatalogEnum(Enum):
     LOGICAL_VALIDITY_OF_REASONING = CriteriaWithOptions(
         name="logical_validity_of_reasoning",
         description=(
-            "Assess whether the model's reasoning is logically valid when solving problems "
+            "Assess whether the model reasoning is logically valid when solving problems "
             "in propositional logic. The reasoning should follow correct logical principles "
             "and lead to a valid conclusion based on the given premises."
         ),
-        prediction_field="reasoning",
-        context_fields=[],
+        prediction_field="model reasoning",
+        context_fields=["problem statement", "statements"],
         options=[
             CriteriaOption(
                 name="Yes",
@@ -1433,6 +1553,54 @@ class DirectCriteriaCatalogEnum(Enum):
             "No": 0.0,
         },
     )
+    TRANSLATION_QUALITY = CriteriaWithOptions(
+        name="translation_quality",
+        description=(
+            "On a scale from 0 to 6, is the translation of the source text accurate, fluent, comprehensible and free of errors?\n"
+            """Accuracy: How well does the translation convey the original meaning and content of the source text?
+            Fluency: How natural and idiomatic is the translation in terms of grammar, syntax, and phrasing?
+            Comprehensibility: How easily can the translation be understood by a native speaker of the target language?
+            Errors: Are there any errors in grammar, vocabulary, punctuation, or formatting that affect the overall quality of the translation?"""
+        ),
+        prediction_field="translation",
+        context_fields=[
+            "source language",
+            "target language",
+            "source text",
+            "reference translation",
+        ],
+        options=[
+            CriteriaOption(
+                name="0",
+                description="Nonsense/No meaning preserved: Nearly all information is lost between the translation and the source text. Grammar is irrelevant.",
+            ),
+            CriteriaOption(
+                name="1",
+                description="Minimal Meaning Preserved: Only isolated fragments of meaning are retained. The translation is largely incoherent and fails to convey the main ideas. Grammar is poor or broken.",
+            ),
+            CriteriaOption(
+                name="2",
+                description="Some Meaning Preserved: The translation preserves some of the meaning of the source text but misses significant parts. The narrative is hard to follow due to fundamental errors. Grammar may be poor.",
+            ),
+            CriteriaOption(
+                name="3",
+                description="Moderate Meaning Preserved: The core message is partially conveyed, but there are frequent issues with grammar, fluency, or comprehension that impact the overall readability and accuracy.",
+            ),
+            CriteriaOption(
+                name="4",
+                description="Most Meaning Preserved and Few Grammar Mistakes: The translation retains most of the meaning of the source text. It may have some grammar mistakes or minor contextual inconsistencies.",
+            ),
+            CriteriaOption(
+                name="5",
+                description="Nearly Perfect: The translation is highly accurate and mostly fluent. Only very minor grammar or phrasing issues are present, and they do not hinder understanding.",
+            ),
+            CriteriaOption(
+                name="6",
+                description="Perfect Meaning and Grammar: The meaning of the translation is completely consistent with the source text and the surrounding context (if applicable). The grammar is also correct.",
+            ),
+        ],
+        option_map={str(i): i / 6 for i in range(7)},
+    )
 
 
 DIRECT_CRITERIA = [c.value for c in DirectCriteriaCatalogEnum]
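
As a quick sanity check on the new TRANSLATION_QUALITY criterion, its option_map comprehension spreads the seven labels evenly over [0, 1]; the expansion below is just an illustration of what that dict evaluates to:

# option_map={str(i): i / 6 for i in range(7)} expands to (rounded):
# {"0": 0.0, "1": 0.167, "2": 0.333, "3": 0.5, "4": 0.667, "5": 0.833, "6": 1.0}
option_map = {str(i): i / 6 for i in range(7)}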
operators.py CHANGED
@@ -537,7 +537,9 @@ class InstanceFieldOperator(InstanceOperator):
             old_value = self.get_default
 
         with error_context(
-            self, field=from_field, action="Process Field", value=old_value
+            self,
+            field=from_field,
+            action="Process Field",
         ):
             if self.process_every_value:
                 new_value = [
struct_data_operators.py CHANGED
@@ -798,7 +798,9 @@ class ToolCallPostProcessor(FieldOperator):
         result = json.loads(value, strict=False)
         if isoftype(result, List[ToolCall]):
             if len(result) > 1:
-                UnitxtWarning(f"More than one tool returned from model: {result}")
+                UnitxtWarning(f"More than one tool call returned from model: {result}")
+                return self.failure_value
+            if len(result) == 0:
                 return self.failure_value
             return result[0]
         if not isoftype(result, ToolCall):
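
The fixed branch now rejects both an empty list and a list with several tool calls. An illustrative stand-alone version of that logic (not the actual unitxt class; failure_value stands in for the operator's configured fallback, and the further type validation is omitted):

import json
from typing import Any

def postprocess_tool_call(value: str, failure_value: Any = None) -> Any:
    # Mirrors the rules visible in the hunk above.
    result = json.loads(value, strict=False)
    if isinstance(result, list):
        if len(result) > 1:   # several tool calls: unitxt warns, then fails
            return failure_value
        if len(result) == 0:  # no tool call at all: fail as well
            return failure_value
        return result[0]      # exactly one tool call: unwrap it
    return result             # single object: type checks omitted in this sketch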
text2sql_utils.py CHANGED
@@ -4,7 +4,6 @@ import hashlib
 import json
 import os
 import re
-import sqlite3
 import time
 from abc import ABC, abstractmethod
 from collections import Counter
@@ -21,6 +20,12 @@ from requests.exceptions import ConnectionError, ReadTimeout
 from .logging_utils import get_logger
 from .types import SQLDatabase
 
+try:
+    import sqlite3
+except ImportError:
+    sqlite3 = None
+
+
 logger = get_logger()
 
 # Check if caching is enabled via environment variable
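
With sqlite3 now imported optionally, code paths that actually need it presumably have to check for None first. A small sketch of that guard (the function name is hypothetical, not part of this diff):

def open_sqlite_connection(path: str):
    # Fail loudly only when local SQLite execution is actually requested.
    if sqlite3 is None:
        raise ImportError("sqlite3 is not available in this Python installation")
    return sqlite3.connect(path)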
version.py CHANGED
@@ -1 +1 @@
-version = "1.26.3"
+version = "1.26.4"