forrestbao committed
Commit a6b0504 · 1 Parent(s): 4e7ea3a

polish in-app message

Files changed (2)
  1. app.py +42 -30
  2. requirements.txt +5 -32
app.py CHANGED
@@ -1,46 +1,58 @@
-# %%
+# Forrest Sheng Bao
+# 2025-05-25
+# forrest@vectara.com
+
 from typing import List, Literal
-from pydantic import BaseModel
 from IPython.display import display, Markdown
-
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from transformers import pipeline
-
-# %%
-class HHEMOutput(BaseModel):
-    score: float # we need score for ROC curve
-    label: Literal[0,1]
-
-# %%
-PROMPT_TEMPLATE = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
-
-CHECKPOINT = "vectara/hallucination_evaluation_model"
-FOUNDATION = "google/flan-t5-small"
-
-tokenizer = AutoTokenizer.from_pretrained(FOUNDATION)
-classifier = pipeline("text-classification", model=CHECKPOINT, tokenizer=tokenizer, trust_remote_code=True)
-
-def predict(premise: str, hypothesis: str) -> Markdown:
-
-    texts_prompted: List[str] = [PROMPT_TEMPLATE.format(text1=premise, text2=hypothesis)]
-
-    full_scores = classifier(texts_prompted, top_k=None) # List[List[Dict[str, float]]]
-
-    # Optional: extract the scores for the 'consistent' label
-    simple_scores = [score_dict['score'] for score_for_both_labels in full_scores for score_dict in score_for_both_labels if score_dict['label'] == 'consistent']
-
-    threshold = 0.5
-    preds = [0 if s < threshold else 1 for s in simple_scores]
-
-    output = HHEMOutput(score=simple_scores[0], label=preds[0])
-    verdict = "consistent" if output.label == 1 else "hallucinated"
-
-    output_string = f"""
-**Premise**: {premise}
-
-**Hypothesis**: {hypothesis}
-
-**HHEM's judgement is**: {verdict} **with the score**: {output.score}
+from transformers import AutoModelForSequenceClassification
+
+hhem = AutoModelForSequenceClassification.from_pretrained('vectara/hallucination_evaluation_model', trust_remote_code=True)
+
+def HHEM(
+    LLM_Prompt: str = "The sky is blue.",
+    LLM_Response: str = "The ocean is blue."
+) -> Markdown:
+    """# GUI demo for Vectara's HHEM-2.1-Open
+
+    Vectara's Hughes Hallucination Evaluation Model (HHEM) evaluates how well an LLM's output (called the "response" or the "hypothesis") is faithful to, grounded in, or supported by the input given to it (called the "prompt" or the "premise"). HHEM has two versions: [HHEM-Open](https://huggingface.co/vectara/hallucination_evaluation_model) and [HHEM Commercial](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model).
+
+    To use the demo, fill in the "LLM_Prompt" and "LLM_Response" fields and click the run button. A placeholder example is prefilled for you. Feel free to replace it with your own examples and evaluate them.
+
+    Args:
+        LLM_Prompt (str): a.k.a. the "premise".
+        LLM_Response (str): a.k.a. the "hypothesis".
+    """
+    pairs = [(LLM_Prompt, LLM_Response)]
+
+    score = hhem.predict(pairs)[0]
+
+    verdict = "consistent" if score > 0.5 else "hallucinated"
+
+    output_string = f"""
+Given the **prompt**:
+
+> {LLM_Prompt}
+
+and
+
+the **response**:
+> {LLM_Response}
+
+HHEM's **judgement** is: <u>{verdict}</u> **with the score**: <u>{score:0.3f}</u>.
+
+Wanna know why? Check out [Vectara's Hallucination Corrector (VHC)](https://hcm.demo.vectara.dev/).
+
+Note that in the industry, there are generally two definitions of hallucination:
+* faithfulness: the LLM's response contains information not supported by the prompt given to it.
+* factuality: the LLM's response is not true per world knowledge.
+
+In HHEM, we take the faithfulness definition.
+
+See also:
+* [HHEM Leaderboard](https://huggingface.co/spaces/vectara/leaderboard)
+* [Source code of this app](https://huggingface.co/spaces/vectara/hhem-2.1-open-demo/tree/main)
+
 """
 
-    return Markdown(output_string)
+    return output_string
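For readers who want to exercise the new scoring path outside the Funix GUI, here is a minimal sketch distilled from the diff above. It assumes, as app.py now does, that the checkpoint's remote code (loaded via `trust_remote_code=True`) provides a `predict()` method returning one consistency score in [0, 1] per (premise, response) pair; the example pairs are hypothetical, with the last one chosen to illustrate the faithfulness-vs-factuality distinction drawn in the docstring.

```python
# A minimal sketch, assuming (as app.py does above) that the checkpoint's
# remote code supplies .predict() on (premise, response) pairs.
from transformers import AutoModelForSequenceClassification

hhem = AutoModelForSequenceClassification.from_pretrained(
    "vectara/hallucination_evaluation_model", trust_remote_code=True
)

# Hypothetical example pairs. The last response is factually true but
# unsupported by its premise, so under the faithfulness definition it
# should still be flagged as hallucinated.
pairs = [
    ("The sky is blue.", "The ocean is blue."),
    ("I am in California.", "I am in the United States."),
    ("The sky is blue.", "Paris is the capital of France."),
]

scores = hhem.predict(pairs)  # one consistency score in [0, 1] per pair
for (premise, response), score in zip(pairs, scores):
    score = float(score)  # scores may come back as tensor elements
    verdict = "consistent" if score > 0.5 else "hallucinated"
    print(f"{response!r} given {premise!r}: {verdict} ({score:0.3f})")
```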
requirements.txt CHANGED
@@ -1,32 +1,5 @@
-APScheduler==3.10.1
-black==23.11.0
-click==8.1.7
-datasets==2.14.5
-gradio==4.44.0
-gradio_client==1.3.0
-huggingface-hub>=0.18.0
-litellm==1.15.1
-matplotlib==3.7.1
-numpy==1.26.4
-pandas==2.0.0
-python-dateutil==2.8.2
-requests==2.31.0
-tqdm==4.66.5
-tokenizers>=0.15.0
-sentence-transformers==2.2.2
-google-generativeai
-replicate
-anthropic
-openai
-cohere
-mistralai
-peft
-markdown-it-py
-mdit_plain
-google-cloud-aiplatform>=1.38
-qwen-vl-utils
-vertexai
-# git+https://github.com/huggingface/transformers
-transformers==4.45.2
-together==1.3.0
-spacy
+funix==0.6.2
+ipython
+pydantic
+transformers
+peft
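The trimmed dependency list mirrors the rewrite of app.py: `funix` (pinned at 0.6.2) serves the `HHEM()` function as the web UI in place of `gradio`, `ipython` supplies the `Markdown` return-type hint, `transformers` loads the model, and `peft` is presumably needed by the checkpoint's remote code. Assuming Funix's standard CLI, the demo should launch locally with `pip install -r requirements.txt` followed by `funix app.py`.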