forrestbao committed
Commit a6b0504 · 1 Parent(s): 4e7ea3a

polish in-app message

Files changed (2)
  1. app.py +42 -30
  2. requirements.txt +5 -32
app.py CHANGED
@@ -1,46 +1,58 @@
-# %%
+# Forrest Sheng Bao
+# 2025-05-25
+# forrest@vectara.com
+
 from typing import List, Literal
-from pydantic import BaseModel
 from IPython.display import display, Markdown
-
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from transformers import pipeline
-
-# %%
-class HHEMOutput(BaseModel):
-    score: float # we need score for ROC curve
-    label: Literal[0,1]
-
-# %%
-PROMPT_TEMPLATE = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
-
-CHECKPOINT = "vectara/hallucination_evaluation_model"
-FOUNDATION = "google/flan-t5-small"
-
-tokenizer = AutoTokenizer.from_pretrained(FOUNDATION)
-classifier = pipeline("text-classification", model=CHECKPOINT, tokenizer=tokenizer, trust_remote_code=True)
-
-def predict(premise: str, hypothesis: str) -> Markdown:
-
-    texts_prompted: List[str] = [PROMPT_TEMPLATE.format(text1=premise, text2=hypothesis)]
-
-    full_scores = classifier(texts_prompted, top_k=None) # List[List[Dict[str, float]]]
-
-    # Optional: extract the scores for the 'consistent' label
-    simple_scores = [score_dict['score'] for score_for_both_labels in full_scores for score_dict in score_for_both_labels if score_dict['label'] == 'consistent']
-
-    threshold = 0.5
-    preds = [0 if s < threshold else 1 for s in simple_scores]
-
-    output = HHEMOutput(score=simple_scores[0], label=preds[0])
-    verdict = "consistent" if output.label == 1 else "hallucinated"
-
-    output_string = f"""
-**Premise**: {premise}
-
-**Hypothesis**: {hypothesis}
-
-**HHEM's judgement is**: {verdict} **with the score**: {output.score}
+from transformers import AutoModelForSequenceClassification
+
+hhem = AutoModelForSequenceClassification.from_pretrained('vectara/hallucination_evaluation_model', trust_remote_code=True)
+
+def HHEM(
+    LLM_Prompt: str = "The sky is blue.",
+    LLM_Response: str = "The ocean is blue."
+) -> Markdown:
+    """# GUI demo for Vectara's HHEM-2.1-Open
+
+    Vectara's Hughes Hallucination Evaluation Model (HHEM) evaluates how well an LLM's output (called the "response" or the "hypothesis") is faithful to, grounded in, or supported by the input given to it (called the "prompt" or the "premise"). HHEM has two versions: [HHEM-Open](https://huggingface.co/vectara/hallucination_evaluation_model) and [HHEM Commercial](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model).
+
+    To use the demo, fill in the "LLM_Prompt" and "LLM_Response" fields and click the run button. A placeholder example is prefilled for you. Feel free to replace it with your own examples and evaluate them.
+
+    Args:
+        LLM_Prompt (str): a.k.a. the "premise".
+        LLM_Response (str): a.k.a. the "hypothesis".
+    """
+    pairs = [(LLM_Prompt, LLM_Response)]
+
+    score = hhem.predict(pairs)[0]
+
+    verdict = "consistent" if score > 0.5 else "hallucinated"
+
+    output_string = f"""
+Given the **prompt**:
+
+> {LLM_Prompt}
+
+and
+
+the **response**:
+> {LLM_Response}
+
+HHEM's **judgement** is: <u>{verdict}</u> **with the score**: <u>{score:0.3f}</u>.
+
+Wanna know why? Check out [Vectara's Hallucination Corrector (VHC)](https://hcm.demo.vectara.dev/).
+
+Note that in the industry, there are generally two definitions of hallucination:
+* faithfulness: the LLM's response contains information not supported by the prompt given to it.
+* factuality: the LLM's response is not true per world knowledge.
+
+In HHEM, we take the faithfulness definition.
+
+See also:
+* [HHEM Leaderboard](https://huggingface.co/spaces/vectara/leaderboard)
+* [Source code of this app](https://huggingface.co/spaces/vectara/hhem-2.1-open-demo/tree/main)
+
 """
 
-    return Markdown(output_string)
+    return output_string
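For readers who want to exercise the new scoring path outside the Funix GUI, here is a minimal sketch distilled from the diff above. It assumes, as app.py now does, that the checkpoint's remote code (loaded via `trust_remote_code=True`) provides a `predict()` method returning one consistency score in [0, 1] per (premise, response) pair; the example pairs are hypothetical, with the last one chosen to illustrate the faithfulness-vs-factuality distinction drawn in the docstring.

```python
# A minimal sketch, assuming (as app.py does above) that the checkpoint's
# remote code supplies .predict() on (premise, response) pairs.
from transformers import AutoModelForSequenceClassification

hhem = AutoModelForSequenceClassification.from_pretrained(
    "vectara/hallucination_evaluation_model", trust_remote_code=True
)

# Hypothetical example pairs. The last response is factually true but
# unsupported by its premise, so under the faithfulness definition it
# should still be flagged as hallucinated.
pairs = [
    ("The sky is blue.", "The ocean is blue."),
    ("I am in California.", "I am in the United States."),
    ("The sky is blue.", "Paris is the capital of France."),
]

scores = hhem.predict(pairs)  # one consistency score in [0, 1] per pair
for (premise, response), score in zip(pairs, scores):
    score = float(score)  # scores may come back as tensor elements
    verdict = "consistent" if score > 0.5 else "hallucinated"
    print(f"{response!r} given {premise!r}: {verdict} ({score:0.3f})")
```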
requirements.txt CHANGED
@@ -1,32 +1,5 @@
-APScheduler==3.10.1
-black==23.11.0
-click==8.1.7
-datasets==2.14.5
-gradio==4.44.0
-gradio_client==1.3.0
-huggingface-hub>=0.18.0
-litellm==1.15.1
-matplotlib==3.7.1
-numpy==1.26.4
-pandas==2.0.0
-python-dateutil==2.8.2
-requests==2.31.0
-tqdm==4.66.5
-tokenizers>=0.15.0
-sentence-transformers==2.2.2
-google-generativeai
-replicate
-anthropic
-openai
-cohere
-mistralai
-peft
-markdown-it-py
-mdit_plain
-google-cloud-aiplatform>=1.38
-qwen-vl-utils
-vertexai
-# git+https://github.com/huggingface/transformers
-transformers==4.45.2
-together==1.3.0
-spacy
+funix==0.6.2
+ipython
+pydantic
+transformers
+peft
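The trimmed dependency list mirrors the rewrite of app.py: `funix` (pinned at 0.6.2) serves the `HHEM()` function as the web UI in place of `gradio`, `ipython` supplies the `Markdown` return-type hint, `transformers` loads the model, and `peft` is presumably needed by the checkpoint's remote code. Assuming Funix's standard CLI, the demo should launch locally with `pip install -r requirements.txt` followed by `funix app.py`.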