File size: 2,606 Bytes
953bf3d
 
debfcf8
 
 
 
 
 
 
 
 
 
 
2f5c740
 
 
 
 
 
d7f1630
 
 
 
 
 
2f5c740
d7f1630
ce419f1
b1d6e77
d7f1630
 
730ef21
 
 
d552000
 
b463202
2f5c740
d7f1630
dfc8b82
debfcf8
 
d7f1630
d7bea84
debfcf8
 
80ae653
debfcf8
2f5c740
 
9226230
80ae653
dfc8b82
b1d6e77
 
debfcf8
 
80ae653
 
debfcf8
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
import os

import pandas as pd
from datasets import load_dataset


from transformers import T5ForConditionalGeneration, T5Tokenizer
# Device for the reward model; set to 'cuda' if a GPU is available.
device = 'cpu' # change to 'cuda' if you have a GPU

# SteamSHP: a FLAN-T5-based preference model trained on the Stanford Human
# Preferences dataset; used below to judge which of two responses is better.
tokenizer = T5Tokenizer.from_pretrained('stanfordnlp/SteamSHP-flan-t5-large')
model = T5ForConditionalGeneration.from_pretrained('stanfordnlp/SteamSHP-flan-t5-large').to(device)

# Instruction-tuned models whose stored outputs can be compared in the UI.
model_list = [
    'google/flan-t5-xxl',
    'bigscience/bloomz-7b1',
    'facebook/opt-iml-max-30b',
    'allenai/tk-instruct-11b-def-pos',
]

# Access token for the Hugging Face Hub (needed to read the outputs dataset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Dataset of pre-generated, filtered model outputs per prompt.
OUTPUTS_DATASET = "HuggingFaceH4/instruction-pilot-outputs-filtered"

ds = load_dataset(OUTPUTS_DATASET, split="train", use_auth_token=HF_TOKEN)

def process(model_A, model_B):
    """Sample one prompt and ask the SteamSHP reward model which of the two
    selected models produced the better response.

    Args:
        model_A: model id shown as "RESPONSE A" (must exist in the dataset).
        model_B: model id shown as "RESPONSE B".

    Returns:
        A tuple ``(prompt, responses_df, preferred)`` where ``responses_df``
        contains the rows for the two chosen models and ``preferred`` is the
        single token generated by the reward model (expected "A" or "B").

    Raises:
        ValueError: if either model has no stored output for the sampled prompt.
    """
    # Draw a single random example from the filtered-outputs dataset.
    sample = ds.shuffle().select(range(1))[0]
    prompt = sample["prompt"]

    df = pd.DataFrame.from_records(sample["filtered_outputs"])

    def _output_for(model_name):
        # Fail with a clear message (instead of a bare IndexError) when the
        # sampled prompt has no output for the requested model.
        outputs = df.loc[df["model"] == model_name, "output"].values
        if len(outputs) == 0:
            raise ValueError(f"No output found for model {model_name!r}")
        return outputs[0]

    response_A = _output_for(model_A)
    response_B = _output_for(model_B)

    # SteamSHP expects the SHP-style "POST / RESPONSE A / RESPONSE B" template.
    input_text = "POST: "+ prompt+ "\n\n RESPONSE A: "+response_A+"\n\n RESPONSE B: "+response_B+"\n\n Which response is better? RESPONSE"
    input_ids = tokenizer([input_text], return_tensors='pt').input_ids.to(device)
    generated = model.generate(input_ids, max_new_tokens=1)
    preferred = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    return prompt, df[df['model'].isin([model_A, model_B])], preferred

title = "Compare Instruction Models to see which one is more helpful"
# NOTE: the f-string is required so {OUTPUTS_DATASET} is interpolated into the
# dataset link (the original plain string rendered the placeholder literally);
# the space in "[...] (https..." that broke the SHP Markdown link is removed.
description = (
    "This app compares the outputs of various open-source, instruction-trained "
    f"models from a [dataset](https://huggingface.co/datasets/{OUTPUTS_DATASET}) "
    "of human demonstrations using a reward model trained on the "
    "[Stanford Human Preferences Dataset (SHP)]"
    "(https://huggingface.co/datasets/stanfordnlp/SHP)"
)

interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Dropdown(choices=model_list, value=model_list[0], label='Model A'),
        gr.Dropdown(choices=model_list, value=model_list[1], label='Model B'),
    ],
    outputs=[
        gr.Textbox(label="Prompt"),
        gr.DataFrame(label="Model Responses"),
        gr.Textbox(label="Preferred Option"),
    ],
    title=title,
    description=description,
)

interface.launch(debug=True)