File size: 7,254 Bytes
1df0250
d3e2c23
 
 
0e1cbae
d3e2c23
0e1cbae
8b7797a
d3e2c23
8a3751a
 
 
 
 
d3e2c23
8a3751a
 
d3e2c23
8a3751a
d3e2c23
 
 
 
2bf3482
 
 
 
 
 
 
 
 
d3e2c23
 
 
 
 
 
 
 
027f46b
41584a2
 
 
 
 
 
 
 
 
 
 
 
2bf3482
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
46715c0
d3e2c23
 
1df0250
 
 
 
 
 
 
 
ef8661d
1df0250
d3e2c23
93cec36
d3e2c23
 
 
 
 
 
 
 
 
1df0250
d3e2c23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b4947e
d3e2c23
 
 
 
1df0250
 
 
 
f5d4ce9
 
 
8b7797a
f5d4ce9
195bb6b
 
1df0250
 
 
93cec36
1df0250
 
93cec36
1df0250
 
 
 
 
 
 
 
d3e2c23
 
93cec36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# this is version two with flagging features
import os

import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# initialize the environment
model_name = 'anugrahap/gpt2-indo-textgen'
# SECURITY FIX: the HF API token was previously hard-coded in this file.
# A committed token is public and must be revoked/rotated on the account;
# read it from the environment instead (set HF_TOKEN in the Space secrets).
HF_TOKEN = os.environ.get('HF_TOKEN', '')
# callback that autosaves flagged outputs to the "gpt2-output" HF dataset
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "gpt2-output")

# define the tokenization method
# model_max_length is effectively unbounded so the tokenizer never truncates;
# padding_side='right' matches the causal-LM training convention
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          model_max_length=1e30,
                                          padding_side='right',
                                          return_tensors='pt')

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# single shared text-generation pipeline used by single_generation below
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# create the decoder parameter to generate the text
def single_generation(text,min_length,max_length,temperature,top_k,top_p,num_beams,repetition_penalty,do_sample):
    # create local variable for error parameter    
    error_rep=gr.Error(f"ERROR: repetition penalty cannot be lower than one! Given rep penalty = {repetition_penalty}")
    error_temp=gr.Error(f"ERROR: temperature cannot be zero or lower! Given temperature = {temperature}")
    error_minmax=gr.Error(f"ERROR: min length must be lower than or equal to max length! Given min length = {min_length}")
    error_numbeams_type=gr.Error(f"ERROR: number of beams must be an integer not {type(num_beams)}")
    error_topk_type=gr.Error(f"ERROR: top k must be an integer not {type(top_k)}")
    error_minmax_type=gr.Error(f"ERROR: min length and max length must be an integer not {type(min_length)} and {type(max_length)}")
    error_empty_temprep=gr.Error("ERROR: temperature and repetition penalty cannot be empty!")
    error_empty_text=gr.Error("ERROR: Input Text cannot be empty!")
    error_unknown=gr.Error("Unknown Error.")

    if text != '':
        if type(min_length) == int and type(max_length) == int:
            if type(top_k) == int:
                if type(num_beams) == int:
                    if min_length <= max_length:
                        if temperature > 0:
                            if repetition_penalty >= 1:
                                if temperature and repetition_penalty is not None:
                                    result = generator(text,
                                                       min_length=min_length,
                                                       max_length=max_length,
                                                       temperature=temperature,
                                                       top_k=top_k,
                                                       top_p=top_p,
                                                       num_beams=num_beams,
                                                       repetition_penalty=repetition_penalty,
                                                       do_sample=do_sample,
                                                       no_repeat_ngram_size=2,
                                                       num_return_sequences=1)
                                    return result[0]["generated_text"]
                                elif temperature or repetition_penalty is None:
                                    raise error_empty_temprep
                            elif repetition_penalty < 1:
                                raise error_rep
                        elif temperature <= 0:
                            raise error_temp
                    elif min_length > max_length:
                        raise error_minmax
                elif type(num_beams) != int:
                    raise error_numbeams_type
            elif type(top_k) != int:
                raise error_topk_type
        elif type(min_length) != int or type(max_length) != int:
            raise error_minmax_type  
    elif text == '':
        raise error_empty_text
    else:
        raise error_unknown


# create the variable needed for the gradio app
# Input widgets, one per parameter of single_generation. Order matters:
# it must match the function's positional signature exactly.
forinput=[gr.Textbox(lines=5, label="Input Text"),
        gr.Slider(label="Min Length", minimum=10, maximum=50, value=10, step=5),
        gr.Slider(label="Max Length", minimum=10, maximum=100, value=30, step=10),
        gr.Number(label="Temperature Sampling", value=1.5),
        gr.Slider(label="Top K Sampling", minimum=0, maximum=100, value=30, step=5),
        gr.Slider(label="Top P Sampling", minimum=0.01, maximum=1, value=0.93),
        gr.Slider(label="Number of Beams", minimum=1, maximum=10, value=5, step=1),
        gr.Number(label="Repetition Penalty", value=2.0),
        gr.Dropdown(label="Do Sample?", choices=[True,False], value=True, multiselect=False)]

# single output textbox showing the generated continuation
foroutput=gr.Textbox(lines=5, max_lines=50, label="Generated Text with Greedy/Beam Search Decoding")

# Preset example rows; each list supplies all nine inputs in the same order
# as forinput (text, min/max length, temperature, top_k, top_p, num_beams,
# repetition_penalty, do_sample).
examples = [
    ["Indonesia adalah negara kepulauan", 10, 30, 1.0, 25, 0.92, 5, 2.0, True],
    ["Indonesia adalah negara kepulauan", 10, 30, 1.0, 25, 0.92, 5, 1.0, False],
    ["Skripsi merupakan tugas akhir mahasiswa", 20, 40, 1.0, 50, 0.92, 1, 2.0, True],
    ["Skripsi merupakan tugas akhir mahasiswa", 20, 40, 1.0, 50, 0.92, 1, 1.0, False],
    ["Pemandangan di pantai kuta Bali sangatlah indah.", 30, 50, 0.5, 40, 0.98, 10, 1.0, True],
    ["Pemandangan di pantai kuta Bali sangatlah indah.", 10, 30, 1.5, 30, 0.93, 5, 2.0, True]]

# Page chrome rendered by gr.Interface: `title` is raw HTML/CSS (logo +
# centered heading), `description` a one-line subtitle, `article` the footer
# links. These are runtime string literals — edit with care.
title = """
    <style>
    .center {
    display: block;
    margin-top: 20px;
    margin-down: 0px;
    margin-left: auto;
    margin-right: auto;
    }
    </style>
    <style>
    h1 {
    text-align: center;
    margin-top: 0px;
    }
    </style>
    <img src="https://i.postimg.cc/cHPVPSfH/Q-GEN-logo.png"
    alt="Q-GEN Logo"
    border="0" 
    class="center"
    style="height: 100px; width: 100px;"/>
    <h1>GPT-2 Indonesian Text Generation Playground</h1>"""

description = "<p><i>This project is a part of thesis requirement of Anugrah Akbar Praramadhan</i></p>"

# footer: model/repo/dataset/paper links plus training provenance and copyright
article = """<p style='text-align: center'>
    <a href='https://huggingface.co/anugrahap/gpt2-indo-textgen' target='_blank'>Link to the Trained Model<b>&nbsp;|</b></a>
    <a href='https://huggingface.co/spaces/anugrahap/gpt2-indo-text-gen/tree/main' target='_blank'>Link to the Project Repository<b>&nbsp;|</b></a>
    <a href='https://huggingface.co/datasets/anugrahap/gpt2-output/' target='_blank'>Link to the Autosaved Generated Output<b>&nbsp;|</b></a>
    <a href='https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf' target='_blank'>Original Paper</a><br></p>
    <p style='text-align: center'> Trained on Indo4B Benchmark Dataset of Indonesian language Wikipedia with a Causal Language Modeling (CLM) objective<br></p>
    <p style='text-align: center'>Copyright Anugrah Akbar Praramadhan 2023</p>
    """

# using gradio interfaces
# Wire the generation function, widgets, examples and page chrome together.
# Manual flagging is enabled with five labels; flagged rows are pushed to the
# HF dataset through hf_writer.
app = gr.Interface(
    fn=single_generation,
    inputs=forinput,
    outputs=foroutput,
    examples=examples,
    title=title,
    description=description,
    article=article,
    allow_flagging='manual',
    flagging_options=['Well Performed', 'Inappropriate Word Selection', 'Wordy', 'Strange Word', 'Others'],
    flagging_callback=hf_writer)


# launch the web app only when this file is run as a script
if __name__=='__main__':
    app.launch()