# NOTE(review): removed non-Python residue that preceded this script — a copied
# file-viewer gutter ("File size: 6,975 Bytes", git commit hashes, and a 1-140
# line-number column). It was not part of the program and broke the file as Python.
#this is version two with flagging features
import os

import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# initialize the environment
model_name = 'anugrahap/gpt2-indo-textgen'
# SECURITY FIX: a real Hugging Face API token was hardcoded here. A committed
# token is compromised — revoke it in the HF settings page and supply a fresh
# one through the environment (e.g. a Space secret named HF_TOKEN) instead.
HF_TOKEN = os.environ.get('HF_TOKEN', '')
# Flagged examples are appended to this HF dataset repo by the flagging callback.
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "output-gpt2-indo-textgen")

# define the tokenization method
# model_max_length=1e30 effectively disables the tokenizer's truncation limit.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          model_max_length=1e30,
                                          padding_side='right',
                                          return_tensors='pt')

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# single shared text-generation pipeline used by single_generation below
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# create the decoder parameter to generate the text
def single_generation(text,min_length,max_length,temperature,top_k,top_p,num_beams,repetition_penalty,do_sample):
    """Generate one continuation of `text` with the module-level `generator`.

    Parameters mirror the Gradio input widgets (in order). On success the
    generated string is returned. On invalid input an exception *instance*
    (not raised) is returned, which Gradio renders in the output textbox —
    this preserves the original contract of the app.
    """
    # Flattened guard clauses; the checks run in the same order as the
    # original nested version, so the first failing condition wins.
    if text == '':
        return ValueError("ERROR: Input Text cannot be empty!")
    # `type(x) is int` (like the original `type(x) == int`) deliberately
    # rejects floats AND bools, unlike isinstance.
    if type(min_length) is not int or type(max_length) is not int:
        return TypeError(f"ERROR: min length and max length must be an integer not {type(min_length)} and {type(max_length)}")
    if type(top_k) is not int:
        return TypeError(f"ERROR: top k must be an integer not {type(top_k)}")
    if type(num_beams) is not int:
        return TypeError(f"ERROR: number of beams must be an integer not {type(num_beams)}")
    if min_length > max_length:
        return ValueError(f"ERROR: min length must be lower than or equal to max length! Given min length = {min_length}")
    if temperature <= 0:
        return ValueError(f"ERROR: temperature cannot be zero or lower! Given temperature = {temperature}")
    if repetition_penalty < 1:
        return ValueError(f"ERROR: repetition penalty cannot be lower than one! Given rep penalty = {repetition_penalty}")

    # All parameters validated — run the decoder. no_repeat_ngram_size=2 and
    # num_return_sequences=1 are fixed, as in the original.
    result = generator(text,
                       min_length=min_length,
                       max_length=max_length,
                       temperature=temperature,
                       top_k=top_k,
                       top_p=top_p,
                       num_beams=num_beams,
                       repetition_penalty=repetition_penalty,
                       do_sample=do_sample,
                       no_repeat_ngram_size=2,
                       num_return_sequences=1)
    return result[0]["generated_text"]


# create the variable needed for the gradio app
# Input widgets, in the exact positional order that single_generation expects:
# (text, min_length, max_length, temperature, top_k, top_p, num_beams,
#  repetition_penalty, do_sample). Slider/Number defaults are the UI defaults.
forinput=[gr.Textbox(lines=5, label="Input Text"),
        gr.Slider(label="Min Length", minimum=10, maximum=50, value=10, step=5),
        gr.Slider(label="Max Length", minimum=10, maximum=100, value=30, step=10),
        gr.Number(label="Temperature Sampling", value=1.5),
        gr.Slider(label="Top K Sampling", minimum=0, maximum=100, value=30, step=5),
        gr.Slider(label="Top P Sampling", minimum=0.01, maximum=1, value=0.93),
        gr.Slider(label="Number of Beams", minimum=1, maximum=10, value=5, step=1),
        gr.Number(label="Repetition Penalty", value=2.0),
        gr.Dropdown(label="Do Sample?", choices=[True,False], value=True, multiselect=False)]

# Single output textbox for the generated (or error) text.
foroutput=gr.Textbox(lines=5, max_lines=50, label="Generated Text with Greedy/Beam Search Decoding")

# Pre-filled example rows; columns match forinput's order:
# [text, min_length, max_length, temperature, top_k, top_p, num_beams,
#  repetition_penalty, do_sample]
examples = [
    ["Indonesia adalah negara kepulauan", 10, 30, 1.0, 25, 0.92, 5, 2.0, True],
    ["Indonesia adalah negara kepulauan", 10, 30, 1.0, 25, 0.92, 5, 1.0, False],
    ["Skripsi merupakan tugas akhir mahasiswa", 20, 40, 1.0, 50, 0.92, 1, 2.0, True],
    ["Skripsi merupakan tugas akhir mahasiswa", 20, 40, 1.0, 50, 0.92, 1, 1.0, False],
    ["Pemandangan di pantai kuta Bali sangatlah indah.", 30, 50, 0.5, 40, 0.98, 10, 1.0, True],
    ["Pemandangan di pantai kuta Bali sangatlah indah.", 10, 30, 1.5, 30, 0.93, 5, 2.0, True]]

# HTML/CSS header rendered by gr.Interface: centered logo image plus a title.
# FIX: `margin-down` is not a CSS property (it was silently ignored);
# the intended property is `margin-bottom`.
title = """
    <style>
    .center {
    display: block;
    margin-top: 20px;
    margin-bottom: 0px;
    margin-left: auto;
    margin-right: auto;
    }
    </style>
    <style>
    h1 {
    text-align: center;
    margin-top: 0px;
    }
    </style>
    <img src="https://i.postimg.cc/cHPVPSfH/Q-GEN-logo.png"
    alt="Q-GEN Logo"
    border="0" 
    class="center"
    style="height: 100px; width: 100px;"/>
    <h1>GPT-2 Indonesian Text Generation Playground</h1>"""

# Short subtitle shown under the title.
description = "<p><i>This project is a part of thesis requirement of Anugrah Akbar Praramadhan</i></p>"

# Footer links (model, repo, flagged-output dataset, paper) and credits.
article = """
    <p style='text-align: center'><a href='https://huggingface.co/anugrahap/gpt2-indo-textgen' target='_blank'>Link to the Trained Model</a><br>
    <p style='text-align: center'><a href='https://huggingface.co/spaces/anugrahap/gpt2-indo-text-gen/tree/main' target='_blank'>Link to the Project Repository</a><br>
    <p style='text-align: center'><a href='https://huggingface.co/datasets/anugrahap/output-gpt2-indo-textgen/' target='_blank'>Link to the Autosaved Generated Output</a><br>
    <p style='text-align: center'><a href='https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf' target='_blank'>Original Paper</a>
    <p style='text-align: center'>Copyright Anugrah Akbar Praramadhan 2023 <br>
    <p style='text-align: center'> Trained on Indo4B Benchmark Dataset of Indonesian language Wikipedia with a Causal Language Modeling (CLM) objective <br>
    """

# using gradio interfaces
# Wire the widgets to single_generation. Flagging is manual: the user picks one
# of the flagging_options and the row is appended to the HF dataset by hf_writer.
# NOTE(review): allow_flagging/flagging_callback are gradio 3.x kwargs — confirm
# the pinned gradio version before upgrading (gradio 4 renamed this API).
app = gr.Interface(
    fn=single_generation,
    inputs=forinput,
    outputs=foroutput,
    examples=examples,
    title=title,
    description=description,
    article=article,
    allow_flagging='manual',
    flagging_options=['Well Performed', 'Inappropriate Word Selection', 'Wordy', 'Strange Word', 'Others'],
    flagging_callback=hf_writer)


# Launch the UI only when run as a script (not when imported).
if __name__=='__main__':
    app.launch()