KaizeShi committed on
Commit
97d3016
·
1 Parent(s): e348d3e

Add application file

Files changed (2)
  1. README.md +1 -1
  2. app.py +129 -89
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 👀
  colorFrom: gray
  colorTo: blue
  sdk: gradio
- sdk_version: 4.0.0
+ sdk_version: 3.21.0
  app_file: app.py
  pinned: false
  license: llama2
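
The README change pins the Space back to Gradio 3.21.0, matching the Gradio 3-era APIs the rewritten app.py below relies on (gr.components.*, queue(concurrency_count=1), which Gradio 4 no longer accepts). Below is a minimal, hypothetical version guard, not part of this commit, sketching how such a pin could be checked at startup:

    # Hypothetical guard (not in the commit): fail fast if the running Gradio
    # does not match the sdk_version pinned in the README front matter.
    import gradio

    if not gradio.__version__.startswith("3.21"):
        raise RuntimeError(f"Expected Gradio 3.21.x, found {gradio.__version__}")
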
app.py CHANGED
@@ -1,108 +1,148 @@
- import os
- import json
- import subprocess
- from threading import Thread
-
- import torch
  import spaces
  from peft import PeftModel
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
- from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

- MODEL_ID = "meta-llama/Llama-2-7b-hf"
- CHAT_TEMPLATE = os.environ.get("CHAT_TEMPLATE")
- CONTEXT_LENGTH = int(os.environ.get("CONTEXT_LENGTH"))
- COLOR = os.environ.get("COLOR")
- DESCRIPTION = os.environ.get("DESCRIPTION")
- LORA_WEIGHTS = "DSMI/LLaMA-E"
  access_token = os.environ.get('HF_TOKEN')

- @spaces.GPU(duration=120)
- def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-     # Format history with a given chat template
-     if CHAT_TEMPLATE == "Auto":
-         stop_tokens = [tokenizer.eos_token_id]
-         instruction = []
-         for user, assistant in history:
-             instruction.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-         instruction.append({"role": "user", "content": message})
-     elif CHAT_TEMPLATE == "ChatML":
-         stop_tokens = ["<|endoftext|>", "<|im_end|>"]
-         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
-         for user, assistant in history:
-             instruction += '<|im_start|>user\n' + user + '\n<|im_end|>\n<|im_start|>assistant\n' + assistant
-         instruction += '\n<|im_start|>user\n' + message + '\n<|im_end|>\n<|im_start|>assistant\n'
-     elif CHAT_TEMPLATE == "Mistral Instruct":
-         stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
-         instruction = '<s>[INST] ' + system_prompt
-         for user, assistant in history:
-             instruction += user + ' [/INST] ' + assistant + '</s>[INST]'
-         instruction += ' ' + message + ' [/INST]'
-     else:
-         raise Exception("Incorrect chat template, select 'ChatML' or 'Mistral Instruct'")
-     print(instruction)
-
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
-     input_ids, attention_mask = enc.input_ids, enc.attention_mask

-     if input_ids.shape[1] > CONTEXT_LENGTH:
-         input_ids = input_ids[:, -CONTEXT_LENGTH:]

-     generate_kwargs = dict(
-         {"input_ids": input_ids.to(device), "attention_mask": attention_mask.to(device)},
-         streamer=streamer,
-         do_sample=True,
-         temperature=temperature,
-         max_new_tokens=max_new_tokens,
-         top_k=top_k,
-         repetition_penalty=repetition_penalty,
-         top_p=top_p
-     )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-     outputs = []
-     for new_token in streamer:
-         outputs.append(new_token)
-         if new_token in stop_tokens:
-             break
-         yield "".join(outputs)


- # Load model
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=False,
-     bnb_4bit_compute_dtype=torch.bfloat16
- )
- tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=access_token)
- model = LlamaForCausalLM.from_pretrained(
-     MODEL_ID,
      load_in_8bit=False,
      torch_dtype=torch.float16,
      device_map="auto",
  )
-
- model = PeftModel.from_pretrained(
      model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
  )

- # Create Gradio interface
- gr.ChatInterface(
-     predict,
-     title= "🦙🛍️ LLaMA-E",
-     description=DESCRIPTION,
-     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
-     additional_inputs=[
-         gr.Textbox("You are HelpingAI a emotional AI always answer my question in HelpingAI style", label="System prompt"),
-         gr.Slider(0, 1, 0.8, label="Temperature"),
-         gr.Slider(128, 4096, 1024, label="Max new tokens"),
-         gr.Slider(1, 80, 40, label="Top K sampling"),
-         gr.Slider(0, 2, 1.1, label="Repetition penalty"),
-         gr.Slider(0, 1, 0.95, label="Top P sampling"),
      ],
-     theme=gr.themes.Soft(primary_hue=COLOR),
- ).queue().launch()

  import spaces
+ import torch
  from peft import PeftModel
+ import transformers
  import gradio as gr
+ import os


+ assert (
+     "LlamaTokenizer" in transformers._import_structure["models.llama"]
+ ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
  access_token = os.environ.get('HF_TOKEN')

+ tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=access_token)

+ BASE_MODEL = "meta-llama/Llama-2-7b-hf"
+ LORA_WEIGHTS = "DSMI/LLaMA-E"

+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"

+ try:
+     if torch.backends.mps.is_available():
+         device = "mps"
+ except:
+     pass

+ print("Device: " + str(device))
+
+ if device == "cuda":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
          load_in_8bit=False,
          torch_dtype=torch.float16,
          device_map="auto",
      )
+     model = PeftModel.from_pretrained(
          model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
      )
+ elif device == "mps":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+ else:
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         low_cpu_mem_usage=True
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+     )
+
+ print("Model: " + str(model))
+
+ def generate_prompt(instruction, input=None):
+     if input:
+         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Input:
+ {input}
+ ### Response:"""
+     else:
+         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Response:"""

+ if device != "cpu":
+     model.half()
+ model.eval()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+ @spaces.GPU()
+ def evaluate(
+     instruction,
+     input=None,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=2,
+     max_new_tokens=64,
+     **kwargs,
+ ):
+     prompt = generate_prompt(instruction, input)
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     s = generation_output.sequences[0]
+     output = tokenizer.decode(s)
+     return output.split("### Response:")[1].strip()
+
+
+ g = gr.Interface(
+     fn=evaluate,
+     inputs=[
+         gr.components.Textbox(
+             lines=2, label="Instruction", placeholder="Tell me about alpacas."
+         ),
+         gr.components.Textbox(lines=2, label="Input", placeholder="none"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
+         gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
+         gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
+         gr.components.Slider(
+             minimum=1, maximum=512, step=1, value=128, label="Max tokens"
+         ),
      ],
+     outputs=[
+         gr.Textbox(
+             lines=5,
+             label="Output",
+         )
+     ],
+     title="🦙🛍️ LLaMA-E",
+     description="LLaMA-E is a series of fine-tuned LLaMA models following E-commerce instructions. It is developed by DSMI (http://dsmi.tech/) @ University of Technology Sydney, and trained on the 120k instruction set. This model is for academic research use only. For more details please contact: Kaize.Shi@uts.edu.au",
+ )
+ g.queue(concurrency_count=1)
+ g.launch()
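
For reference, a minimal usage sketch of the two entry points the new app.py defines, assuming the module above has been loaded so that generate_prompt and evaluate exist; the instruction and input strings are invented placeholders, not taken from the commit:

    # Hypothetical example calls (not in the commit); the strings are placeholders.
    prompt = generate_prompt(
        "Write a product title.",                   # example instruction
        input="wireless earbuds, 40-hour battery",  # example context
    )
    print(prompt)  # Alpaca-style template ending in "### Response:"

    # evaluate() builds the same prompt internally, runs beam-search generation,
    # and returns only the text after "### Response:".
    answer = evaluate(
        "Write a product title.",
        input="wireless earbuds, 40-hour battery",
        num_beams=2,
        max_new_tokens=64,
    )
    print(answer)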