KaizeShi committed on
Commit
97d3016
·
1 Parent(s): e348d3e

Add application file

Files changed (2)
  1. README.md +1 -1
  2. app.py +129 -89
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 👀
  colorFrom: gray
  colorTo: blue
  sdk: gradio
- sdk_version: 4.0.0
+ sdk_version: 3.21.0
  app_file: app.py
  pinned: false
  license: llama2
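
The README change pins the Space back to Gradio 3.21.0, matching the Gradio 3-era APIs the rewritten app.py below relies on (gr.components.*, queue(concurrency_count=1), which Gradio 4 no longer accepts). Below is a minimal, hypothetical version guard, not part of this commit, sketching how such a pin could be checked at startup:

    # Hypothetical guard (not in the commit): fail fast if the running Gradio
    # does not match the sdk_version pinned in the README front matter.
    import gradio

    if not gradio.__version__.startswith("3.21"):
        raise RuntimeError(f"Expected Gradio 3.21.x, found {gradio.__version__}")
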
app.py CHANGED
@@ -1,108 +1,148 @@
- import os
- import json
- import subprocess
- from threading import Thread
-
- import torch
  import spaces
  from peft import PeftModel
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
- from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

- MODEL_ID = "meta-llama/Llama-2-7b-hf"
- CHAT_TEMPLATE = os.environ.get("CHAT_TEMPLATE")
- CONTEXT_LENGTH = int(os.environ.get("CONTEXT_LENGTH"))
- COLOR = os.environ.get("COLOR")
- DESCRIPTION = os.environ.get("DESCRIPTION")
- LORA_WEIGHTS = "DSMI/LLaMA-E"
  access_token = os.environ.get('HF_TOKEN')

- @spaces.GPU(duration=120)
- def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-     # Format history with a given chat template
-     if CHAT_TEMPLATE == "Auto":
-         stop_tokens = [tokenizer.eos_token_id]
-         instruction = []
-         for user, assistant in history:
-             instruction.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-         instruction.append({"role": "user", "content": message})
-     elif CHAT_TEMPLATE == "ChatML":
-         stop_tokens = ["<|endoftext|>", "<|im_end|>"]
-         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
-         for user, assistant in history:
-             instruction += '<|im_start|>user\n' + user + '\n<|im_end|>\n<|im_start|>assistant\n' + assistant
-         instruction += '\n<|im_start|>user\n' + message + '\n<|im_end|>\n<|im_start|>assistant\n'
-     elif CHAT_TEMPLATE == "Mistral Instruct":
-         stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
-         instruction = '<s>[INST] ' + system_prompt
-         for user, assistant in history:
-             instruction += user + ' [/INST] ' + assistant + '</s>[INST]'
-         instruction += ' ' + message + ' [/INST]'
-     else:
-         raise Exception("Incorrect chat template, select 'ChatML' or 'Mistral Instruct'")
-     print(instruction)
-
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
-     input_ids, attention_mask = enc.input_ids, enc.attention_mask

-     if input_ids.shape[1] > CONTEXT_LENGTH:
-         input_ids = input_ids[:, -CONTEXT_LENGTH:]

-     generate_kwargs = dict(
-         {"input_ids": input_ids.to(device), "attention_mask": attention_mask.to(device)},
-         streamer=streamer,
-         do_sample=True,
-         temperature=temperature,
-         max_new_tokens=max_new_tokens,
-         top_k=top_k,
-         repetition_penalty=repetition_penalty,
-         top_p=top_p
-     )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-     outputs = []
-     for new_token in streamer:
-         outputs.append(new_token)
-         if new_token in stop_tokens:
-             break
-         yield "".join(outputs)


- # Load model
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=False,
-     bnb_4bit_compute_dtype=torch.bfloat16
- )
- tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=access_token)
- model = LlamaForCausalLM.from_pretrained(
-     MODEL_ID,
      load_in_8bit=False,
      torch_dtype=torch.float16,
      device_map="auto",
  )
-
- model = PeftModel.from_pretrained(
      model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
  )

- # Create Gradio interface
- gr.ChatInterface(
-     predict,
-     title= "🦙🛍️ LLaMA-E",
-     description=DESCRIPTION,
-     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
-     additional_inputs=[
-         gr.Textbox("You are HelpingAI a emotional AI always answer my question in HelpingAI style", label="System prompt"),
-         gr.Slider(0, 1, 0.8, label="Temperature"),
-         gr.Slider(128, 4096, 1024, label="Max new tokens"),
-         gr.Slider(1, 80, 40, label="Top K sampling"),
-         gr.Slider(0, 2, 1.1, label="Repetition penalty"),
-         gr.Slider(0, 1, 0.95, label="Top P sampling"),
      ],
-     theme=gr.themes.Soft(primary_hue=COLOR),
- ).queue().launch()

  import spaces
+ import torch
  from peft import PeftModel
+ import transformers
  import gradio as gr
+ import os


+ assert (
+     "LlamaTokenizer" in transformers._import_structure["models.llama"]
+ ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
  access_token = os.environ.get('HF_TOKEN')

+ tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=access_token)

+ BASE_MODEL = "meta-llama/Llama-2-7b-hf"
+ LORA_WEIGHTS = "DSMI/LLaMA-E"

+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"

+ try:
+     if torch.backends.mps.is_available():
+         device = "mps"
+ except:
+     pass

+ print("Device: " + str(device))
+
+ if device == "cuda":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
          load_in_8bit=False,
          torch_dtype=torch.float16,
          device_map="auto",
      )
+     model = PeftModel.from_pretrained(
          model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
      )
+ elif device == "mps":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+ else:
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         low_cpu_mem_usage=True
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+     )
+
+ print("Model: " + str(model))
+
+ def generate_prompt(instruction, input=None):
+     if input:
+         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Input:
+ {input}
+ ### Response:"""
+     else:
+         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+ ### Instruction:
+ {instruction}
+ ### Response:"""

+ if device != "cpu":
+     model.half()
+ model.eval()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+ @spaces.GPU()
+ def evaluate(
+     instruction,
+     input=None,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=2,
+     max_new_tokens=64,
+     **kwargs,
+ ):
+     prompt = generate_prompt(instruction, input)
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     s = generation_output.sequences[0]
+     output = tokenizer.decode(s)
+     return output.split("### Response:")[1].strip()
+
+
+ g = gr.Interface(
+     fn=evaluate,
+     inputs=[
+         gr.components.Textbox(
+             lines=2, label="Instruction", placeholder="Tell me about alpacas."
+         ),
+         gr.components.Textbox(lines=2, label="Input", placeholder="none"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
+         gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
+         gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
+         gr.components.Slider(
+             minimum=1, maximum=512, step=1, value=128, label="Max tokens"
+         ),
      ],
+     outputs=[
+         gr.Textbox(
+             lines=5,
+             label="Output",
+         )
+     ],
+     title="🦙🛍️ LLaMA-E",
+     description="LLaMA-E is a series of fine-tuned LLaMA models following E-commerce instructions. It is developed by DSMI (http://dsmi.tech/) @ University of Technology Sydney, and trained on the 120k instruction set. This model is for academic research use only. For more details please contact: Kaize.Shi@uts.edu.au",
+ )
+ g.queue(concurrency_count=1)
+ g.launch()
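
For reference, a minimal usage sketch of the two entry points the new app.py defines, assuming the module above has been loaded so that generate_prompt and evaluate exist; the instruction and input strings are invented placeholders, not taken from the commit:

    # Hypothetical example calls (not in the commit); the strings are placeholders.
    prompt = generate_prompt(
        "Write a product title.",                   # example instruction
        input="wireless earbuds, 40-hour battery",  # example context
    )
    print(prompt)  # Alpaca-style template ending in "### Response:"

    # evaluate() builds the same prompt internally, runs beam-search generation,
    # and returns only the text after "### Response:".
    answer = evaluate(
        "Write a product title.",
        input="wireless earbuds, 40-hour battery",
        num_beams=2,
        max_new_tokens=64,
    )
    print(answer)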