sagar007 committed
Commit cc1b568 · verified · 1 Parent(s): 1f7ba92

Update app.py

Files changed (1)
  1. app.py +17 -8
app.py CHANGED
@@ -1,16 +1,23 @@
 import gradio as gr
-from transformers import AutoTokenizer, pipeline
+import spaces
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import subprocess
 
+# Install flash-attn
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+# Load the model and tokenizer
 model_name = "akjindal53244/Llama-3.1-Storm-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-pipeline = pipeline(
-    "text-generation",
-    model=model_name,
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
     torch_dtype=torch.bfloat16,
-    device_map="auto",
+    use_flash_attention_2=True,
+    device_map="auto"
 )
 
+@spaces.GPU(duration=120)
 def generate_text(prompt, max_length, temperature):
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
@@ -18,8 +25,10 @@ def generate_text(prompt, max_length, temperature):
     ]
     formatted_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
 
-    outputs = pipeline(
-        formatted_prompt,
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
+
+    outputs = model.generate(
+        **inputs,
         max_new_tokens=max_length,
         do_sample=True,
         temperature=temperature,
@@ -27,7 +36,7 @@ def generate_text(prompt, max_length, temperature):
         top_p=0.95,
     )
 
-    return outputs[0]["generated_text"]
+    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
 
 iface = gr.Interface(
     fn=generate_text,
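
Note: model.generate returns the full token sequence (prompt tokens followed by the completion), so the new return statement slices off the prompt length before decoding. A minimal sketch of that pattern, assuming the model and tokenizer are already loaded as in app.py above (the example prompt is illustrative only):

# Sketch: decode only the newly generated tokens, not the echoed prompt.
# Assumes `model` and `tokenizer` exist as loaded in app.py.
inputs = tokenizer("Hello, how are", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)   # full sequence: prompt + new tokens
prompt_len = inputs["input_ids"].shape[1]                # number of prompt tokens
completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
print(completion)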