NouRed committed
Commit e653a3d · verified · 1 Parent(s): e60e350

Update app.py

Files changed (1)
  1. app.py +40 -36
app.py CHANGED
@@ -11,7 +11,6 @@ from transformers import (
     AutoModelForCausalLM)
 
 
-@spaces.GPU
 def generate_prompt(instruction, input=None):
     if input:
         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
@@ -33,46 +32,51 @@ def generate_prompt(instruction, input=None):
 
 ### Response:
 """
 
-
 @spaces.GPU
-based_model_path = "meta-llama/Meta-Llama-3-8B"
-lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"
-
-load_in_4bit=True
-bnb_4bit_use_double_quant=True
-bnb_4bit_quant_type="nf4"
-bnb_4bit_compute_dtype=torch.bfloat16
-device = torch.device("cuda" if torch.cuda.is_available() else "CPU")
-
-
-tokenizer = AutoTokenizer.from_pretrained(
-    based_model_path,
-)
-
-tokenizer.padding_side = 'right'
-tokenizer.pad_token = tokenizer.eos_token
-tokenizer.add_eos_token = True
-
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=load_in_4bit,
-    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
-    bnb_4bit_quant_type=bnb_4bit_quant_type,
-    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
-)
-
-base_model = AutoModelForCausalLM.from_pretrained(
-    based_model_path,
-    device_map="auto",
-    attn_implementation="flash_attention_2",  # I have an A100 GPU with 40GB of RAM 😎
-    quantization_config=quantization_config,
-)
-
-model = PeftModel.from_pretrained(
-    base_model,
-    lora_weights,
-    torch_dtype=torch.float16,
-)
+def models():
+    based_model_path = "meta-llama/Meta-Llama-3-8B"
+    lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"
+
+    load_in_4bit=True
+    bnb_4bit_use_double_quant=True
+    bnb_4bit_quant_type="nf4"
+    bnb_4bit_compute_dtype=torch.bfloat16
+    device = torch.device("cuda" if torch.cuda.is_available() else "CPU")
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        based_model_path,
+    )
+
+    tokenizer.padding_side = 'right'
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.add_eos_token = True
+
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=load_in_4bit,
+        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+        bnb_4bit_quant_type=bnb_4bit_quant_type,
+        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
+    )
+
+    base_model = AutoModelForCausalLM.from_pretrained(
+        based_model_path,
+        device_map="auto",
+        attn_implementation="flash_attention_2",  # I have an A100 GPU with 40GB of RAM 😎
+        quantization_config=quantization_config,
+    )
+
+    model = PeftModel.from_pretrained(
+        base_model,
+        lora_weights,
+        torch_dtype=torch.float16,
+    )
+
+    return model, tokenizer
 
 
+model, tokenizer = models()
 
 
 @spaces.GPU
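
The trailing @spaces.GPU decorator marks the start of the app's inference function, which lies outside this hunk. For context, here is a minimal sketch of how the model and tokenizer returned by models() could be consumed there, building on the generate_prompt template defined above; the function name generate_response, its parameters, and the generation settings are illustrative assumptions, not part of this commit:

@spaces.GPU
def generate_response(instruction, input=None):
    # Build the instruction prompt using the template defined earlier in app.py
    prompt = generate_prompt(instruction, input)
    # Tokenize and move the inputs to the device the quantized model was loaded on
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Default decoding; max_new_tokens=256 is an assumed value
    outputs = model.generate(**inputs, max_new_tokens=256)
    # Decode only the newly generated tokens, skipping the echoed prompt
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

The refactor itself gives the @spaces.GPU decorator an actual function to wrap (previously it preceded a bare module-level assignment), and the module now obtains its globals once via model, tokenizer = models().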