NouRed committed
Commit e653a3d · verified · 1 Parent(s): e60e350

Update app.py

Files changed (1)
  1. app.py +40 -36
app.py CHANGED
@@ -11,7 +11,6 @@ from transformers import (
     AutoModelForCausalLM)
 
 
-@spaces.GPU
 def generate_prompt(instruction, input=None):
     if input:
         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
@@ -33,46 +32,51 @@ def generate_prompt(instruction, input=None):
 
 ### Response:
 """
 
-
 @spaces.GPU
-based_model_path = "meta-llama/Meta-Llama-3-8B"
-lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"
-
-load_in_4bit=True
-bnb_4bit_use_double_quant=True
-bnb_4bit_quant_type="nf4"
-bnb_4bit_compute_dtype=torch.bfloat16
-device = torch.device("cuda" if torch.cuda.is_available() else "CPU")
-
-
-tokenizer = AutoTokenizer.from_pretrained(
-    based_model_path,
-)
-
-tokenizer.padding_side = 'right'
-tokenizer.pad_token = tokenizer.eos_token
-tokenizer.add_eos_token = True
-
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=load_in_4bit,
-    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
-    bnb_4bit_quant_type=bnb_4bit_quant_type,
-    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
-)
-
-base_model = AutoModelForCausalLM.from_pretrained(
-    based_model_path,
-    device_map="auto",
-    attn_implementation="flash_attention_2",  # I have an A100 GPU with 40GB of RAM 😎
-    quantization_config=quantization_config,
-)
-
-model = PeftModel.from_pretrained(
-    base_model,
-    lora_weights,
-    torch_dtype=torch.float16,
-)
+def models():
+    based_model_path = "meta-llama/Meta-Llama-3-8B"
+    lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"
+
+    load_in_4bit=True
+    bnb_4bit_use_double_quant=True
+    bnb_4bit_quant_type="nf4"
+    bnb_4bit_compute_dtype=torch.bfloat16
+    device = torch.device("cuda" if torch.cuda.is_available() else "CPU")
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        based_model_path,
+    )
+
+    tokenizer.padding_side = 'right'
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.add_eos_token = True
+
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=load_in_4bit,
+        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+        bnb_4bit_quant_type=bnb_4bit_quant_type,
+        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
+    )
+
+    base_model = AutoModelForCausalLM.from_pretrained(
+        based_model_path,
+        device_map="auto",
+        attn_implementation="flash_attention_2",  # I have an A100 GPU with 40GB of RAM 😎
+        quantization_config=quantization_config,
+    )
+
+    model = PeftModel.from_pretrained(
+        base_model,
+        lora_weights,
+        torch_dtype=torch.float16,
+    )
+
+    return model, tokenizer
 
 
+model, tokenizer = models()
 
 
 @spaces.GPU
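
The trailing @spaces.GPU decorator marks the start of the app's inference function, which lies outside this hunk. For context, here is a minimal sketch of how the model and tokenizer returned by models() could be consumed there, building on the generate_prompt template defined above; the function name generate_response, its parameters, and the generation settings are illustrative assumptions, not part of this commit:

@spaces.GPU
def generate_response(instruction, input=None):
    # Build the instruction prompt using the template defined earlier in app.py
    prompt = generate_prompt(instruction, input)
    # Tokenize and move the inputs to the device the quantized model was loaded on
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Default decoding; max_new_tokens=256 is an assumed value
    outputs = model.generate(**inputs, max_new_tokens=256)
    # Decode only the newly generated tokens, skipping the echoed prompt
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

The refactor itself gives the @spaces.GPU decorator an actual function to wrap (previously it preceded a bare module-level assignment), and the module now obtains its globals once via model, tokenizer = models().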