NouRed committed
Commit d32201f · verified · 1 Parent(s): ca2100e

Update app.py

Files changed (1)
  1. app.py +34 -41
app.py CHANGED
@@ -36,51 +36,44 @@ def generate_prompt(instruction, input=None):
 ### Response:
 """
 
-@spaces.GPU
-def models():
-    based_model_path = "meta-llama/Meta-Llama-3-8B"
-    lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"
-
-    load_in_4bit=True
-    bnb_4bit_use_double_quant=True
-    bnb_4bit_quant_type="nf4"
-    bnb_4bit_compute_dtype=torch.bfloat16
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        based_model_path,
-    )
-
-    tokenizer.padding_side = 'right'
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.add_eos_token = True
-
-
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=load_in_4bit,
-        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
-        bnb_4bit_quant_type=bnb_4bit_quant_type,
-        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
-    )
-
-    base_model = AutoModelForCausalLM.from_pretrained(
-        based_model_path,
-        device_map="auto",
-        attn_implementation="flash_attention_2", # I have an A100 GPU with 40GB of RAM 😎
-        quantization_config=quantization_config,
-    )
-
-    model = PeftModel.from_pretrained(
-        base_model,
-        lora_weights,
-        torch_dtype=torch.float16,
-    )
-
-    return model, tokenizer
-
-
-model, tokenizer = models()
+based_model_path = "meta-llama/Meta-Llama-3-8B"
+lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"
+
+load_in_4bit=True
+bnb_4bit_use_double_quant=True
+bnb_4bit_quant_type="nf4"
+bnb_4bit_compute_dtype=torch.bfloat16
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+tokenizer = AutoTokenizer.from_pretrained(
+    based_model_path,
+)
+
+tokenizer.padding_side = 'right'
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.add_eos_token = True
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=load_in_4bit,
+    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+    bnb_4bit_quant_type=bnb_4bit_quant_type,
+    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
+)
+
+base_model = AutoModelForCausalLM.from_pretrained(
+    based_model_path,
+    device_map="auto",
+    attn_implementation="flash_attention_2", # I have an A100 GPU with 40GB of RAM 😎
+    quantization_config=quantization_config,
+)
+
+model = PeftModel.from_pretrained(
+    base_model,
+    lora_weights,
+    torch_dtype=torch.float16,
+)
 
 
 @spaces.GPU
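In effect, this commit removes the @spaces.GPU-decorated models() wrapper and runs the 4-bit base-model load plus LoRA attach once at module scope, so the GPU decorator is left for inference only. For context, here is a minimal sketch of how the module-level model, tokenizer, and generate_prompt might then be used by the @spaces.GPU handler that follows the hunk; the function name answer and the generation parameters are illustrative assumptions, not part of the commit.

# Sketch only -- assumes app.py's own imports (torch, spaces) and the
# module-level model, tokenizer, and generate_prompt defined above.
@spaces.GPU
def answer(instruction, input=None):
    # Build the instruction-tuned prompt and tokenize it on the model's device.
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,      # assumed generation budget
            do_sample=True,          # assumed sampling settings
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    return tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )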