rezaenayati committed
Commit a5e67e1 · verified · 1 Parent(s): 6118e79

Update app.py

Files changed (1):
  app.py  +12 −18
app.py CHANGED
@@ -1,24 +1,19 @@
-import gradio as gr
-import torch
+mport torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
+import gradio as gr
 
-# Load base model without quantization for CPU compatibility
 base_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",  # Use the original non-quantized model
+    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
     torch_dtype=torch.float16,
-    device_map="cpu",  # Force CPU usage
-    low_cpu_mem_usage=True  # Optimize for CPU memory usage
+    device_map="auto",
+    load_in_4bit=True
 )
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-
-# Add padding token if it doesn't exist
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
+# tokenizer
+tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
 
-# Load LoRA adapters
+# LoRA adaptors
 model = PeftModel.from_pretrained(base_model, "rezaenayati/RezAi-Model")
 
 def chat_with_rezAi(messages, history):
@@ -30,19 +25,18 @@ def chat_with_rezAi(messages, history):
 
     conversation += f"<|start_header_id|>user<|end_header_id|>\n{messages}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
 
-    inputs = tokenizer([conversation], return_tensors="pt", padding=True, truncation=True, max_length=2048)
+    inputs = tokenizer([conversation], return_tensors="pt")
 
     with torch.no_grad():
         outputs = model.generate(
-            **inputs,
+            inputs,
             max_new_tokens=128,
             temperature=0.5,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
-            attention_mask=inputs['attention_mask']
+            pad_token_id=tokenizer.eos_token_id
         )
 
-    # Get response
+    # get response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     new_response = response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
 
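
Neither hunk shows how chat_with_rezAi is exposed, but both versions of app.py import gradio and the function's (messages, history) signature matches what gr.ChatInterface passes to its fn callback. A minimal sketch of the assumed wiring follows; the demo variable name and the plain launch() call are illustrative assumptions, not part of this commit:

# Assumed wiring (not shown in the hunks above): serve chat_with_rezAi
# through a Gradio chat UI. chat_with_rezAi is expected to return
# new_response from the truncated end of its body.
demo = gr.ChatInterface(fn=chat_with_rezAi)
demo.launch()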