rezaenayati committed (verified)
Commit da587af · Parent(s): 5e67de1

Update app.py

Files changed (1): app.py (+8, -7)
app.py CHANGED
@@ -2,16 +2,16 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 import gradio as gr
+import spaces  # Important for ZeroGPU
 
-# Load base model
+# Load models (will be moved to GPU when needed)
 base_model = AutoModelForCausalLM.from_pretrained(
     "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
     torch_dtype=torch.float16,
-    device_map="auto",
-    load_in_4bit=True
+    device_map="auto",  # ZeroGPU handles this
+    trust_remote_code=True
 )
 
-# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
 
 # Add padding token if missing
@@ -21,6 +21,7 @@ if tokenizer.pad_token is None:
 # Load LoRA adapter
 model = PeftModel.from_pretrained(base_model, "rezaenayati/RezAi-Model")
 
+@spaces.GPU  # This decorator is CRITICAL for ZeroGPU
 def chat_with_rezAi(messages, history):
     conversation = "<|start_header_id|>system<|end_header_id|>\nYou are Reza Enayati, a Computer Science student and entrepreneur from Los Angeles, who is eager to work as a software engineer or machine learning engineer. Answer these questions as if you are in an interview.<|eot_id|>"
 
@@ -32,13 +33,13 @@ def chat_with_rezAi(messages, history):
     # Add current message
     conversation += f"<|start_header_id|>user<|end_header_id|>\n{messages}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
 
-    # Tokenize with proper handling
+    # Tokenize
     inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=2048)
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to GPU
 
+    # Generate response
     with torch.no_grad():
         outputs = model.generate(
-            **inputs,  # Unpack inputs properly
+            **inputs,
             max_new_tokens=128,
             temperature=0.5,
             do_sample=True,
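
On ZeroGPU Spaces, a GPU is attached only while a function decorated with @spaces.GPU is executing, which is why the decorator goes in front of chat_with_rezAi. A minimal sketch of the pattern, reusing the model and tokenizer loaded above (the duration value and the generate helper are illustrative, not from this commit):

    import spaces
    import torch

    @spaces.GPU(duration=120)  # optional: cap the GPU allocation at ~120 s per call
    def generate(prompt: str) -> str:
        # tokenizer/model are the objects loaded at module level in app.py
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=64)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)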
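The handwritten <|start_header_id|> / <|eot_id|> markers inside chat_with_rezAi match Llama 3.1's chat format. An equivalent, less error-prone sketch, assuming the tokenizer ships that chat template, builds the prompt from a message list instead (the user turn below is a placeholder):

    messages = [
        {"role": "system", "content": "You are Reza Enayati, ..."},  # system prompt as in app.py
        {"role": "user", "content": "Tell me about yourself."},      # placeholder turn
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant header so the model answers next
        return_tensors="pt",
    )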
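The hunk ends before the response is decoded. model.generate returns the prompt tokens followed by the newly generated ones, so a common pattern (an assumption here; the decoding code is not part of this diff) is to slice off the prompt before decoding:

    prompt_len = inputs["input_ids"].shape[1]  # number of prompt tokens
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)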
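The diff also stops before the Gradio wiring. A (message, history) signature like chat_with_rezAi's is exactly what gr.ChatInterface expects, so the hookup presumably looks something like this (the title is a placeholder, not from the commit):

    demo = gr.ChatInterface(fn=chat_with_rezAi, title="RezAi")  # title is a placeholder
    demo.launch()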