JimmyK300 committed on
Commit
c072401
·
verified ·
1 Parent(s): d936083

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -16
app.py CHANGED
@@ -1,15 +1,15 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
- import torch
4
 
5
- # Model name
6
- MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
7
 
8
- # Load tokenizer and model
9
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
 
11
 
12
- def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
13
  messages = [{"role": "system", "content": system_message}]
14
 
15
  # Add chat history to messages
@@ -21,16 +21,9 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
21
 
22
  messages.append({"role": "user", "content": message})
23
 
24
- # Tokenize input
25
- inputs = tokenizer(message, return_tensors="pt").to("cpu")
26
-
27
  # Generate response
28
- with torch.no_grad():
29
- outputs = model.generate(
30
- **inputs, max_length=max_tokens, temperature=temperature, top_p=top_p
31
- )
32
 
33
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
34
  return response
35
 
36
  # Define Gradio interface
 
1
  import gradio as gr
2
+ from ctransformers import AutoModelForCausalLM
 
3
 
4
+ # Path to your downloaded GGUF model (change this if needed)
5
+ MODEL_PATH = "qwen2.5-0.5b-instruct-q2_k.gguf"
6
 
7
+ # Load model using ctransformers
8
+ model = AutoModelForCausalLM.from_pretrained(
9
+ MODEL_PATH, model_type="qwen2", gpu_layers=30 # Adjust `gpu_layers` for performance
10
+ )
11
 
12
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
13
  messages = [{"role": "system", "content": system_message}]
14
 
15
  # Add chat history to messages
 
21
 
22
  messages.append({"role": "user", "content": message})
23
 
 
 
 
24
  # Generate response
25
+ response = model(messages, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)
 
 
 
26
 
 
27
  return response
28
 
29
  # Define Gradio interface