desert committed
Commit · 21886ee
1 Parent(s): f84cd21
del
app.py CHANGED
@@ -2,20 +2,17 @@ import gradio as gr
 from unsloth import FastLanguageModel
 import torch
 
+max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-# Load your model and tokenizer (make sure to adjust the path to where your model is stored)
-max_seq_length = 2048 # Adjust as necessary
-load_in_4bit = True # Enable 4-bit quantization for reduced memory usage
-model_path = "llama_lora_model_1" # Path to your custom model
-
-# Load the model and tokenizer
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=
-    max_seq_length=max_seq_length,
-
+    model_name = "llama_lora_model_1",
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
 )
 
-
 # Respond function
 def respond(
     message,
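For reference, the added lines assemble into the following model-loading block. This is a minimal sketch of the file head after this commit, built from the + lines above; the final FastLanguageModel.for_inference() call is an assumption about typical Unsloth usage before a chat handler serves requests, not something this commit adds.

from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # RoPE scaling lets this exceed the base model's native context
dtype = None           # auto-detect; float16 on Tesla T4/V100, bfloat16 on Ampere+
load_in_4bit = True    # 4-bit quantization to cut GPU memory; set False for full precision

# Load the LoRA model and its tokenizer from the local path used in this commit
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "llama_lora_model_1",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Assumed, not part of this commit: switch Unsloth into its optimized inference mode
# before respond() starts generating replies
FastLanguageModel.for_inference(model)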