Upload 2 files
- app.py +13 -6
- requirements.txt +2 -1
app.py
CHANGED

@@ -24,8 +24,9 @@ def load_model():
     # Load the LoRA adapter model for text generation
     model = AutoPeftModelForCausalLM.from_pretrained(
         "./lora_adapter",  # Path to your adapter files
-        torch_dtype=torch.
-        device_map="
+        torch_dtype=torch.float32,  # Use float32 for CPU
+        device_map="cpu",  # Force CPU
+        low_cpu_mem_usage=True  # Optimize for low memory
     )

     # Load tokenizer from the same directory

@@ -75,6 +76,11 @@ Respond *only* with a valid JSON object that follows this exact schema:
 Do NOT add any text or explanations before or after the JSON object.
 """

+# Add this import at the top
+import spaces
+
+# Add this decorator to the classify function
+@spaces.GPU
 def classify_solution(question: str, solution: str):
     """
     Classify the math solution using the exact training format

@@ -113,14 +119,15 @@ def classify_solution(question: str, solution: str):
         max_length=2048  # Increased for longer prompts
     )

-    # Generate response
+    # Generate response with CPU optimization
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=200
+            max_new_tokens=150,  # Reduced from 200
             temperature=0.1,
-            do_sample=
-            pad_token_id=tokenizer.pad_token_id
+            do_sample=False,  # Faster greedy decoding
+            pad_token_id=tokenizer.pad_token_id,
+            use_cache=True  # Speed up generation
         )

     # Decode the generated response
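Taken together, the patch leaves app.py roughly in the shape sketched below. This is a minimal reconstruction, not the exact file: the tokenizer loading line, the prompt template (represented here by a placeholder `build_prompt`), and the final decode step are not shown in the diff, so those parts are assumptions.

```python
import torch
import spaces
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer


def build_prompt(question: str, solution: str) -> str:
    # Placeholder: the real JSON-schema prompt template lives in app.py
    return f"Question:\n{question}\n\nSolution:\n{solution}\n"


def load_model():
    # Load the LoRA adapter model with the new CPU-friendly settings
    model = AutoPeftModelForCausalLM.from_pretrained(
        "./lora_adapter",            # Path to the adapter files
        torch_dtype=torch.float32,   # Use float32 for CPU
        device_map="cpu",            # Force CPU
        low_cpu_mem_usage=True,      # Optimize for low memory
    )
    # Assumption: the tokenizer comes from the same directory
    tokenizer = AutoTokenizer.from_pretrained("./lora_adapter")
    return model, tokenizer


model, tokenizer = load_model()


@spaces.GPU  # request GPU time on Hugging Face Spaces (ZeroGPU)
def classify_solution(question: str, solution: str):
    prompt = build_prompt(question, solution)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,                   # Reduced from 200
            temperature=0.1,
            do_sample=False,                      # Greedy decoding
            pad_token_id=tokenizer.pad_token_id,
            use_cache=True,                       # Speed up generation
        )

    # Assumption: decode only the newly generated tokens
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
```

Two behaviors worth noting: with `do_sample=False`, generation is greedy and the `temperature=0.1` setting is effectively ignored; and the patch both pins the model to CPU (`device_map="cpu"`) and requests GPU time via `@spaces.GPU`, so on ZeroGPU hardware the model would still need to be moved to CUDA inside the decorated function to benefit.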
requirements.txt
CHANGED

@@ -2,4 +2,5 @@ gradio
 torch
 transformers
 peft
-accelerate
+accelerate
+spaces
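`accelerate` is required by transformers whenever `device_map` is passed to `from_pretrained`, and `spaces` is the Hugging Face package that provides the `@spaces.GPU` decorator used in app.py. A small, hypothetical guard like the following keeps the app importable where `spaces` is not installed (for example, local testing); the `gpu_decorator` name is invented for illustration.

```python
# Hypothetical fallback so the app also runs where the `spaces`
# package (the Hugging Face ZeroGPU helper) is unavailable.
try:
    import spaces
    gpu_decorator = spaces.GPU      # real decorator on Hugging Face Spaces
except ImportError:
    def gpu_decorator(func):        # no-op stand-in elsewhere
        return func


@gpu_decorator
def classify_solution(question: str, solution: str):
    ...
```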