mcamargo00 committed
Commit 9f84309 (verified)
Parent: cecea85

Upload 2 files

Files changed (2)
  1. app.py +13 -6
  2. requirements.txt +2 -1
app.py CHANGED
@@ -24,8 +24,9 @@ def load_model():
     # Load the LoRA adapter model for text generation
     model = AutoPeftModelForCausalLM.from_pretrained(
         "./lora_adapter",  # Path to your adapter files
-        torch_dtype=torch.float16,
-        device_map="auto"
+        torch_dtype=torch.float32,   # Use float32 for CPU
+        device_map="cpu",            # Force CPU
+        low_cpu_mem_usage=True       # Optimize for low memory
     )
 
     # Load tokenizer from the same directory
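
For context, here is a minimal sketch of what the full load_model() looks like after this hunk. Only the from_pretrained() arguments are taken from the diff; the AutoTokenizer call and model.eval() are assumptions about the surrounding code, included to make the sketch self-contained.

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

def load_model():
    # float32 with device_map="cpu" avoids half-precision kernels that are
    # slow or unsupported on CPU-only hardware; low_cpu_mem_usage loads the
    # checkpoint shard-by-shard instead of materializing it twice in RAM.
    model = AutoPeftModelForCausalLM.from_pretrained(
        "./lora_adapter",            # path to the adapter files
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True
    )
    # Tokenizer assumed to live in the same directory, per the diff's comment
    tokenizer = AutoTokenizer.from_pretrained("./lora_adapter")
    model.eval()  # inference mode: disables dropout
    return model, tokenizer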
@@ -75,6 +76,11 @@ Respond *only* with a valid JSON object that follows this exact schema:
 Do NOT add any text or explanations before or after the JSON object.
 """
 
+# Add this import at the top
+import spaces
+
+# Add this decorator to the classify function
+@spaces.GPU
 def classify_solution(question: str, solution: str):
     """
     Classify the math solution using the exact training format
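
spaces.GPU is the ZeroGPU decorator from Hugging Face's spaces package: on a ZeroGPU Space it requests a GPU slice for the duration of each decorated call. A minimal usage sketch, with an illustrative duration variant that is not from this commit:

import spaces

@spaces.GPU  # request a ZeroGPU slice while this call runs
def classify_solution(question: str, solution: str):
    ...  # tokenization, model.generate(), decoding

# A per-call time budget in seconds can also be requested:
@spaces.GPU(duration=60)  # illustrative value, not from this repo
def long_inference(prompt: str):
    ...

Note that the same commit pins the model to device_map="cpu", so as written the decorator appears to keep the app compatible with ZeroGPU hardware rather than actually moving inference onto the GPU.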
@@ -113,14 +119,15 @@ def classify_solution(question: str, solution: str):
         max_length=2048  # Increased for longer prompts
     )
 
-    # Generate response (not just classify)
+    # Generate response with CPU optimization
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=200,
+            max_new_tokens=150,  # Reduced from 200
             temperature=0.1,
-            do_sample=True,
-            pad_token_id=tokenizer.pad_token_id
+            do_sample=False,  # Faster greedy decoding
+            pad_token_id=tokenizer.pad_token_id,
+            use_cache=True  # Speed up generation
         )
 
     # Decode the generated response
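
One detail worth flagging: with do_sample=False decoding is greedy, so the temperature=0.1 that remains in the diff has no effect (recent transformers versions warn about exactly this combination). Below is a sketch of the generation and decoding step under the new settings; model, tokenizer, and inputs are assumed from the surrounding function, and the prompt-stripping decode is one common pattern, not necessarily what app.py does.

import torch

with torch.no_grad():  # no gradients needed at inference time
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,                   # smaller budget keeps CPU latency down
        do_sample=False,                      # greedy decoding; temperature is ignored
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True                        # reuse past key/values between steps
    )

# Decode only the newly generated tokens, skipping the echoed prompt
prompt_len = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)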
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ gradio
 torch
 transformers
 peft
-accelerate
+accelerate
+spaces