fyzanshaik committed
Commit 03f2222 · verified · 1 Parent(s): 7d7b966

Update app.py

Files changed (1)
  1. app.py +103 -25
app.py CHANGED
@@ -1,62 +1,140 @@
-# app.py (Revisit this version from previous long answer)
+# app.py (Revised for the Unsloth LoRA Gemma model)
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+# Unsloth loads the base model; peft attaches the trained LoRA adapters
+from unsloth import FastLanguageModel
+from peft import PeftModel
 
 # --- Configuration ---
-MODEL_NAME = "neuralnets/cf_codebot"
+BASE_MODEL_NAME = "unsloth/gemma-3-4b-it"
+ADAPTER_MODEL_NAME = "neuralnets/cf_codebot"  # fine-tuned LoRA adapters for Codeforces editorials
 
 # --- Model Loading ---
+# This block runs once when the Space starts up.
 try:
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+    # Load the base model and tokenizer with Unsloth's optimized loader.
+    # max_seq_length sets the context window; 2048 is a common default for Gemma.
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=BASE_MODEL_NAME,
+        max_seq_length=2048,    # maximum context length the model will handle
+        dtype=torch.bfloat16,   # bfloat16 for faster inference on supported GPUs
+        load_in_4bit=True,      # 4-bit quantization to save memory
+    )
+
+    # Attach the trained LoRA adapters to the base model. PeftModel.from_pretrained
+    # reads the adapter config saved at training time, so the LoRA rank, alpha and
+    # target modules do not need to be re-specified here.
+    model = PeftModel.from_pretrained(model, ADAPTER_MODEL_NAME)
+
+    # Set the model to evaluation mode
     model.eval()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-    print(f"Model loaded on: {device}")
+
+    # With load_in_4bit=True, bitsandbytes/accelerate handle device placement, so no
+    # explicit model.to(device) is needed; use model.device wherever the device matters.
+    print(f"Base model '{BASE_MODEL_NAME}' and adapters '{ADAPTER_MODEL_NAME}' loaded successfully.")
 
 except Exception as e:
-    print(f"Error loading model '{MODEL_NAME}': {e}")
+    print(f"Error loading model '{BASE_MODEL_NAME}' or adapters '{ADAPTER_MODEL_NAME}': {e}")
     print("Using a dummy function for demonstration purposes.")
-    tokenizer, model, device = None, None, "cpu"
+    tokenizer, model = None, None  # indicate that the model is not loaded
 
 # --- Inference Function ---
 def generate_editorial(problem_statement: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
-    if model is None:
-        return "Model not loaded, using dummy generation. (Check logs)"
+    if model is None or tokenizer is None:  # model failed to load, fall back to a dummy reply
+        print("Model not loaded, using dummy generation.")
+        if "watermelon" in problem_statement.lower():
+            return "To be able to split the watermelon such that each part is even..."
+        return "This is a placeholder editorial based on your problem statement.\n(Model failed to load, check logs)"
 
     try:
-        input_text = problem_statement
+        # Build the prompt in the chat format the instruction-tuned Gemma model expects.
+        # apply_chat_template inserts the special tokens, e.g.
+        # "<bos><start_of_turn>user\n{problem_statement}<end_of_turn>\n<start_of_turn>model\n".
+        messages = [
+            {"role": "user", "content": problem_statement}
+        ]
+        input_text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,              # return the formatted string, not token IDs
+            add_generation_prompt=True,  # append the model-turn marker so the reply starts cleanly
+        )
+
+        # Tokenize the formatted prompt
         inputs = tokenizer(
             input_text,
             return_tensors="pt",
             padding=True,
             truncation=True,
-            max_length=512
-        ).to(device)
+            max_length=tokenizer.model_max_length  # cap at the tokenizer's configured maximum
+        ).to(model.device)                         # keep inputs on the same device as the model
 
+        # Generate the editorial
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
             num_return_sequences=1,
-            pad_token_id=tokenizer.eos_token_id,
             do_sample=True,
             top_k=50,
             top_p=top_p,
             temperature=temperature,
-            stop_sequences=["<end_of_turn>"] # Can use this, or `stop` if transformers is very new
+            pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
+            # Generation normally stops at tokenizer.eos_token_id; add explicit stop
+            # strings only if the model emits "<end_of_turn>" without an EOS token.
         )
 
+        # Decode, then strip the prompt so only the model's reply remains
        generated_sequence = tokenizer.decode(outputs[0], skip_special_tokens=False)
 
-        if generated_sequence.startswith(input_text):
-            editorial_content = generated_sequence[len(input_text):].strip()
-            editorial_content = editorial_content.replace("<end_of_turn>", "").strip()
+        # The chat template ends the prompt with "<start_of_turn>model\n";
+        # everything after that marker is the model's response.
+        response_start_marker = "<start_of_turn>model\n"
+        if response_start_marker in generated_sequence:
+            editorial_content = generated_sequence.split(response_start_marker)[-1].strip()
         else:
+            # Fallback: the marker was not found, so strip the echoed prompt if present
             editorial_content = generated_sequence.strip()
-            editorial_content = editorial_content.replace("<end_of_turn>", "").strip()
+            if editorial_content.startswith(input_text):
+                editorial_content = editorial_content[len(input_text):].strip()
 
+        # Remove any lingering special tokens such as <end_of_turn> or the EOS token
+        editorial_content = editorial_content.replace("<end_of_turn>", "").replace(tokenizer.eos_token, "").strip()
+
         return editorial_content
 
     except Exception as e:
@@ -64,7 +142,7 @@ def generate_editorial(problem_statement: str, max_new_tokens: int, temperature:
         return f"An error occurred during editorial generation: {e}"
 
 # --- Gradio Interface Setup ---
-demo = gr.Interface(
+iface = gr.Interface(
     fn=generate_editorial,
     inputs=[
         gr.Textbox(lines=10, label="Problem Statement", placeholder="Paste your problem statement here...", autofocus=True),
@@ -73,8 +151,8 @@ demo = gr.Interface(
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
     outputs=gr.Textbox(label="Generated Editorial"),
-    title="Codeforces Editorial Assistant (Model Loaded In-Space)",
-    description="Paste a Codeforces problem statement and get a generated editorial from neuralnets/cf_codebot.",
+    title="Codeforces Editorial Assistant (Gemma LoRA)",
+    description="Paste a Codeforces problem statement and get a generated editorial from neuralnets/cf_codebot (Gemma-3-4b-it LoRA).",
     flagging_mode="auto", # Updated from allow_flagging
     examples=[
         [
@@ -87,4 +165,4 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    iface.launch()
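
For reference, a minimal sketch (not part of the commit) of the chat-template round trip that generate_editorial relies on. It assumes the Gemma-style template, whose model turn starts with "<start_of_turn>model", and only needs the tokenizer; the file name and the sample strings are illustrative:

    # prompt_check.py -- hypothetical local helper, not in the repository
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-3-4b-it")
    messages = [{"role": "user", "content": "Can an even weight w be split into two positive even parts?"}]

    # Same call as in generate_editorial(): format the chat and append the model-turn marker
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(repr(prompt))  # for Gemma-style templates this should end with "<start_of_turn>model\n"

    # Simulate a decoded generation: the prompt echoed back, a reply, and an end-of-turn token
    decoded = prompt + "YES whenever w > 2 and w is even; otherwise NO.<end_of_turn>"

    marker = "<start_of_turn>model\n"
    reply = decoded.split(marker)[-1].replace("<end_of_turn>", "").strip()
    print(reply)  # -> "YES whenever w > 2 and w is even; otherwise NO."

If the printed prompt does not end with that marker, the response_start_marker constant in app.py should be adjusted to match the template actually used by the adapter's base model.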