import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# --- Model and Tokenizer Loading ---
# It's recommended to load the model and tokenizer once globally
# so they are not reloaded on every prediction.
try:
    MODEL_NAME = "Vinnnf/Thinkless-1.5B-Warmup"
    print(f"Loading model: {MODEL_NAME}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype="auto",  # Use "auto", or torch.float16 if a GPU is available and supports it
        device_map="auto"    # Automatically maps to GPU if available, otherwise CPU
    )
    print("Model loaded successfully.")

    print(f"Loading tokenizer for: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # Fallback or error handling if model loading fails.
    # For a Gradio app, you might want to display this error in the UI;
    # for now, we let it raise if essential components fail to load.
    raise

# --- Prediction Function ---
def generate_response(instruction_text, prompt_question, think_mode_active, max_tokens):
    """
    Generates a response from the language model based on the input.
    """
    if not instruction_text or not prompt_question:
        return "Error: Instruction and Prompt Question cannot be empty.", "", "N/A", "N/A"

    try:
        # 1. Combine instruction and prompt question
        full_prompt_content = f"{instruction_text}\n{prompt_question}"

        # 2. Format for the chat model
        messages = [
            {"role": "user", "content": full_prompt_content}
        ]

        # 3. Apply the chat template
        # tokenize=False because we append the special <think>/<short> tags afterwards
        text_from_template = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True  # Ensures the model knows to generate a response
        )

        # 4. Add the <think> or <short> tag
        # Thinkless uses these control tokens to switch between long-form
        # reasoning (<think>) and concise answers (<short>).
        if think_mode_active:
            final_input_text = f"{text_from_template}<think>"
        else:
            final_input_text = f"{text_from_template}<short>"

        # 5. Tokenize the final input
        # Move the tokenized inputs to the same device as the model.
        model_inputs = tokenizer([final_input_text], return_tensors="pt").to(model.device)

        # 6. Generate the response
        # Ensure max_new_tokens is a positive integer.
        try:
            max_new_tokens_int = int(max_tokens)
        except ValueError:
            return "Error: Max new tokens must be an integer.", final_input_text, "N/A", "N/A"

        if max_new_tokens_int <= 0:
            return "Error: Max new tokens must be a positive integer.", final_input_text, "N/A", "N/A"

        print(f"Generating with max_new_tokens: {max_new_tokens_int}")
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens_int,
            # Common generation parameters you might want to add:
            # temperature=0.7,
            # top_k=50,
            # top_p=0.95,
            # num_return_sequences=1,
            # no_repeat_ngram_size=2,  # to prevent some repetition
            # early_stopping=True
        )

        # 7. Keep only the generated part
        # generated_ids includes the input_ids, so we slice them off.
        input_ids_length = model_inputs.input_ids.shape[1]
        output_only_ids = generated_ids[:, input_ids_length:]
        num_generated_tokens = len(output_only_ids[0])

        # 8. Batch decode
        response_text = tokenizer.batch_decode(output_only_ids, skip_special_tokens=True)[0]

        # For debugging: full generated text including the prompt
        # full_response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        # print(f"Full text (prompt + response): {full_response_text}")

        return final_input_text, response_text, num_generated_tokens, full_prompt_content

    except Exception as e:
        print(f"Error during generation: {e}")
        # Return the error message so it is displayed in the Gradio UI.
        return f"An error occurred: {str(e)}", "", "N/A", "N/A"
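# --- Optional: quick smoke test without the UI ---
# A minimal sketch (not part of the original script) for exercising the
# prediction function directly, e.g. from a Python REPL. The argument names
# match the function signature above; the example prompt is illustrative:
#
#     prompt_sent, answer, n_tokens, combined = generate_response(
#         instruction_text="Please reason step by step, and put your final answer within \\boxed{}.",
#         prompt_question="What is 2 + 2?",
#         think_mode_active=False,
#         max_tokens=128,
#     )
#     print(f"{n_tokens} tokens: {answer}")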
# --- Gradio Interface Definition ---

# Default values from the original script
DEFAULT_INSTRUCTION = "Please reason step by step, and put your final answer within \\boxed{}."
DEFAULT_PROMPT_QUESTION = "The arithmetic mean of 7, 2, $x$ and 10 is 9. What is the value of $x$?"
DEFAULT_THINK_MODE = True
DEFAULT_MAX_TOKENS = 512  # Default value for max_new_tokens

# Define input components
instruction_input = gr.Textbox(
    lines=3,
    label="Instruction",
    value=DEFAULT_INSTRUCTION,
    info="The overall instruction for the model (e.g., reasoning style)."
)
prompt_question_input = gr.Textbox(
    lines=3,
    label="Prompt Question",
    value=DEFAULT_PROMPT_QUESTION,
    info="The specific question or task for the model."
)
think_mode_checkbox = gr.Checkbox(
    label="Enable Think Mode (<think> tag)",
    value=DEFAULT_THINK_MODE,
    info="If checked, appends '<think>' for detailed reasoning. If unchecked, appends '<short>' for concise answers."
)
max_tokens_slider = gr.Slider(
    minimum=32,
    maximum=4096,  # As per the original script's max_new_tokens
    value=DEFAULT_MAX_TOKENS,
    step=32,
    label="Max New Tokens",
    info="Maximum number of tokens to generate for the response."
)

# Define output components
full_prompt_output = gr.Textbox(
    label="Actual Input to Model (with template and <think>/<short> tag)",
    lines=5,
    interactive=False,  # Read-only
    show_copy_button=True
)
response_output = gr.Textbox(
    label="Model Response",
    lines=10,
    interactive=False,  # Read-only
    show_copy_button=True
)
num_tokens_output = gr.Textbox(
    label="Number of Generated Tokens",
    interactive=False  # Read-only
)
original_prompt_output = gr.Textbox(
    label="Original User Prompt (Instruction + Question)",
    lines=3,
    interactive=False,  # Read-only
    show_copy_button=True
)

# Create the Gradio interface.
# The order of the inputs and outputs lists corresponds to the arguments
# and return values of the `generate_response` function.
app_interface = gr.Interface(
    fn=generate_response,
    inputs=[
        instruction_input,
        prompt_question_input,
        think_mode_checkbox,
        max_tokens_slider
    ],
    outputs=[
        full_prompt_output,
        response_output,
        num_tokens_output,
        original_prompt_output  # Added to show the combined instruction + question
    ],
    title="Thinkless Model Interface",
    description=(
        "Interact with the Vinnnf/Thinkless-1.5B-Warmup model. "
        "Provide an instruction and a prompt, choose a thinking mode, and set max tokens. "
        "The model will generate a response based on your input. "
        "Note: model loading might take a few moments when the app starts."
    ),
    allow_flagging='never',  # or 'auto' if you want to enable flagging
    examples=[
        [
            "Please reason step by step, and put your final answer within \\boxed{}.",
            "Sarah has 5 apples. She gives 2 apples to John and then buys 3 more apples. How many apples does Sarah have now?",
            True,
            256
        ],
        [
            "Provide a concise answer.",
            "What is the capital of France?",
            False,
            64
        ],
        [
            "Explain the concept of photosynthesis in simple terms.",
            "What is photosynthesis?",
            True,
            512
        ]
    ]
)
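# Optional (an addition, not in the original script): long generations on shared
# CPU hardware can outlast default request handling. Gradio's built-in request
# queue (`Interface.queue()`, part of the public API) serializes requests:
#
#     app_interface.queue()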
# --- Launch the App ---
if __name__ == "__main__":
    print("Starting Gradio app...")
    # On Hugging Face Spaces, Gradio handles the server automatically.
    # When running locally, this starts a local server.
    app_interface.launch()

# To share on Hugging Face Spaces, you would typically save this file as app.py
# and ensure your requirements.txt includes:
# gradio
# transformers
# torch
# sentencepiece (often a dependency for tokenizers)
# accelerate (if using device_map="auto" effectively with multiple GPUs/CPU offload)
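# A hedged addition, not from the original script: when running inside a Docker
# container or on a remote host, bind to all interfaces and a fixed port.
# Both `server_name` and `server_port` are standard `launch()` parameters:
#
#     app_interface.launch(server_name="0.0.0.0", server_port=7860)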