import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Hugging Face token from Space secrets
HF_TOKEN = os.environ.get("HF_TOKEN")

# Model IDs
BASE_MODEL = "google/gemma-3-1b-it"
LORA_ADAPTER = "markredito/gemma-pip-finetuned-v2"  # 🔁 Replace with your actual LoRA repo

# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Quantization config for 4-bit (recommended on a T4 GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer and base model, then attach the LoRA adapter
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    token=HF_TOKEN,
    attn_implementation="eager",  # Required for Gemma 3 + quantization
)
model = PeftModel.from_pretrained(model, LORA_ADAPTER, token=HF_TOKEN)

# Pad token fallback
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Generation function
def generate_response(prompt, temperature, top_p, top_k):
    # Wrap the prompt in Gemma's chat-template turn markers
    formatted = (
        "<start_of_turn>user\n"
        f"{prompt.strip()}\n"
        "<end_of_turn>\n"
        "<start_of_turn>model\n"
    )
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, then strip any trailing turn marker
    decoded = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    cleaned = decoded.split("<end_of_turn>")[0].replace("model\n", "").strip()
    return cleaned

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## ✨ Gemma Psychedelic Model Demo")
    gr.Markdown("Use your imagination or try one of the examples below to explore poetic and philosophical responses.")
    gr.Markdown("Note: this model intentionally hallucinates.")

    examples = [
        "Describe a world where clouds are solid and people walk on them",
        "Contrast quantum realities from the perspective of a starship navigator, using a spiral into infinity.",
        "Dream up futuristic phenomena from the perspective of a timeless oracle, using a fractal blooming in chaos.",
    ]

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Enter your prompt",
                lines=4,
                placeholder="Try something like: What if gravity took a day off?",
            )
            gr.Examples(examples=examples, inputs=prompt_input, label="Example Prompts")
            temperature = gr.Slider(0.1, 1.5, value=0.7, label="Temperature")
            top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-p (nucleus sampling)")
            top_k = gr.Slider(0, 100, step=1, value=50, label="Top-k")
            submit = gr.Button("Generate")
        with gr.Column():
            output = gr.Textbox(label="Model Response", lines=10)

    submit.click(
        fn=generate_response,
        inputs=[prompt_input, temperature, top_p, top_k],
        outputs=output,
    )

demo.launch()