# -*- coding: utf-8 -*-
# Nour Eddine Zekaoui et al.
import os

import torch
import spaces
import gradio as gr
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
)

# Read the Hugging Face access token from the environment
HF_TOKEN = os.environ.get("HF_TOKEN")


def generate_prompt(instruction, input=None):
    """Build an Alpaca-style prompt from an instruction and optional input."""
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""


based_model_path = "meta-llama/Meta-Llama-3-8B"
lora_weights = "NouRed/BioMed-Tuned-Llama-3-8b"

# 4-bit (QLoRA-style) quantization settings
load_in_4bit = True
bnb_4bit_use_double_quant = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = torch.bfloat16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(
    based_model_path,
)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

quantization_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
)

# Load the quantized base model, then attach the BioMed LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    based_model_path,
    device_map="auto",
    attn_implementation="flash_attention_2",  # I have an A100 GPU with 40GB of RAM 😎
    quantization_config=quantization_config,
)

model = PeftModel.from_pretrained(
    base_model,
    lora_weights,
    torch_dtype=torch.float16,
)


@spaces.GPU
def generate(
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.9,
        top_k=40,
        num_beams=4,
        max_new_tokens=128,
        do_sample=True,
        **kwargs):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            temperature=temperature,  # forward the sampling controls exposed in the UI
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
        )

    output = tokenizer.decode(
        generated_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    # Keep only the text generated after the "### Response:" marker
    response = output.split("### Response:")[1].strip()
    return response
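
# Optional smoke test (a hedged sketch, not part of the original app): call
# generate() once outside the UI to confirm that the quantized base model and
# the LoRA adapter load and that the "### Response:" split yields text. The
# sample instruction is illustrative only; uncomment to run on a GPU machine.
#
# print(generate("Suggest treatment for pneumonia", max_new_tokens=64))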
""" gr.Interface( fn=generate, inputs=[ gr.components.Textbox( lines=2, label="Instruction", placeholder="Tell me about Covid-19?", ), gr.components.Textbox(lines=2, label="Input", placeholder="none"), gr.components.Slider( minimum=0, maximum=1, value=0.1, label="Temperature" ), gr.components.Slider( minimum=0, maximum=1, value=0.9, label="Top p" ), gr.components.Slider( minimum=0, maximum=100, step=1, value=40, label="Top k" ), gr.components.Slider( minimum=1, maximum=4, step=1, value=4, label="Beams" ), gr.components.Slider( minimum=1, maximum=2000, step=1, value=128, label="Max tokens" ), gr.components.Checkbox( value=True, label="Do Sample", info="Do you want to use sampling during text generation?" ), ], outputs=[ gr.components.Textbox( lines=5, label="Output", ) ], examples=[ ["Suggest treatment for pneumonia", "", 0.1, 0.9, 40, 4, 128, True], ["I have a sore throat, slight cough, tiredness. should i get tested fro covid 19?", "", 0.1, 0.9, 40, 4, 128, True], ["Husband of this patient asked me how to treat premature ejaculation and how to increase her libido.", "", 0.1, 0.9, 40, 4, 128, True], ], theme="soft", description=description, # noqa: E501 ).launch()