NexusRaven

Paused

App Files Files Community

NexusRaven / app.py

Tonic

Update app.py

099b4ce almost 2 years ago

raw

history blame

4.16 kB

	from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, MistralForCausalLM
	from peft import PeftModel, PeftConfig
	import torch
	import gradio as gr
	import random
	from textwrap import wrap

	# Sample prompts offered as one-click examples in the chat UI.
	_EXAMPLE_PROMPTS = (
	    "Hey Falcon! Any recommendations for my holidays in Abu Dhabi?",
	    "What's the Everett interpretation of quantum mechanics?",
	    "Give me a list of the top 10 dive sites you would recommend around the world.",
	    "Can you tell me more about deep-water soloing?",
	    "Can you write a short tweet about the release of our latest AI model, Falcon LLM?",
	)

	# Each example is a one-element row: only the chat message is pre-filled.
	EXAMPLES = [[example_prompt] for example_prompt in _EXAMPLE_PROMPTS]


	# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
	# NOTE(review): nothing else in this file reads `device`, so the model is
	# never explicitly moved to it - confirm whether a `.to(device)` is intended.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Hub ids: the first supplies the tokenizer and the model config, the second
	# supplies both the checkpoint handed to AutoModelForCausalLM and the adapter
	# handed to PeftModel.
	base_model_id = "tiiuae/falcon-7b-instruct"
	model_directory = "Tonic/GaiaMiniMed"

	tokenizer = AutoTokenizer.from_pretrained(
	    base_model_id,
	    padding_side="left",
	    trust_remote_code=True,
	)
	model_config = AutoConfig.from_pretrained(base_model_id)

	# Load the causal LM first, then wrap it with the PEFT adapter.
	peft_model = PeftModel.from_pretrained(
	    AutoModelForCausalLM.from_pretrained(model_directory, config=model_config),
	    model_directory,
	)

	def format_prompt(message, history, system_prompt):
	    """Build the plain-text chat prompt sent to the model.

	    The prompt is a sequence of newline-separated turns: an optional
	    "System: ..." line, then a "User: ..." / "Falcon: ..." pair for every
	    past exchange, then the new user message, and finally a bare "Falcon:"
	    cue (no trailing newline) for the model to complete.

	    Args:
	        message: The new user message.
	        history: Iterable of (user_prompt, bot_response) pairs from earlier
	            turns. None or empty means there is no prior conversation.
	        system_prompt: Optional system instruction; skipped when falsy.

	    Returns:
	        The formatted prompt string.
	    """
	    turns = []
	    if system_prompt:
	        turns.append(f"System: {system_prompt}")
	    for user_prompt, bot_response in history or ():
	        turns.append(f"User: {user_prompt}")
	        # A turn whose reply never arrived is stored as None; leave it out
	        # instead of feeding the literal text "None" back to the model.
	        if bot_response is not None:
	            # The "Falcon: " speaker prefix is added here.
	            turns.append(f"Falcon: {bot_response}")
	    turns.append(f"User: {message}")
	    turns.append("Falcon:")
	    # Join with explicit newlines so no source indentation leaks into the prompt.
	    return "\n".join(turns)

	# Seed for torch's sampling RNG. It is bumped after every request so that
	# sending the same prompt twice does not replay the same sample.
	seed = 42

	# Text markers that end the assistant's turn: the explicit "[END]" tag and the
	# start of a new "User:" turn as laid out by format_prompt.
	STOP_SEQUENCES = ("[END]", "\nUser:")


	def _truncate_at_stop(text):
	    """Cut text at the earliest stop marker and strip trailing whitespace."""
	    cut = len(text)
	    for stop_str in STOP_SEQUENCES:
	        idx = text.find(stop_str)
	        if idx != -1:
	            cut = min(cut, idx)
	    return text[:cut].rstrip()


	def generate(
	    prompt, history, system_prompt="", temperature=0.9, max_new_tokens=500, top_p=0.95, repetition_penalty=1.0,
	):
	    """Generate the assistant's reply with the locally loaded PEFT model.

	    This is a generator so it can be used as a chat callback; it yields the
	    finished reply once.

	    Args:
	        prompt: The new user message.
	        history: Earlier (user, assistant) turns, forwarded to format_prompt.
	        system_prompt: Optional system instruction.
	        temperature: Sampling temperature; values below 0.01 are raised to
	            0.01 so sampling stays numerically valid.
	        max_new_tokens: Upper bound on generated tokens; raised to at least 1.
	        top_p: Nucleus-sampling probability mass.
	        repetition_penalty: Penalty applied to already generated tokens.

	    Yields:
	        The reply text, truncated at the first stop marker.

	    Raises:
	        gr.Error: If tokenization or generation fails.
	    """
	    global seed

	    temperature = max(float(temperature), 1e-2)
	    top_p = float(top_p)
	    repetition_penalty = float(repetition_penalty)
	    # The UI slider can deliver 0 (and floats); generation needs a positive int.
	    max_new_tokens = max(int(max_new_tokens), 1)

	    formatted_prompt = format_prompt(prompt, history, system_prompt)

	    # Seed the RNG for this request, then advance the seed for the next one.
	    torch.manual_seed(seed)
	    seed = seed + 1

	    try:
	        encoded = tokenizer(formatted_prompt, return_tensors="pt")
	        # Put the inputs wherever the model's weights currently live.
	        input_ids = encoded["input_ids"].to(peft_model.device)
	        attention_mask = encoded["attention_mask"].to(peft_model.device)
	        with torch.no_grad():
	            generated = peft_model.generate(
	                input_ids=input_ids,
	                attention_mask=attention_mask,
	                do_sample=True,
	                temperature=temperature,
	                top_p=top_p,
	                repetition_penalty=repetition_penalty,
	                max_new_tokens=max_new_tokens,
	                # Reuse EOS as the padding id in case the tokenizer defines none.
	                pad_token_id=tokenizer.eos_token_id,
	            )
	        # Drop the echoed prompt; keep only the newly generated tokens.
	        new_tokens = generated[0][input_ids.shape[-1]:]
	        output = tokenizer.decode(new_tokens, skip_special_tokens=True)
	    except Exception as e:
	        raise gr.Error(f"Error while generating: {e}") from e

	    yield _truncate_at_stop(output)


	# Settings for the four sampling sliders, in the order of generate()'s
	# temperature, max_new_tokens, top_p and repetition_penalty parameters.
	_SLIDER_SPECS = (
	    dict(
	        label="Temperature",
	        value=0.9,
	        minimum=0.0,
	        maximum=1.0,
	        step=0.05,
	        info="Higher values produce more diverse outputs",
	    ),
	    dict(
	        label="Max new tokens",
	        value=256,
	        minimum=0,
	        maximum=3000,
	        step=64,
	        info="The maximum numbers of new tokens",
	    ),
	    dict(
	        label="Top-p (nucleus sampling)",
	        value=0.90,
	        minimum=0.01,
	        maximum=0.99,
	        step=0.05,
	        info="Higher values sample more low-probability tokens",
	    ),
	    dict(
	        label="Repetition penalty",
	        value=1.2,
	        minimum=1.0,
	        maximum=2.0,
	        step=0.05,
	        info="Penalize repeated tokens",
	    ),
	)

	# Extra controls for the chat UI: the system-prompt box first, then one
	# interactive slider per spec.
	additional_inputs = [gr.Textbox("", label="Optional system prompt")]
	additional_inputs.extend(
	    gr.Slider(interactive=True, **slider_spec) for slider_spec in _SLIDER_SPECS
	)


	# Page heading and introduction shown above the chat widget.
	title = "👋🏻Welcome to Tonic's GaiaMiniMed🦅⚕️Falcon Chat🚀"
	description = "You can use this Space to test out the current model [(Tonic/GaiaMiniMed)](https://huggingface.co/Tonic/GaiaMiniMed) with chat memory optimized for falcon models. or duplicate this Space and use it locally or on 🤗HuggingFace. [Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."

	with gr.Blocks() as demo:
	    # Render the heading and description; previously they were assigned but
	    # never added to the layout, so the page showed only the bare chat box.
	    gr.Markdown(f"# {title}")
	    gr.Markdown(description)

	    gr.ChatInterface(
	        generate,
	        examples=EXAMPLES,
	        additional_inputs=additional_inputs,
	    )

	# NOTE(review): `concurrency_count` is accepted by Gradio 3.x queues only;
	# confirm the pinned Gradio version before upgrading.
	demo.queue(concurrency_count=100, api_open=False).launch(show_api=False)