Spaces:

nvidia
/

kvpress

Runtime error

App Files Files Community

kvpress / app.py

simjeg

Update app.py

3ff5cda 6 months ago

raw

history blame

4.18 kB

	# This space is mostly a copy of the work of Aritra Roy Gosthipaty (see https://huggingface.co/spaces/ariG23498/kv-press/blob/main/app.py)

	import spaces
	import requests
	import gradio as gr
	from bs4 import BeautifulSoup
	from transformers import pipeline

	from kvpress import (
	ExpectedAttentionPress,
	KnormPress,
	RandomPress,
	SnapKVPress,
	StreamingLLMPress,
	TOVAPress,
	)

	# Maps the press names offered in the UI dropdown to the kvpress classes
	# imported above. process_request() validates the selected name against
	# these keys and calls the mapped class with the chosen compression ratio;
	# gradio_interface() uses the keys as the dropdown choices.
	press_dict = {
	"ExpectedAttentionPress": ExpectedAttentionPress,
	"KnormPress": KnormPress,
	"RandomPress": RandomPress,
	"SnapKVPress": SnapKVPress,
	"StreamingLLMPress": StreamingLLMPress,
	"TOVAPress": TOVAPress,
	}


	@spaces.GPU
	def process_request(url, question, press_name, compression_ratio):
	    """Answer a question about a web article using a press-compressed KV cache.

	    Downloads ``url``, joins the text of every ``<p>`` element into a single
	    context string, and runs the module-level ``pipe`` on that context with
	    the selected press.

	    Args:
	        url: Address of the article to download.
	        question: Question forwarded to the pipeline.
	        press_name: Key of ``press_dict`` selecting the press class.
	        compression_ratio: Value passed to the press constructor.

	    Returns:
	        A ``(text, num_tokens)`` tuple. On success ``text`` is the pipeline's
	        answer and ``num_tokens`` is the token count of the context. On any
	        failure ``text`` is an error message and ``num_tokens`` is ``-1``;
	        errors are reported through the return value rather than raised so
	        they show up in the output textbox.
	    """

	    if press_name not in press_dict:
	        return f"Invalid press type selected: {press_name}", -1

	    # Fetch the Wikipedia article
	    try:
	        # Without a timeout a stalled server would block this request forever.
	        response = requests.get(url, timeout=30)
	        # Turn HTTP error statuses (404, 403, 500, ...) into an exception so an
	        # error page is never parsed and answered as if it were the article.
	        response.raise_for_status()
	    except requests.exceptions.RequestException as e:
	        return f"Error fetching the Wikipedia article: {e}", -1

	    try:
	        # Parse the HTML and keep only the paragraph text
	        soup = BeautifulSoup(response.content, "html.parser")
	        article_text = "".join(p.text for p in soup.find_all("p"))
	        if not article_text.strip():
	            # Nothing to answer from; skip the model call entirely.
	            return "Error: no paragraph text was found at the given URL.", -1
	        context = article_text + "\n\n"

	        # Initialize the press
	        press = press_dict[press_name](compression_ratio)
	        num_tokens = pipe.tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
	        pred_answer = pipe(context, question=question, press=press)["answer"]

	        return pred_answer, num_tokens
	    except Exception as e:
	        # Broad catch is deliberate: this is the UI boundary, and any failure
	        # is surfaced to the user as text instead of crashing the request.
	        if "CUDA out of memory" in str(e):
	            return "Error: CUDA out of memory. Try using a smaller article or a lower compression ratio.", -1
	        return str(e), -1


	def gradio_interface():
	    """Assemble the demo UI and return the ``gr.Blocks`` object (not launched).

	    The layout consists of an introductory markdown panel, a row with the URL
	    and question textboxes, a row with the press dropdown and compression
	    slider, two output widgets, a submit button and one example row. Clicking
	    the button calls ``process_request`` with the four inputs and writes its
	    two return values into the output widgets.
	    """
	    intro_text = """
	# Wikipedia Article Question Answering with kvpress
	This demo uses the llama 3.1 8B Instruct model to answer questions about any given Wikipedia article.
	Under the hood, [kvpress](https://github.com/NVIDIA/kvpress) compresses the key-value (KV) cache associated with the article, helping reduce memory usage and accelerate decoding.

	How to use:
	1. Enter a Wikipedia article URL
	2. Type your question
	3. Select a press type and the desired compression ratio
	4. Press "Submit" to see the answer, along with token statistics before and after compression
	"""
	    example_row = [
	        "https://en.wikipedia.org/wiki/Nvidia",
	        "Complete this sentence: The Nvidia GeForce Partner Program was a ...",
	        "ExpectedAttentionPress",
	        0.5,
	    ]

	    with gr.Blocks() as ui:
	        gr.Markdown(intro_text)

	        # Article and question
	        with gr.Row():
	            url_box = gr.Textbox(
	                label="Wikipedia Article URL",
	                placeholder="Enter the Wikipedia article URL here",
	            )
	            question_box = gr.Textbox(
	                label="Question",
	                placeholder="Type your question here",
	            )

	        # Compression settings
	        with gr.Row():
	            press_dropdown = gr.Dropdown(
	                choices=list(press_dict.keys()),
	                value="ExpectedAttentionPress",
	                label="Select Press Type",
	            )
	            # NOTE(review): the slider allows 1.0 - confirm every press class
	            # accepts a compression ratio of exactly 1.0.
	            ratio_slider = gr.Slider(
	                minimum=0.1,
	                maximum=1.0,
	                step=0.1,
	                value=0.5,
	                label="Compression Ratio",
	            )

	        # Results
	        answer_box = gr.Textbox(label="Output", lines=10)
	        token_count_box = gr.Number(label="Number of Tokens", interactive=False)

	        run_button = gr.Button("Submit")

	        # The same four components feed both the examples and the click handler.
	        request_inputs = [url_box, question_box, press_dropdown, ratio_slider]

	        gr.Examples(examples=[example_row], inputs=request_inputs)

	        run_button.click(
	            process_request,
	            inputs=request_inputs,
	            outputs=[answer_box, token_count_box],
	        )

	    return ui


	if __name__ == "__main__":

	    # Load the question-answering pipeline once at startup. It is bound to the
	    # module-level name `pipe`, which process_request() reads as a global.
	    device = "cuda:0"
	    ckpt = "meta-llama/Meta-Llama-3.1-8B-Instruct"
	    pipe = pipeline(
	        "kv-press-text-generation",
	        model=ckpt,
	        device=device,
	        torch_dtype="auto",
	    )

	    # Build the UI and start serving it
	    demo = gradio_interface()
	    demo.launch()