# NOTE: page-status residue captured with this file from the hosting
# Hugging Face Space: "Spaces: Runtime error".
# This space is mostly a copy of the work of Aritra Roy Gosthipaty
# (see https://huggingface.co/spaces/ariG23498/kv-press/blob/main/app.py).
import spaces | |
import requests | |
import gradio as gr | |
from bs4 import BeautifulSoup | |
from transformers import pipeline | |
from kvpress import ( | |
ExpectedAttentionPress, | |
KnormPress, | |
RandomPress, | |
SnapKVPress, | |
StreamingLLMPress, | |
TOVAPress, | |
) | |
# Registry of the selectable presses: maps the name shown in the UI dropdown
# to the kvpress class that is instantiated for a request.
press_dict = dict(
    ExpectedAttentionPress=ExpectedAttentionPress,
    KnormPress=KnormPress,
    RandomPress=RandomPress,
    SnapKVPress=SnapKVPress,
    StreamingLLMPress=StreamingLLMPress,
    TOVAPress=TOVAPress,
)
def process_request(url, question, press_name, compression_ratio):
    """Answer a question about the article at ``url`` using a compressed KV cache.

    The page is downloaded, the text of all its ``<p>`` elements is joined
    into a single context, and the module-level ``pipe`` is asked
    ``question`` about that context with the selected press applied.

    Args:
        url: Address of the article to download.
        question: Question forwarded to the pipeline.
        press_name: Key of ``press_dict`` selecting the press class.
        compression_ratio: Value passed to the press constructor.

    Returns:
        A ``(text, num_tokens)`` tuple. On success ``text`` is the pipeline's
        answer and ``num_tokens`` is the token count of the context. On any
        failure ``text`` is an error message and ``num_tokens`` is ``-1``;
        the function reports errors this way instead of raising.
    """
    if press_name not in press_dict:
        return f"Invalid press type selected: {press_name}", -1

    # Fetch the Wikipedia article. The timeout keeps a stalled server from
    # blocking the request forever, and raise_for_status() turns HTTP error
    # pages (404, 403, 5xx, ...) into a reported error instead of letting the
    # model answer questions about the error page itself.
    fetch_timeout_seconds = 30
    try:
        response = requests.get(url, timeout=fetch_timeout_seconds)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error fetching the Wikipedia article: {str(e)}", -1

    try:
        # Parse the Wikipedia HTML and keep only the paragraph text
        soup = BeautifulSoup(response.content, "html.parser")
        article_text = "".join(p.text for p in soup.find_all("p"))
        if not article_text.strip():
            # Nothing to ask about: avoid running the model on an empty context.
            return "Error: no paragraph text was found at the given URL.", -1
        context = article_text + "\n\n"

        # Initialize the press
        press = press_dict[press_name](compression_ratio)

        # Token count of the context as seen by the pipeline's tokenizer
        num_tokens = pipe.tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
        pred_answer = pipe(context, question=question, press=press)["answer"]
        return pred_answer, num_tokens
    except Exception as e:
        # Boundary of the UI callback: report every failure as text so the
        # interface shows a message rather than a stack trace.
        if "CUDA out of memory" in str(e):
            return "Error: CUDA out of memory. Try using a smaller article or a lower compression ratio.", -1
        return str(e), -1
def gradio_interface():
    """Build the Gradio Blocks UI for the question-answering demo.

    The layout contains an introduction, text boxes for the article URL and
    the question, a dropdown listing the keys of ``press_dict``, a slider for
    the compression ratio, two output fields, a submit button and one
    pre-filled example. Clicking the button calls ``process_request`` with
    the four inputs and writes its two return values to the output fields.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    # Assembled line by line so the rendered text does not depend on how this
    # function happens to be indented.
    intro_markdown = "\n".join(
        (
            "",
            "# Wikipedia Article Question Answering with kvpress",
            "This demo uses the llama 3.1 8B Instruct model to answer questions about any given Wikipedia article.",
            "Under the hood, [kvpress](https://github.com/NVIDIA/kvpress) *compresses the key-value (KV) cache* "
            "associated with the article, helping reduce memory usage and accelerate decoding.",
            "**How to use:**",
            "1. Enter a Wikipedia article URL",
            "2. Type your question",
            "3. Select a press type and the desired compression ratio",
            # Only one token count is produced by the callback, so the text
            # no longer promises "before and after compression" statistics.
            '4. Press "Submit" to see the answer, along with the number of tokens in the article',
            "",
        )
    )

    with gr.Blocks() as demo:
        gr.Markdown(intro_markdown)

        with gr.Row():
            url_box = gr.Textbox(
                label="Wikipedia Article URL",
                placeholder="Enter the Wikipedia article URL here",
            )
            question_box = gr.Textbox(
                label="Question",
                placeholder="Type your question here",
            )

        with gr.Row():
            press_dropdown = gr.Dropdown(
                choices=list(press_dict),
                value="ExpectedAttentionPress",
                label="Select Press Type",
            )
            # NOTE(review): kvpress presses are understood to require
            # 0 <= compression_ratio < 1, so 1.0 must not be selectable
            # (it previously was) - confirm against the installed kvpress.
            ratio_slider = gr.Slider(
                minimum=0.1,
                maximum=0.9,
                step=0.1,
                value=0.5,
                label="Compression Ratio",
            )

        answer_box = gr.Textbox(label="Output", lines=10)
        token_count_box = gr.Number(label="Number of Tokens", interactive=False)
        submit_button = gr.Button("Submit")

        # The same four components feed both the example row and the callback.
        request_inputs = [url_box, question_box, press_dropdown, ratio_slider]

        gr.Examples(
            examples=[
                [
                    "https://en.wikipedia.org/wiki/Nvidia",
                    "Complete this sentence: The Nvidia GeForce Partner Program was a ...",
                    "ExpectedAttentionPress",
                    0.5,
                ],
            ],
            inputs=request_inputs,
        )

        submit_button.click(
            process_request,
            inputs=request_inputs,
            outputs=[answer_box, token_count_box],
        )

    return demo
if __name__ == "__main__":
    # Create the generation pipeline once at start-up; process_request reads
    # it through the module-level name `pipe`.
    device = "cuda:0"
    ckpt = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    pipe = pipeline(
        "kv-press-text-generation",
        model=ckpt,
        device=device,
        torch_dtype="auto",
    )

    # Build the UI and start serving it.
    demo = gradio_interface()
    demo.launch()