username committed on
Commit 06d92b4 · 1 Parent(s): 457d89f
Files changed (1)
  1. app.py +30 -25
app.py CHANGED
@@ -1,49 +1,54 @@
 import torch
 import gradio as gr
 import spaces
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from threading import Thread
 
-tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-3-8x1.8b-instruct3")
-model = None
+model_id = "llm-jp/llm-jp-3-8x1.8b-instruct3"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16
+)
 
 @spaces.GPU
 def generate_text(system_prompt, user_input, max_length=512, temperature=0.7, top_p=0.95):
-    global model
-    if model is None:
-        model = AutoModelForCausalLM.from_pretrained(
-            "llm-jp/llm-jp-3-8x1.8b-instruct3",
-            device_map="auto",
-            torch_dtype=torch.bfloat16
-        )
-
     chat = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": user_input},
     ]
 
-    tokenized_input = tokenizer.apply_chat_template(
+    input_ids = tokenizer.apply_chat_template(
         chat,
         add_generation_prompt=True,
-        tokenize=True,
         return_tensors="pt"
     ).to(model.device)
 
-    with torch.no_grad():
-        output = model.generate(
-            tokenized_input,
-            max_new_tokens=max_length,
-            do_sample=True,
-            top_p=top_p,
-            temperature=temperature,
-            repetition_penalty=1.05,
-        )[0]
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = {
+        "input_ids": input_ids,
+        "streamer": streamer,
+        "max_new_tokens": max_length,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "repetition_penalty": 1.05
+    }
+
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    response = ""
+    for text in streamer:
+        response += text
 
-    generated_text = tokenizer.decode(output, skip_special_tokens=True)
-    return generated_text
+    return response
 
 with gr.Blocks() as demo:
     gr.Markdown("# LLM-JP-3-8x1.8b-instruct3 非公式デモ")
-    gr.Markdown("国立情報学研究所大規模言語モデル研究開発センターの開発した日本語大規模言語モデル「LLM-JP-3」の非公式デモ。詳細は[こちらの記事](https://llm-jp.nii.ac.jp/blog/2025/03/27/moe3.html)をご覧ください。ZeroGPU を使用しています。")
+    gr.Markdown("国立情報学研究所大規模言語モデル研究開発センター(LLMC)が開発した日本語大規模言語モデル LLM-jp-3 MoE 8x1.8B の非公式デモです。ZeroGPU を使用しています。")
 
     with gr.Row():
         with gr.Column():
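One caveat on the new version: generate_text still accumulates the whole stream into response and returns it once, so TextIteratorStreamer here only moves decoding onto a background thread; the Gradio UI receives a single update. If token-by-token display were the goal, a minimal sketch (my assumption, not part of this commit) is to make the handler a generator, since Gradio treats functions that yield as streaming handlers. The name generate_text_streaming is hypothetical; the setup reuses the module-level tokenizer and model from the diff above.

@spaces.GPU
def generate_text_streaming(system_prompt, user_input, max_length=512, temperature=0.7, top_p=0.95):
    # Sketch only, not from this commit: same setup as generate_text above,
    # but yielding partial text so Gradio can re-render on every chunk.
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
    ]
    input_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs={
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_length,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "repetition_penalty": 1.05,
    }).start()

    response = ""
    for text in streamer:
        response += text
        yield response  # each yield pushes the partial response to the bound output

Wired to a button click in the Blocks layout (the rest of which is cut off by this hunk), the output textbox would then fill in incrementally instead of appearing all at once.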