import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 1) load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    load_in_4bit=True,            # 4-bit quantization (requires bitsandbytes); comment out for full precision
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
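
# Note: newer transformers releases prefer a quantization config over the bare
# load_in_4bit flag. A minimal sketch, assuming bitsandbytes is installed:
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(load_in_4bit=True,
#                                     bnb_4bit_compute_dtype=torch.float16)
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_ID, quantization_config=quant_config, device_map="auto")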

# 2) define inference function (gr.ChatInterface calls it as fn(message, history))
def generate(message, history):
    """
    message: the latest user message (str)
    history: prior turns as [{"role": ..., "content": ...}] dicts (type="messages")
    returns: the model's reply (str)
    """
    # rebuild a single prompt string from the history plus the new message
    prompt = ""
    for turn in history:
        role = "User" if turn["role"] == "user" else "Assistant"
        prompt += f"{role}: {turn['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # keep only the text generated after the final "Assistant:" marker
    reply = text.split("Assistant:")[-1].strip()
    return reply

# 3) build Gradio ChatInterface
demo = gr.ChatInterface(
    fn=generate,
    title="TinyLlama-1.1B Chat API",
    description="Chat with TinyLlama-1.1B (also callable through the Gradio client API)",
    type="messages",
)

# 4) launch
if __name__ == "__main__":
    demo.launch()
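
# Example client call (a sketch, assuming the gradio_client package; ChatInterface
# registers its endpoint as "/chat" by default, but adjust the URL and parameter
# names to your Gradio version/deployment):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   reply = client.predict(message="Hello!", api_name="/chat")
#   print(reply)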