# https://www.gradio.app/guides/using-hugging-face-integrations

import gradio as gr
import logging
import html
import time
import torch
from   threading import Thread
from   transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model (uncomment one of the alternatives to swap models; the local path below is what actually gets loaded)
# model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
model_name = "/models/llm/hf/mistralai_Mistral-7B-Instruct-v0.1"

# UI Settings
title = "Shisa 7B"
description = "Test out Shisa 7B in either English or Japanese."
placeholder = "Type Here / ここにε…₯εŠ›γ—γ¦γγ γ•γ„" 
examples = [
    "Hello, how are you?", 
    "γ“γ‚“γ«γ‘γ―γ€ε…ƒζ°—γ§γ™γ‹οΌŸ",
    "γŠγ£γ™γ€ε…ƒζ°—οΌŸ",
    "γ“γ‚“γ«γ‘γ―γ€γ„γ‹γŒγŠιŽγ”γ—γ§γ™γ‹οΌŸ",
]

# LLM Settings
system_prompt = 'You are a helpful, friendly assistant.'
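# Note: this history is module-level, so every session connecting to the demo shares the same conversation state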
chat_history = [{"role": "system", "content": system_prompt}]
tokenizer = AutoTokenizer.from_pretrained(model_name)
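# Override the tokenizer's default template with a Llama-2-style [INST] / <<SYS>> chat template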
tokenizer.chat_template = "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_8bit=True,
)
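# The streamer yields decoded tokens as they are generated; a single instance is shared by all chat() calls,
# so the demo handles one generation at a time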
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

def chat(message, history):
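    # Gradio's own `history` argument is ignored; the module-level chat_history carries the full conversation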
    chat_history.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt").to('cuda')
    generate_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
    )
    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face
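    # generate() runs in a background thread while we consume tokens from the streamer below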
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token  # html.escape(new_token)
        yield partial_message
    # Append the assistant turn to the shared history so later prompts include it
    chat_history.append({"role": "assistant", "content": partial_message})

    '''
    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#streaming-chatbots
    for i in range(len(message)):
        time.sleep(0.3)
        yield message[: i+1]
    '''


chat_interface = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
    title=title,
    description=description,
    theme="soft",
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219 - we use this `with` construction because Gradio barfs on autoreload otherwise
with gr.Blocks() as demo:
    chat_interface.render()
    gr.Markdown("You can try these greetings in English, Japanese, familiar Japanese, or formal Japanese. We limit output to 200 tokens.")


demo.queue().launch()

'''
# Works for Text input...
demo = gr.Interface.from_pipeline(pipe)
'''

'''
def chat(message, history):
    print("foo")
    for i in range(len(message)):
        time.sleep(0.3)
        yield "You typed: " + message[: i+1]
    # print('history:', history)
    # print('message:', message)
    # for new_text in streamer:
    #    yield new_text


'''


'''
# Docs: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/conversational.py
conversation = Conversation()
conversation.add_message({"role": "system", "content": system})
device = torch.device('cuda')
pipe = pipeline(
    'conversational', 
    model=model, 
    tokenizer=tokenizer,
    streamer=streamer,

)

def chat(input, history):
    conversation.add_message({"role": "user", "content": input})
    # we do this shuffle so local shadow response doesn't get created
    response_conversation = pipe(conversation)
    print("foo:", response_conversation.messages[-1]["content"])

    conversation.add_message(response_conversation.messages[-1])
    print("boo:", response_conversation.messages[-1]["content"])
    response = conversation.messages[-1]["content"]
    response = "ping"
    return response

demo = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
    title=title,
    description=description,
    theme="soft",
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()

# For async
# ).queue().launch()
'''