John6666 committed on
Commit 04c9edc · verified · 1 Parent(s): cb7408b

Upload 3 files

Files changed (3):
  1. README.md +11 -11
  2. app.py +122 -140
  3. requirements.txt +5 -4
README.md CHANGED
@@ -1,12 +1,12 @@
- ---
- title: DeepSeek-R1-Distill-Llama-8B
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 4.36.1
- app_file: app.py
- pinned: false
- ---
-
+ ---
+ title: Chatbot Zero
+ emoji: 💬
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.21.0
+ app_file: app.py
+ pinned: false
+ ---
+
  An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
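The README's one-liner above describes the Inference API path. For orientation only, a minimal `huggingface_hub` chat call of that kind might look like the sketch below; this is not code from this repo, and the model ID is purely illustrative:

```python
from huggingface_hub import InferenceClient

# Illustrative model ID; any chat-capable model served by the
# Hugging Face Inference API would do.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

response = client.chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```

Note that the app.py in this commit loads the model locally with `transformers` rather than calling the Inference API; the README sentence is inherited from the Space template.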
app.py CHANGED
@@ -1,140 +1,122 @@
- import gradio as gr
- import os
- import spaces
- from transformers import GemmaTokenizer, AutoModelForCausalLM
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- from threading import Thread
-
- # Set an environment variable
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
-
- DESCRIPTION = '''
- <div>
- <h1 style="text-align: center;">deepseek-ai/DeepSeek-R1-Distill-Llama-8B</h1>
- </div>
- '''
-
- LICENSE = """
- <p/>
-
- ---
- """
-
- PLACEHOLDER = """
- <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">DeepSeek-R1-Distill-Llama-8B</h1>
-    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
- </div>
- """
-
-
- css = """
- h1 {
-   text-align: center;
-   display: block;
- }
-
- #duplicate-button {
-   margin: auto;
-   color: white;
-   background: #1565c0;
-   border-radius: 100vh;
- }
- """
-
- # Load the tokenizer and model
- tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
- model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B", device_map="auto")  # to("cuda:0")
- terminators = [
-     tokenizer.eos_token_id,
-     tokenizer.convert_tokens_to_ids("<|eot_id|>")
- ]
-
- @spaces.GPU(duration=120)
- def chat_llama3_8b(message: str,
-                    history: list,
-                    temperature: float,
-                    max_new_tokens: int
-                    ) -> str:
-     """
-     Generate a streaming response using the llama3-8b model.
-     Args:
-         message (str): The input message.
-         history (list): The conversation history used by ChatInterface.
-         temperature (float): The temperature for generating the response.
-         max_new_tokens (int): The maximum number of new tokens to generate.
-     Returns:
-         str: The generated response.
-     """
-     conversation = []
-     for user, assistant in history:
-         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-     conversation.append({"role": "user", "content": message})
-
-     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-     generate_kwargs = dict(
-         input_ids=input_ids,
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         temperature=temperature,
-         eos_token_id=terminators,
-     )
-     # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
-     if temperature == 0:
-         generate_kwargs['do_sample'] = False
-
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         #print(outputs)
-         yield "".join(outputs)
-
-
- # Gradio block
- chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
-
- with gr.Blocks(fill_height=True, css=css) as demo:
-
-     gr.Markdown(DESCRIPTION)
-     gr.ChatInterface(
-         fn=chat_llama3_8b,
-         chatbot=chatbot,
-         fill_height=True,
-         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-         additional_inputs=[
-             gr.Slider(minimum=0,
-                       maximum=1,
-                       step=0.1,
-                       value=0.5,
-                       label="Temperature",
-                       render=False),
-             gr.Slider(minimum=128,
-                       maximum=4096,
-                       step=1,
-                       value=1024,
-                       label="Max new tokens",
-                       render=False),
-         ],
-         examples=[
-             ['How to setup a human base on Mars? Give short answer.'],
-             ['Explain theory of relativity to me like I’m 8 years old.'],
-             ['What is 9,000 * 9,000?'],
-             ['Write a pun-filled happy birthday message to my friend Alex.'],
-             ['Justify why a penguin might make a good king of the jungle.']
-         ],
-         cache_examples=False,
-     )
-
-     gr.Markdown(LICENSE)
-
- if __name__ == "__main__":
-     demo.launch()
-
+ import spaces
+ import gradio as gr
+ import os
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+ from threading import Thread
+ import torch
+
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
+ #REPO_ID = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+ REPO_ID = "nicoboss/DeepSeek-R1-Distill-Qwen-32B-Uncensored"
+
+ DESCRIPTION = f'''
+ <div>
+ <h1 style="text-align: center;">{REPO_ID}</h1>
+ </div>
+ '''
+
+ PLACEHOLDER = f"""
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">{REPO_ID}</h1>
+    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
+ </div>
+ """
+
+ css = """
+ h1 {
+   text-align: center;
+   display: block;
+ }
+
+ #duplicate-button {
+   margin: auto;
+   color: white;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
+ """
+
+ tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
+ if torch.cuda.is_available():
+     nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
+     model = AutoModelForCausalLM.from_pretrained(REPO_ID, quantization_config=nf4_config)
+ else: model = AutoModelForCausalLM.from_pretrained(REPO_ID, torch_dtype=torch.float32)
+
+ terminators = [
+     tokenizer.eos_token_id,
+     tokenizer.convert_tokens_to_ids("<|eot_id|>")
+ ]
+
+ @spaces.GPU(duration=59)
+ def chat(message: str,
+          history: list,
+          temperature: float,
+          max_new_tokens: int,
+          progress=gr.Progress(track_tqdm=True)
+          ):
+     try:
+         conversation = []
+         for user, assistant in history:
+             conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+         conversation.append({"role": "user", "content": message})
+
+         input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
+
+         streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+         generate_kwargs = dict(
+             input_ids=input_ids,
+             streamer=streamer,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=temperature,
+             eos_token_id=terminators,
+         )
+         if temperature == 0: generate_kwargs['do_sample'] = False
+
+         t = Thread(target=model.generate, kwargs=generate_kwargs)
+         t.start()
+
+         outputs = []
+         for text in streamer:
+             outputs.append(text)
+             yield "".join(outputs)
+     except Exception as e:
+         print(e)
+         gr.Warning(f"Error: {e}")
+         yield ""
+
+ chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
+
+ with gr.Blocks(fill_height=True, fill_width=True, css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.ChatInterface(
+         fn=chat,
+         chatbot=chatbot,
+         fill_height=True,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+         additional_inputs=[
+             gr.Slider(minimum=0,
+                       maximum=1,
+                       step=0.1,
+                       value=0.5,
+                       label="Temperature",
+                       render=False),
+             gr.Slider(minimum=128,
+                       maximum=4096,
+                       step=1,
+                       value=1024,
+                       label="Max new tokens",
+                       render=False),
+         ],
+         examples=[
+             ['How to setup a human base on Mars? Give short answer.'],
+             ['Explain theory of relativity to me like I’m 8 years old.'],
+             ['What is 9,000 * 9,000?'],
+             ['Write a pun-filled happy birthday message to my friend Alex.'],
+             ['Justify why a penguin might make a good king of the jungle.']
+         ],
+         cache_examples=False)
+
+ if __name__ == "__main__":
+     demo.launch(ssr_mode=False)
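The main functional change above is the quantized load path. As a minimal standalone sketch of that pattern (illustrative repo ID; assumes `bitsandbytes` is installed and a CUDA GPU is available for the 4-bit branch):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

REPO_ID = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # illustrative repo ID

tokenizer = AutoTokenizer.from_pretrained(REPO_ID)

if torch.cuda.is_available():
    # NF4 4-bit weights with double quantization; matmuls run in bfloat16.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(REPO_ID, quantization_config=nf4_config)
else:
    # CPU fallback: plain float32 weights, no bitsandbytes required.
    model = AutoModelForCausalLM.from_pretrained(REPO_ID, torch_dtype=torch.float32)
```

4-bit NF4 storage cuts weight memory to roughly a quarter of fp16, which is what lets a 32B model fit on a single Space GPU; double quantization additionally compresses the quantization constants, saving about 0.4 bits per parameter per the QLoRA paper.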
requirements.txt CHANGED
@@ -1,4 +1,5 @@
- huggingface_hub
- accelerate
- transformers
- SentencePiece
+ huggingface_hub
+ accelerate
+ transformers
+ sentencepiece
+ bitsandbytes