dasomaru committed (verified)
Commit c1f976c · 1 Parent(s): a612272

Update app.py

Files changed (1)
  1. app.py +34 -31
app.py CHANGED
@@ -1,31 +1,34 @@
- import gradio as gr
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- model_name = "dasomaru/gemma-3-4bit-it-demo"
-
- # 🚀 Load the model and tokenizer only once, at server startup
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     torch_dtype=torch.float16,
-     device_map="auto",  # on ZeroGPU the GPU is assigned automatically
-     trust_remote_code=True,
- )
-
- @spaces.GPU(duration=300)
- def generate_response(prompt):
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=512,
-         temperature=0.7,
-         top_p=0.9,
-         top_k=50,
-         do_sample=True,
-     )
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
- demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
- demo.launch()
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_name = "dasomaru/gemma-3-4bit-it-demo"
+
+ # 🚀 The tokenizer can be preloaded, even on CPU
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ # 🚀 Load the model onto CPU only for now (no GPU attached yet)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.float16,  # since this is a 4-bit model
+     trust_remote_code=True,
+ )
+
+ @spaces.GPU(duration=300)
+ def generate_response(prompt):
+     # Load the model and tokenizer inside the function,
+     # once the GPU has been attached
+     tokenizer = AutoTokenizer.from_pretrained("dasomaru/gemma-3-4bit-it-demo")
+     model = AutoModelForCausalLM.from_pretrained("dasomaru/gemma-3-4bit-it-demo")
+     model.to("cuda")
+
+     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=512,
+         temperature=0.7,
+         top_p=0.9,
+         top_k=50,
+         do_sample=True,
+     )
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
+ demo.launch()
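
The change this diff makes is driven by how ZeroGPU Spaces work: no GPU is attached at import time, so weights can only reach CUDA inside the @spaces.GPU-decorated function. As committed, the handler reloads the model from the Hub on every call. Below is a minimal sketch of a common variant that reuses the module-level CPU copy and only moves it to the GPU inside the handler; the identifiers mirror the diff, but the reuse logic is an illustrative assumption, not part of this commit.

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "dasomaru/gemma-3-4bit-it-demo"

# One-time CPU load at startup (no GPU is attached yet on ZeroGPU).
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

@spaces.GPU(duration=300)
def generate_response(prompt):
    # The GPU exists only inside this call: move the preloaded weights
    # over instead of re-downloading them on every request (assumption,
    # not the committed behavior).
    model.to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
demo.launch()

The trade-off: reloading inside the handler (as the commit does) keeps each GPU session self-contained, while reusing the CPU copy avoids repeated from_pretrained calls at the cost of keeping the weights resident in RAM between requests.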