dasomaru committed on
Commit
ea7fd0e
·
verified ·
1 Parent(s): f755cf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -21
app.py CHANGED
# Model repo used for both the tokenizer and the causal-LM weights.
model_name = "dasomaru/gemma-3-4bit-it-demo"

# The tokenizer needs no GPU, so load it once at server startup.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model on CPU at startup; under ZeroGPU a CUDA device is only
# attached while a @spaces.GPU-decorated function is running.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)


@spaces.GPU(duration=300)
def generate_response(prompt):
    """Generate a text completion for *prompt* with the preloaded model.

    Runs inside a ZeroGPU context (duration capped at 300 s), so the
    CUDA device exists only while this function executes.

    Args:
        prompt: The user's input text.

    Returns:
        The decoded model output, special tokens stripped.
    """
    # Reuse the module-level model instead of re-downloading it on every
    # request. The previous per-call reload also omitted
    # torch_dtype=torch.float16 and trust_remote_code=True, making the
    # in-function load inconsistent with the startup load.
    model.to("cuda")

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
demo.launch()