dasomaru committed on
Commit 2a32abb · verified · 1 Parent(s): eaf5ea5

Update app.py

Files changed (1)
1. app.py (+6 −18)
app.py CHANGED
@@ -2,20 +2,21 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from retriever.vectordb_rerank import search_documents  # 🧠 Load the RAG retriever
+from retriever.vectordb import search_documents  # 🧠 Load the RAG retriever
 
 model_name = "dasomaru/gemma-3-4bit-it-demo"
 
 @spaces.GPU(duration=300)
 def generate_response(query):
-    # Load the model and tokenizer (after the ZeroGPU reservation)
+    # 🚀 Loaded on every call, inside generate_response
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         torch_dtype=torch.float16,
+        device_map="auto",  # ✅ Important: assigns the GPU automatically
         trust_remote_code=True,
-    ).to("cuda")
-
+    )
+
     # 1. Retrieval
     top_k = 5
     retrieved_docs = search_documents(query, top_k=top_k)
@@ -31,7 +32,7 @@ def generate_response(query):
     prompt += f"[질문]\n{query}\n\n[답변]\n"
 
     # 3. Generate the answer
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # ✅ model.device
     outputs = model.generate(
         **inputs,
         max_new_tokens=512,
@@ -41,20 +42,7 @@
         do_sample=True,
     )
 
-    # 4. Return the result
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-# Gradio app
 demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
 demo.launch()
-
-# zero = torch.Tensor([0]).cuda()
-# print(zero.device)  # <-- 'cpu' 🤔
-
-# @spaces.GPU
-# def greet(n):
-#     print(zero.device)  # <-- 'cuda:0' 🤗
-#     return f"Hello {zero + n} Tensor"
-
-# demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
-# demo.launch()
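Why the change works: on a ZeroGPU Space, a GPU is attached only while a `@spaces.GPU` function is executing, so a module-level `.to("cuda")` runs before any GPU exists; the deleted comment block above documents exactly that (`zero.device` prints 'cpu' at module level but 'cuda:0' inside the decorated function). Loading inside the decorated function with `device_map="auto"` lets `accelerate` (which this option requires) place the weights on whatever device is attached, and `.to(model.device)` keeps the inputs on that same device. A minimal sketch of the resulting pattern, with names taken from the diff, an illustration rather than the full app:

```python
# Minimal sketch of the device-placement pattern after this commit,
# assuming a ZeroGPU Space: CUDA exists only inside @spaces.GPU calls.
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "dasomaru/gemma-3-4bit-it-demo"

@spaces.GPU(duration=300)
def run(prompt: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",  # accelerate places weights on the attached GPU
        trust_remote_code=True,
    )
    # Follow the weights instead of hard-coding "cuda": the same code
    # then also runs on CPU-only hardware.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```

The trade-off the 🚀 comment flags is real: the checkpoint is re-instantiated on every call. The loaders do hit the local Hugging Face cache after the first download, but a module-level cache of the model object would avoid rebuilding the weights each time.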
 
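The import also moves from `retriever.vectordb_rerank` to `retriever.vectordb`; neither module is part of this commit, so only the call shape `search_documents(query, top_k=top_k)` is visible. For orientation, a hypothetical sketch of that interface using sentence-transformers plus FAISS; the encoder, file paths, and implementation are assumptions, not the repo's code:

```python
# Hypothetical retriever/vectordb.py — NOT this repo's actual module, just
# one plausible implementation of the search_documents(query, top_k) call
# that app.py makes. Encoder name and index/corpus paths are assumptions.
import json

import faiss
from sentence_transformers import SentenceTransformer

_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
_index = faiss.read_index("index/docs.faiss")  # assumed index file
with open("index/docs.json", encoding="utf-8") as f:
    _docs = json.load(f)  # assumed list of texts aligned with index rows

def search_documents(query: str, top_k: int = 5) -> list[str]:
    # Embed the query, take the top_k nearest rows, return their texts.
    query_vec = _encoder.encode([query], normalize_embeddings=True)
    _, indices = _index.search(query_vec, top_k)
    return [_docs[i] for i in indices[0]]
```

The dropped `vectordb_rerank` variant presumably added a reranking pass behind the same interface, but the commit gives no detail beyond the import swap.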