dasomaru committed on
Commit eaf5ea5 · verified
1 Parent(s): ea7fd0e

Update app.py

Files changed (1)
  1. app.py +36 -21
app.py CHANGED
@@ -2,37 +2,52 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from retriever.vectordb_rerank import search_documents  # 🧠 load the RAG retriever
 
 model_name = "dasomaru/gemma-3-4bit-it-demo"
 
-# 🚀 the tokenizer can be preloaded even on the CPU
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-# 🚀 load the model onto the CPU only for now (no GPU yet)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,  # because it is a 4-bit model
-    trust_remote_code=True,
-)
-
 @spaces.GPU(duration=300)
-def generate_response(prompt):
-    # load the model and tokenizer inside the function
-    tokenizer = AutoTokenizer.from_pretrained("dasomaru/gemma-3-4bit-it-demo")
-    model = AutoModelForCausalLM.from_pretrained("dasomaru/gemma-3-4bit-it-demo")
-    model.to("cuda")
-
+def generate_response(query):
+    # Load the model and tokenizer (after the ZeroGPU reservation)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        trust_remote_code=True,
+    ).to("cuda")
+
+    # 1. Retrieve supporting documents
+    top_k = 5
+    retrieved_docs = search_documents(query, top_k=top_k)
+
+    # 2. Assemble the prompt
+    prompt = (
+        "You are an expert question writer for the Korean licensed real estate agent exam.\n\n"
+        "The following are past exam questions and related statutes:\n"
+    )
+    for idx, doc in enumerate(retrieved_docs, 1):
+        prompt += f"- {doc}\n"
+    prompt += "\nPlease answer the user's request based on this information.\n\n"
+    prompt += f"[Question]\n{query}\n\n[Answer]\n"
+
+    # 3. Generate the answer
     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.7,
-        top_p=0.9,
-        top_k=50,
-        do_sample=True,)
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=512,
+        temperature=0.7,
+        top_p=0.9,
+        top_k=50,
+        do_sample=True,
+    )
+
+    # 4. Return the decoded result
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
+# Gradio app
 demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
 demo.launch()
 
-
-
 # zero = torch.Tensor([0]).cuda()
 # print(zero.device)  # <-- 'cpu' 🤔
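
Note: the newly imported `retriever.vectordb_rerank.search_documents` is not part of this commit, so its interface has to be inferred from how it is called (`search_documents(query, top_k=top_k)` returning an iterable of document strings). Below is a minimal sketch of what such a module might look like, assuming dense retrieval over an in-memory corpus followed by cross-encoder reranking; the module path and function name come from the diff, while the corpus, the embedding and reranker model names, and the candidate-pool size are placeholder assumptions, not the Space's actual implementation.

```python
# retriever/vectordb_rerank.py -- hypothetical sketch, NOT the module shipped with this Space.
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np

# Placeholder corpus; the real Space presumably loads a prebuilt index of
# past exam questions and statute excerpts from disk or a vector database.
_CORPUS = [
    "Placeholder past-exam question or statute excerpt #1",
    "Placeholder past-exam question or statute excerpt #2",
]

_embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # placeholder model
_reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")           # placeholder model
_corpus_emb = _embedder.encode(_CORPUS, normalize_embeddings=True)

def search_documents(query: str, top_k: int = 5) -> list[str]:
    """Dense retrieval followed by cross-encoder reranking (assumed behavior)."""
    query_emb = _embedder.encode([query], normalize_embeddings=True)[0]
    # Embeddings are normalized, so a dot product is cosine similarity.
    scores = _corpus_emb @ query_emb
    candidate_idx = np.argsort(-scores)[: max(top_k * 4, top_k)]
    # Rerank the candidate pool with the cross-encoder and keep the top_k.
    pairs = [(query, _CORPUS[i]) for i in candidate_idx]
    rerank_scores = _reranker.predict(pairs)
    order = np.argsort(-np.asarray(rerank_scores))[:top_k]
    return [_CORPUS[candidate_idx[i]] for i in order]
```

The two-stage retrieve-then-rerank shape matches the module name (`vectordb_rerank`): a cheap vector search narrows the corpus to a small candidate pool, and a more expensive cross-encoder orders that pool before the top_k passages are spliced into the prompt.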