import json
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
def download_file(url, save_path):
    response = requests.get(url, stream=True)  # stream the download to save memory
    response.raise_for_status()  # raise an exception on HTTP errors
    with open(save_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    print(f"File saved to {save_path}!")

# Usage example
download_file('https://huggingface.co/datasets/Yuchan5386/AI-hub-SummaryData/resolve/main/g1.jsonl?download=true', 'data.jsonl')
file_path = 'data.jsonl'
# 1. Load the conversation data (extract every human-gpt pair)
def load_conversations(file_path):
    conversations = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            conv = data.get('conversations', [])
            prev = None
            for c in conv:
                if c.get('from') == 'human':
                    prev = c['value']  # remember the latest human turn
                elif c.get('from') == 'gpt' and prev:
                    conversations.append((prev, c['value']))  # pair it with the gpt reply
                    prev = None
    return conversations
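# Each JSONL line is expected to look roughly like the sketch below (the shape is
# inferred from the accessors above; the actual g1.jsonl fields may differ):
# {"conversations": [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]}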
# 2. Set Korean stopwords + prepare the TF-IDF vectorizer
def build_vectorizer(questions):
    korean_stopwords = ['그리고', '그러나', '하지만', '또한', '이런', '저런', '그런', '아닌', '있는', '것', '수', '때문에', '해서']
    vectorizer = TfidfVectorizer(stop_words=korean_stopwords)
    tfidf_matrix = vectorizer.fit_transform(questions)  # one row per stored question
    return vectorizer, tfidf_matrix
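# Note: TfidfVectorizer's default token_pattern keeps tokens of two or more word
# characters, so Korean text is split on whitespace/punctuation rather than into
# morphemes; a morphological tokenizer could be passed via the tokenizer= argument.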
# 3. Find the top-k responses by cosine similarity
def find_top_responses(user_input, conversations, vectorizer, tfidf_matrix, top_k=3):
    user_vec = vectorizer.transform([user_input])
    sims = cosine_similarity(user_vec, tfidf_matrix)[0]
    top_indices = sims.argsort()[-top_k:][::-1]  # indices of the k most similar questions
    results = ""
    for i, idx in enumerate(top_indices, 1):
        answer = conversations[idx][1]
        score = sims[idx]
        results += f"{i}. {answer} (similarity: {score:.2f})\n\n"
    return results.strip()
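# Example (sketch, using the objects initialized just below):
#   find_top_responses("챗봇 어떻게 만들어요?", convs, vectorizer, tfidf_matrix)
# returns a numbered string with the three closest stored answers and their scores.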
# Initialization
convs = load_conversations(file_path)
questions = [q for q, _ in convs]
vectorizer, tfidf_matrix = build_vectorizer(questions)
# Gradio interface function
def chatbot_interface(user_input):
    if user_input.strip().lower() == "exit":
        return "ChatBot: Bye~ come back soon!"
    return find_top_responses(user_input, convs, vectorizer, tfidf_matrix)
# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## ChatBot (TF-IDF based Top-3 answer recommendations)")
    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(label="Enter your question!", placeholder="e.g. 챗봇 어떻게 만들어요?")
            submit_btn = gr.Button("Ask Toki 🧠")
        with gr.Column():
            output_box = gr.Textbox(label="Toki's recommended answers (Top-3)", lines=10)
    submit_btn.click(fn=chatbot_interface, inputs=input_box, outputs=output_box)

demo.launch()