# TF-IDF retrieval chatbot (Gradio Space): downloads a Korean Q&A JSONL
# dataset and recommends the top-3 stored answers for a user question.
import json

import gradio as gr
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def download_file(url, save_path, timeout=30):
    """Download *url* to *save_path*, streaming in chunks to limit memory use.

    Parameters:
        url: source URL to fetch.
        save_path: local file path to write the response body to.
        timeout: seconds to wait for the server before giving up
            (new parameter; defaults to 30 so existing callers are unaffected —
            the original call had no timeout and could hang indefinitely).

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
        requests.Timeout: when the server does not respond within *timeout*.
    """
    # stream=True avoids loading the whole body into memory at once
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()  # fail fast on HTTP errors
    with open(save_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    print(f"파일이 {save_path}에 저장되었어요!")
# ์ฌ์ฉ ์์ | |
download_file('https://huggingface.co/datasets/Yuchan5386/AI-hub-SummaryData/resolve/main/g1.jsonl?download=true', 'data.jsonl') | |
file_path = 'data.jsonl' | |
# 1. ๋ํ ๋ฐ์ดํฐ ๋ถ๋ฌ์ค๊ธฐ (๋ชจ๋ human-gpt ์ ์ถ์ถ) | |
def load_conversations(file_path): | |
conversations = [] | |
with open(file_path, 'r', encoding='utf-8') as f: | |
for line in f: | |
data = json.loads(line) | |
conv = data.get('conversations', []) | |
prev = None | |
for c in conv: | |
if c.get('from') == 'human': | |
prev = c['value'] | |
elif c.get('from') == 'gpt' and prev: | |
conversations.append((prev, c['value'])) | |
prev = None | |
return conversations | |
# 2. ํ๊ตญ์ด ๋ถ์ฉ์ด ์ค์ + TF-IDF ๋ฒกํฐ๋ผ์ด์ ์ค๋น | |
def build_vectorizer(questions): | |
korean_stopwords = ['๊ทธ๋ฆฌ๊ณ ', '๊ทธ๋ฌ๋', 'ํ์ง๋ง', '๋ํ', '์ด๋ฐ', '์ ๋ฐ', '๊ทธ๋ฐ', '์๋', '์๋', '๊ฒ', '์', '๋๋ฌธ์', 'ํด์'] | |
vectorizer = TfidfVectorizer(stop_words=korean_stopwords) | |
tfidf_matrix = vectorizer.fit_transform(questions) | |
return vectorizer, tfidf_matrix | |
# 3. ์ ์ฌ๋ ๊ธฐ๋ฐ Top-k ์๋ต ์ฐพ๊ธฐ | |
def find_top_responses(user_input, conversations, vectorizer, tfidf_matrix, top_k=3): | |
user_vec = vectorizer.transform([user_input]) | |
sims = cosine_similarity(user_vec, tfidf_matrix)[0] | |
top_indices = sims.argsort()[-top_k:][::-1] | |
results = "" | |
for i, idx in enumerate(top_indices, 1): | |
answer = conversations[idx][1] | |
score = sims[idx] | |
results += f"{i}. {answer} (์ ์ฌ๋: {score:.2f})\n\n" | |
return results.strip() | |
# ์ด๊ธฐํ | |
convs = load_conversations(file_path) | |
questions = [q for q, _ in convs] | |
vectorizer, tfidf_matrix = build_vectorizer(questions) | |
# Gradio ์ธํฐํ์ด์ค ํจ์ | |
def chatbot_interface(user_input): | |
if user_input.strip().lower() == "์ข ๋ฃ": | |
return "ChatBot: ์๋ ~ ๋ ๋๋ฌ์!" | |
return find_top_responses(user_input, convs, vectorizer, tfidf_matrix) | |
# Gradio UI ๊ตฌ์ฑ | |
with gr.Blocks() as demo: | |
gr.Markdown("## ChatBot (TF-IDF ๊ธฐ๋ฐ Top-3 ๋ต๋ณ ์ถ์ฒ)") | |
with gr.Row(): | |
with gr.Column(): | |
input_box = gr.Textbox(label="์ง๋ฌธ์ ์ ๋ ฅํด์ค!", placeholder="์: ์ฑ๋ด ์ด๋ป๊ฒ ๋ง๋ค์ด์?") | |
submit_btn = gr.Button("Toki์๊ฒ ๋ฌผ์ด๋ณด๊ธฐ ๐ง ") | |
with gr.Column(): | |
output_box = gr.Textbox(label="Toki์ ์ถ์ฒ ๋ต๋ณ Top-3", lines=10) | |
submit_btn.click(fn=chatbot_interface, inputs=input_box, outputs=output_box) | |
demo.launch() |