TestChatbot / app.py
import json
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
def download_file(url, save_path):
    response = requests.get(url, stream=True)  # stream the download to save memory
    response.raise_for_status()  # raise an exception on HTTP errors
    with open(save_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    print(f"File saved to {save_path}!")
# Example usage
download_file('https://huggingface.co/datasets/Yuchan5386/AI-hub-SummaryData/resolve/main/g1.jsonl?download=true', 'data.jsonl')
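# Optional sketch (not part of the original flow): on restarts it may be worth skipping
# the download when the file is already present, e.g. with a plain stdlib check:
#   import os
#   if not os.path.exists('data.jsonl'):
#       download_file('https://huggingface.co/datasets/Yuchan5386/AI-hub-SummaryData/resolve/main/g1.jsonl?download=true', 'data.jsonl')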
file_path = 'data.jsonl'
# 1. Load conversation data (extract every human-gpt pair)
def load_conversations(file_path):
    conversations = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            conv = data.get('conversations', [])
            prev = None
            for c in conv:
                if c.get('from') == 'human':
                    prev = c['value']
                elif c.get('from') == 'gpt' and prev:
                    conversations.append((prev, c['value']))
                    prev = None
    return conversations
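# For reference, each JSONL line is expected to look roughly like this (inferred from
# the keys read above; the actual dataset may carry extra fields):
#   {"conversations": [{"from": "human", "value": "..."},
#                      {"from": "gpt",   "value": "..."}]}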
# 2. Set up Korean stopwords + build the TF-IDF vectorizer
def build_vectorizer(questions):
    korean_stopwords = ['๊ทธ๋ฆฌ๊ณ ', '๊ทธ๋Ÿฌ๋‚˜', 'ํ•˜์ง€๋งŒ', '๋˜ํ•œ', '์ด๋Ÿฐ', '์ €๋Ÿฐ', '๊ทธ๋Ÿฐ', '์žˆ๋Š”', '์—†๋Š”', '๊ฒƒ', '์ˆ˜', '๋•Œ๋ฌธ์—', 'ํ•ด์„œ']
    vectorizer = TfidfVectorizer(stop_words=korean_stopwords)
    tfidf_matrix = vectorizer.fit_transform(questions)
    return vectorizer, tfidf_matrix
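# Note: with the default token_pattern, TfidfVectorizer splits text on word boundaries,
# so Korean ends up tokenized as space-separated chunks rather than morphemes. Passing a
# morpheme analyzer via the tokenizer= parameter could improve matching, but that is an
# optional extension, not something this app does.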
# 3. Find the top-k responses by cosine similarity
def find_top_responses(user_input, conversations, vectorizer, tfidf_matrix, top_k=3):
    user_vec = vectorizer.transform([user_input])
    sims = cosine_similarity(user_vec, tfidf_matrix)[0]
    top_indices = sims.argsort()[-top_k:][::-1]
    results = ""
    for i, idx in enumerate(top_indices, 1):
        answer = conversations[idx][1]
        score = sims[idx]
        results += f"{i}. {answer} (์œ ์‚ฌ๋„: {score:.2f})\n\n"
    return results.strip()
# Initialization
convs = load_conversations(file_path)
questions = [q for q, _ in convs]
vectorizer, tfidf_matrix = build_vectorizer(questions)
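# Quick sanity check (optional; the query below is just a hypothetical example):
#   print(find_top_responses("์ฑ—๋ด‡ ์–ด๋–ป๊ฒŒ ๋งŒ๋“ค์–ด์š”?", convs, vectorizer, tfidf_matrix))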
# Gradio ์ธํ„ฐํŽ˜์ด์Šค ํ•จ์ˆ˜
def chatbot_interface(user_input):
    if user_input.strip().lower() == "์ข…๋ฃŒ":
        return "ChatBot: ์•ˆ๋…•~ ๋˜ ๋†€๋Ÿฌ์™€!"
    return find_top_responses(user_input, convs, vectorizer, tfidf_matrix)
# Gradio UI layout
with gr.Blocks() as demo:
    gr.Markdown("## ChatBot (TF-IDF ๊ธฐ๋ฐ˜ Top-3 ๋‹ต๋ณ€ ์ถ”์ฒœ)")
    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(label="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•ด์ค˜!", placeholder="์˜ˆ: ์ฑ—๋ด‡ ์–ด๋–ป๊ฒŒ ๋งŒ๋“ค์–ด์š”?")
            submit_btn = gr.Button("Toki์—๊ฒŒ ๋ฌผ์–ด๋ณด๊ธฐ ๐Ÿง ")
        with gr.Column():
            output_box = gr.Textbox(label="Toki์˜ ์ถ”์ฒœ ๋‹ต๋ณ€ Top-3", lines=10)
    submit_btn.click(fn=chatbot_interface, inputs=input_box, outputs=output_box)
demo.launch()