File size: 3,184 Bytes
b2c8ad4
2f31f50
b2c8ad4
 
2f31f50
172af21
 
 
 
 
 
 
 
 
 
 
 
 
c373fa5
172af21
2f31f50
b2c8ad4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f31f50
b2c8ad4
 
 
 
 
 
2f31f50
b2c8ad4
 
 
 
 
 
 
 
 
 
 
2f31f50
b2c8ad4
 
 
 
2f31f50
b2c8ad4
 
 
 
 
2f31f50
b2c8ad4
 
 
 
 
 
 
 
 
2f31f50
b2c8ad4
2f31f50
b2c8ad4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import json
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import requests

def download_file(url, save_path, timeout=30):
    """Download *url* to *save_path*, streaming in chunks to limit memory use.

    Parameters
    ----------
    url : str
        URL to fetch.
    save_path : str
        Local path the response body is written to.
    timeout : float, optional
        Connect/read timeout in seconds passed to ``requests.get``.
        New backward-compatible parameter: the original call had no
        timeout and could block forever on a stalled connection.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    # stream=True: iterate the body in chunks instead of loading it whole
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()  # fail loudly on HTTP errors

    with open(save_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    print(f"파일이 {save_path}에 저장되었어요!")

# ์‚ฌ์šฉ ์˜ˆ์‹œ
download_file('https://huggingface.co/datasets/Yuchan5386/AI-hub-SummaryData/resolve/main/g1.jsonl?download=true', 'data.jsonl')
file_path = 'data.jsonl'

# 1. ๋Œ€ํ™” ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ (๋ชจ๋“  human-gpt ์Œ ์ถ”์ถœ)
def load_conversations(file_path):
    conversations = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            conv = data.get('conversations', [])
            prev = None
            for c in conv:
                if c.get('from') == 'human':
                    prev = c['value']
                elif c.get('from') == 'gpt' and prev:
                    conversations.append((prev, c['value']))
                    prev = None
    return conversations

# 2. ํ•œ๊ตญ์–ด ๋ถˆ์šฉ์–ด ์„ค์ • + TF-IDF ๋ฒกํ„ฐ๋ผ์ด์ € ์ค€๋น„
def build_vectorizer(questions):
    korean_stopwords = ['๊ทธ๋ฆฌ๊ณ ', '๊ทธ๋Ÿฌ๋‚˜', 'ํ•˜์ง€๋งŒ', '๋˜ํ•œ', '์ด๋Ÿฐ', '์ €๋Ÿฐ', '๊ทธ๋Ÿฐ', '์žˆ๋Š”', '์—†๋Š”', '๊ฒƒ', '์ˆ˜', '๋•Œ๋ฌธ์—', 'ํ•ด์„œ']
    vectorizer = TfidfVectorizer(stop_words=korean_stopwords)
    tfidf_matrix = vectorizer.fit_transform(questions)
    return vectorizer, tfidf_matrix

# 3. ์œ ์‚ฌ๋„ ๊ธฐ๋ฐ˜ Top-k ์‘๋‹ต ์ฐพ๊ธฐ
def find_top_responses(user_input, conversations, vectorizer, tfidf_matrix, top_k=3):
    user_vec = vectorizer.transform([user_input])
    sims = cosine_similarity(user_vec, tfidf_matrix)[0]
    top_indices = sims.argsort()[-top_k:][::-1]
    results = ""
    for i, idx in enumerate(top_indices, 1):
        answer = conversations[idx][1]
        score = sims[idx]
        results += f"{i}. {answer} (์œ ์‚ฌ๋„: {score:.2f})\n\n"
    return results.strip()

# ์ดˆ๊ธฐํ™”
convs = load_conversations(file_path)
questions = [q for q, _ in convs]
vectorizer, tfidf_matrix = build_vectorizer(questions)

# Gradio ์ธํ„ฐํŽ˜์ด์Šค ํ•จ์ˆ˜
def chatbot_interface(user_input):
    if user_input.strip().lower() == "์ข…๋ฃŒ":
        return "ChatBot: ์•ˆ๋…•~ ๋˜ ๋†€๋Ÿฌ์™€!"
    return find_top_responses(user_input, convs, vectorizer, tfidf_matrix)

# Gradio UI ๊ตฌ์„ฑ
with gr.Blocks() as demo:
    gr.Markdown("##  ChatBot (TF-IDF ๊ธฐ๋ฐ˜ Top-3 ๋‹ต๋ณ€ ์ถ”์ฒœ)")
    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(label="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•ด์ค˜!", placeholder="์˜ˆ: ์ฑ—๋ด‡ ์–ด๋–ป๊ฒŒ ๋งŒ๋“ค์–ด์š”?")
            submit_btn = gr.Button("Toki์—๊ฒŒ ๋ฌผ์–ด๋ณด๊ธฐ ๐Ÿง ")
        with gr.Column():
            output_box = gr.Textbox(label="Toki์˜ ์ถ”์ฒœ ๋‹ต๋ณ€ Top-3", lines=10)

    submit_btn.click(fn=chatbot_interface, inputs=input_box, outputs=output_box)

demo.launch()