import json

import gradio as gr
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def download_file(url, save_path, timeout=30):
    """Download ``url`` to ``save_path`` in streamed 8 KiB chunks.

    Args:
        url: HTTP(S) address of the file to fetch.
        save_path: Local path the response body is written to (binary mode).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang start-up forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Stream the body to keep memory use low; the ``with`` block releases the
    # connection even when writing fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # raise on HTTP error statuses
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    print(f"파일이 {save_path}에 저장되었어요!")


# Fetch the dataset at start-up.
file_path = 'data.jsonl'
download_file(
    'https://huggingface.co/datasets/Yuchan5386/AI-hub-SummaryData/resolve/main/g1.jsonl?download=true',
    file_path,
)


# 1. Load conversation data (extract every human -> gpt pair).
def load_conversations(file_path):
    """Read a JSONL file and return a list of ``(question, answer)`` tuples.

    Each non-blank line is parsed as a JSON object whose ``conversations``
    entry is a list of turns with ``from`` and ``value`` keys. A ``human``
    turn is remembered and paired with the next ``gpt`` turn that follows it.

    Blank lines are skipped, and turns that lack a ``value`` are ignored
    instead of raising ``KeyError``.
    """
    conversations = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A trailing or stray empty line is not valid JSON.
                continue
            data = json.loads(line)
            prev = None
            for turn in data.get('conversations', []):
                speaker = turn.get('from')
                if speaker == 'human':
                    prev = turn.get('value')
                elif speaker == 'gpt' and prev:
                    answer = turn.get('value')
                    if answer is not None:
                        conversations.append((prev, answer))
                    # The pending question is consumed either way.
                    prev = None
    return conversations


# 2. Korean stop words + TF-IDF vectorizer setup.
def build_vectorizer(questions):
    """Fit a TF-IDF vectorizer on ``questions``.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted
        ``TfidfVectorizer`` and the matrix produced by ``fit_transform``.
    """
    korean_stopwords = [
        '그리고', '그러나', '하지만', '또한', '이런', '저런', '그런',
        '있는', '없는', '것', '수', '때문에', '해서',
    ]
    vectorizer = TfidfVectorizer(stop_words=korean_stopwords)
    tfidf_matrix = vectorizer.fit_transform(questions)
    return vectorizer, tfidf_matrix


# 3. Similarity-based top-k answer lookup.
def find_top_responses(user_input, conversations, vectorizer, tfidf_matrix, top_k=3):
    """Return the answers whose questions are most similar to ``user_input``.

    Args:
        user_input: The query text.
        conversations: Sequence of ``(question, answer)`` pairs, in the same
            order as the rows of ``tfidf_matrix``.
        vectorizer: Fitted vectorizer used to transform the query.
        tfidf_matrix: TF-IDF matrix of the stored questions.
        top_k: Maximum number of answers to return.

    Returns:
        A string with one numbered entry per answer, each followed by its
        cosine similarity to two decimals, separated by blank lines. Empty
        when ``top_k`` is not positive.
    """
    if top_k <= 0:
        # ``[-0:]`` would select every row rather than none.
        return ""

    user_vec = vectorizer.transform([user_input])
    sims = cosine_similarity(user_vec, tfidf_matrix)[0]
    # argsort is ascending: take the last ``top_k`` and flip to best-first.
    top_indices = sims.argsort()[-top_k:][::-1]

    entries = [
        f"{rank}. {conversations[idx][1]} (유사도: {sims[idx]:.2f})"
        for rank, idx in enumerate(top_indices, 1)
    ]
    return "\n\n".join(entries).strip()


# Initialisation: build the retrieval index once at start-up.
convs = load_conversations(file_path)
questions = [q for q, _ in convs]
vectorizer, tfidf_matrix = build_vectorizer(questions)


# Gradio interface function.
def chatbot_interface(user_input):
    """Handle one UI request: a farewell for "종료", otherwise top answers."""
    # Guard against a missing value so ``.strip()`` cannot fail.
    text = user_input or ""
    if text.strip().lower() == "종료":
        return "ChatBot: 안녕~ 또 놀러와!"
    return find_top_responses(text, convs, vectorizer, tfidf_matrix)


# Gradio UI layout.
with gr.Blocks() as demo:
    gr.Markdown("## ChatBot (TF-IDF 기반 Top-3 답변 추천)")
    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(label="질문을 입력해줘!", placeholder="예: 챗봇 어떻게 만들어요?")
            submit_btn = gr.Button("Toki에게 물어보기 🧠")
        with gr.Column():
            output_box = gr.Textbox(label="Toki의 추천 답변 Top-3", lines=10)
    submit_btn.click(fn=chatbot_interface, inputs=input_box, outputs=output_box)

demo.launch()