# Wikipedia summary helpers

# Korean postpositional particles removed from the end of words when building
# a page title. They are applied one after another in this order, so a later
# entry can strip what an earlier pass exposed (e.g. "...에서는" -> "...에서"
# is NOT re-examined for "는", but "...는" followed by "에서" is).
_PARTICLES = ('이', '가', '은', '는', '을', '를', '의', '에서', '에게', '한테', '보다')

# Message returned whenever the Wikipedia lookup cannot be completed.
_WIKI_FETCH_FAILED = "위키백과에서 정보를 가져올 수 없습니다."

# Upper bound (seconds) for the HTTP request so a stalled connection cannot
# block the caller forever.
_WIKI_TIMEOUT_SECONDS = 10


def extract_main_query(text):
    """Reduce free-form user text to a bare search phrase.

    Takes the last non-empty sentence of *text* (sentences are delimited by
    ``.``, ``?`` or ``!``), drops every character that is not Hangul, an
    ASCII letter, a digit or a space, and strips trailing Korean particles
    from each word.

    Args:
        text: Raw user input.

    Returns:
        The cleaned phrase, or *text* unchanged when it contains no
        non-empty sentence (e.g. an empty string or only punctuation).
    """
    sentences = [s.strip() for s in re.split(r'[.?!]\s*', text) if s.strip()]
    if not sentences:
        return text

    last = re.sub(r'[^가-힣a-zA-Z0-9 ]', '', sentences[-1])
    for particle in _PARTICLES:
        # Require at least one character before the particle so a word that
        # consists solely of the particle is left alone.
        last = re.sub(rf'\b(\w+){re.escape(particle)}\b', r'\1', last)
    return last.strip()


def get_wikipedia_summary(query):
    """Fetch the summary extract of a Korean Wikipedia page.

    The query is first reduced with :func:`extract_main_query` and then used
    as the page title for the REST ``page/summary`` endpoint.

    Args:
        query: Raw user input describing the page to look up.

    Returns:
        The page's ``extract`` text; a "summary not found" message when the
        response has no ``extract`` field; or a "could not fetch" message
        when the query is empty, the request fails, the response is not
        HTTP 200, or the body is not valid JSON.
    """
    from urllib.parse import quote

    cleaned_query = extract_main_query(query)
    if not cleaned_query.strip():
        # Nothing to look up; avoid a pointless request to ".../summary/".
        return _WIKI_FETCH_FAILED

    # Percent-encode the title as a single path segment: the raw-text
    # fallback of extract_main_query may contain "/", "?" or "#".
    title = quote(cleaned_query, safe='')
    url = f"https://ko.wikipedia.org/api/rest_v1/page/summary/{title}"

    try:
        res = requests.get(url, timeout=_WIKI_TIMEOUT_SECONDS)
    except requests.RequestException:
        return _WIKI_FETCH_FAILED

    if res.status_code != 200:
        return _WIKI_FETCH_FAILED

    try:
        payload = res.json()
    except ValueError:
        # Body was not valid JSON.
        return _WIKI_FETCH_FAILED
    return payload.get("extract", "요약 정보를 찾을 수 없습니다.")


def _pagerank(matrix, damping=0.85, max_iter=100, tol=1e-4):
    """Run power-iteration PageRank over a weighted adjacency matrix.

    Args:
        matrix: Square array where ``matrix[j, i]`` is the edge weight from
            node ``j`` to node ``i``.
        damping: Damping factor.
        max_iter: Maximum number of iterations.
        tol: Stop once the L2 norm of the change in ranks drops below this.

    Returns:
        1-D array with one score per node.
    """
    n = matrix.shape[0]
    ranks = np.ones(n) / n
    row_sums = np.sum(matrix, axis=1)
    # Nodes with no outgoing weight would divide by zero; their row is all
    # zeros anyway, so any non-zero divisor yields a zero contribution.
    row_sums[row_sums == 0] = 1

    for _ in range(max_iter):
        prev_ranks = ranks
        # Each node shares its rank among its neighbours in proportion to
        # the edge weights; one mat-vec product updates all nodes at once.
        ranks = (1 - damping) / n + damping * (matrix.T @ (prev_ranks / row_sums))
        if np.linalg.norm(ranks - prev_ranks) < tol:
            break
    return ranks


def textrank_summarize(text, top_n=3):
    """Pick the *top_n* most central sentences of *text* (TextRank).

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace;
    those of 10 characters or fewer are ignored. Remaining sentences are
    scored with PageRank over their TF-IDF cosine-similarity graph, and the
    best ones are returned in their original order.

    Args:
        text: Text to summarize.
        top_n: Number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, or *text* unchanged
        when it has at most *top_n* usable sentences or no sentence yields
        any TF-IDF term.
    """
    sentences = [
        s.strip()
        for s in re.split(r'(?<=[.!?])\s+', text.strip())
        if len(s.strip()) > 10
    ]
    if len(sentences) <= top_n:
        return text

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary (e.g. only
        # one-character tokens); there is nothing to rank in that case.
        return text

    sim_matrix = cosine_similarity(tfidf_matrix)
    # A sentence must not vote for itself.
    np.fill_diagonal(sim_matrix, 0)

    scores = _pagerank(sim_matrix)
    ranked_idx = np.argsort(scores)[::-1]
    # Restore document order so the summary reads naturally.
    selected_idx = sorted(ranked_idx[:top_n])
    return ' '.join(sentences[i] for i in selected_idx)


def summarize_from_wikipedia(query, top_n=3):
    """Look up *query* on Korean Wikipedia and summarize the result.

    Args:
        query: Raw user input describing the page to look up.
        top_n: Maximum number of sentences in the summary.

    Returns:
        The TextRank summary of the page extract, or the message produced
        by :func:`get_wikipedia_summary` when the lookup fails (such short
        messages pass through the summarizer unchanged).
    """
    raw_summary = get_wikipedia_summary(query)
    # A single pass suffices: the summarizer's output has at most top_n
    # sentences, so summarizing it again with the same top_n returns it
    # unchanged.
    return textrank_summarize(raw_summary, top_n=top_n)