Yuchan5386 committed on
Commit
34115e3
·
verified ·
1 Parent(s): b0c813a

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +51 -1
api.py CHANGED
@@ -215,6 +215,17 @@ def extract_main_query(text):
215
  last = re.sub(rf'\b(\w+){p}\b', r'\1', last)
216
  return last.strip()
217
 
 
 
 
 
 
 
 
 
 
 
 
218
  def get_wikipedia_summary(query):
219
  cleaned_query = extract_main_query(query)
220
  url = f"https://ko.wikipedia.org/api/rest_v1/page/summary/{cleaned_query}"
@@ -224,6 +235,45 @@ def get_wikipedia_summary(query):
224
  else:
225
  return "위키백과에서 정보를 가져올 수 없습니다."
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  def simple_intent_classifier(text):
228
  text = text.lower()
229
  greet_keywords = ["안녕", "반가워", "이름", "누구", "소개", "어디서 왔", "정체", "몇 살", "너 뭐야"]
@@ -268,7 +318,7 @@ def respond(input_text):
268
  keyword = re.sub(r"(에 대해|에 대한|에 대해서)?\s*(설명해줘|알려줘|뭐야|개념|정의|정보)?", "", input_text).strip()
269
  if not keyword:
270
  return "어떤 주제에 대해 궁금한가요?"
271
- summary = get_wikipedia_summary(keyword)
272
  return f"{summary}\n다른 궁금한 점 있으신가요?"
273
 
274
  return generate_text_topkp(model, input_text)
 
215
  last = re.sub(rf'\b(\w+){p}\b', r'\1', last)
216
  return last.strip()
217
 
218
+ import re
219
+ import requests
220
+ import numpy as np
221
+ from sklearn.feature_extraction.text import TfidfVectorizer
222
+ from sklearn.metrics.pairwise import cosine_similarity
223
+
224
+ # 1. Extract the core query from the input (drops whitespace and punctuation; letters, digits, underscores and Hangul are kept)
225
def extract_main_query(text):
    """Return ``text`` reduced to its word characters.

    Leading and trailing whitespace is trimmed, then every character that is
    neither a regex word character (letters, digits, underscore) nor a Hangul
    syllable is removed. Inner whitespace is removed as well, so a multi-word
    query comes back as a single concatenated token.
    """
    trimmed = text.strip()
    non_word = re.compile(r'[^\w가-힣]')
    return non_word.sub('', trimmed)
227
+
228
+ # 2. 위키백과 요약 가져오기
229
  def get_wikipedia_summary(query):
230
  cleaned_query = extract_main_query(query)
231
  url = f"https://ko.wikipedia.org/api/rest_v1/page/summary/{cleaned_query}"
 
235
  else:
236
  return "위키백과에서 정보를 가져올 수 없습니다."
237
 
238
+ # 3. TextRank 요약기
239
def _pagerank(matrix, damping=0.85, max_iter=100, tol=1e-4):
    """Score the nodes of a weighted graph with power-iteration PageRank.

    Args:
        matrix: Square NumPy array of edge weights (here: sentence similarity).
        damping: Probability of following an edge instead of teleporting.
        max_iter: Upper bound on the number of update sweeps.
        tol: Stop once the L2 change between two sweeps falls below this.

    Returns:
        A 1-D array with one score per node.
    """
    n = matrix.shape[0]
    ranks = np.ones(n) / n
    row_sums = matrix.sum(axis=1)
    row_sums[row_sums == 0] = 1  # isolated nodes: avoid 0/0 producing NaN
    teleport = (1 - damping) / n
    for _ in range(max_iter):
        prev_ranks = ranks
        # One matrix-vector product updates every node at once; entry i is
        # sum_j matrix[j, i] * prev_ranks[j] / row_sums[j].
        ranks = teleport + damping * (matrix.T @ (prev_ranks / row_sums))
        if np.linalg.norm(ranks - prev_ranks) < tol:
            break
    return ranks


def textrank_summarize(text, top_n=3):
    """Return an extractive summary built from the top-ranked sentences.

    The text is split after ``.``, ``!`` or ``?``; fragments of 10 characters
    or fewer are discarded. The remaining sentences are vectorized with
    TF-IDF, linked by cosine similarity and scored with PageRank. The chosen
    sentences are joined with single spaces in their original order.

    Args:
        text: Source text to summarize. ``None`` or an empty string yields "".
        top_n: Maximum number of sentences to keep.

    Returns:
        The summary string. ``text`` is returned unchanged when it contains no
        more than ``top_n`` usable sentences, or when TF-IDF cannot build a
        vocabulary from them. A non-positive ``top_n`` yields an empty string.
    """
    if not text:
        return ""

    candidates = re.split(r'(?<=[.!?])\s+', text.strip())
    sentences = [s.strip() for s in candidates if len(s.strip()) > 10]

    if len(sentences) <= top_n:
        return text  # too few sentences to rank: hand back the original text

    if top_n <= 0:
        # A negative slice bound would silently select almost every sentence.
        return ""

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no token survives tokenization (empty vocabulary).
        return text

    sim_matrix = cosine_similarity(tfidf_matrix)
    np.fill_diagonal(sim_matrix, 0)  # a sentence must not vote for itself

    scores = _pagerank(sim_matrix)
    ranked_idx = np.argsort(scores)[::-1]
    selected_idx = sorted(ranked_idx[:top_n])  # restore document order
    return ' '.join(sentences[i] for i in selected_idx)
271
+
272
+ # 4. 전체 파이프라인
273
def summarize_from_wikipedia(query, top_n=3):
    """Fetch the Wikipedia summary for ``query`` and condense it.

    The text returned by ``get_wikipedia_summary(query)`` is passed to
    ``textrank_summarize`` together with ``top_n``, and that result is returned.
    """
    return textrank_summarize(get_wikipedia_summary(query), top_n=top_n)
276
+
277
  def simple_intent_classifier(text):
278
  text = text.lower()
279
  greet_keywords = ["안녕", "반가워", "이름", "누구", "소개", "어디서 왔", "정체", "몇 살", "너 뭐야"]
 
318
  keyword = re.sub(r"(에 대해|에 대한|에 대해서)?\s*(설명해줘|알려줘|뭐야|개념|정의|정보)?", "", input_text).strip()
319
  if not keyword:
320
  return "어떤 주제에 대해 궁금한가요?"
321
+ summary = summarize_from_wikipedia(keyword)
322
  return f"{summary}\n다른 궁금한 점 있으신가요?"
323
 
324
  return generate_text_topkp(model, input_text)