Spaces:
Sleeping
Sleeping
# Wikipedia summary helpers
def extract_main_query(text):
    """Reduce free-form user text to a short keyword phrase.

    The text is split into sentences on ``.``, ``?`` and ``!`` and only the
    last non-empty sentence is kept. Every character other than Hangul
    syllables, ASCII letters, digits and spaces is removed from it, and a
    fixed list of Korean particles is stripped from the end of each word.

    Args:
        text: Raw query string.

    Returns:
        The cleaned last sentence, or ``text`` itself (uncleaned) when it
        contains no non-empty sentence, e.g. empty or punctuation-only input.
    """
    parts = (part.strip() for part in re.split(r'[.?!]\s*', text))
    sentences = [part for part in parts if part]
    if not sentences:
        return text

    last = re.sub(r'[^가-힣a-zA-Z0-9 ]', '', sentences[-1])

    # Particles are removed one after another in this order, so a word such
    # as "친구에게는" loses "는" first and "에게" afterwards.
    particles = ('이', '가', '은', '는', '을', '를', '에', '에서', '에게', '한테', '보다')
    for particle in particles:
        # (\w+) keeps at least one character, so a word that consists only
        # of the particle itself is never erased.
        pattern = rf'\b(\w+){re.escape(particle)}\b'
        last = re.sub(pattern, r'\1', last)
    return last.strip()
def get_wikipedia_summary(query):
    """Fetch the Korean Wikipedia summary extract for a user query.

    The query is first reduced with ``extract_main_query`` and the result is
    used as the page title of the REST ``page/summary`` endpoint.

    Args:
        query: Raw user query string.

    Returns:
        The ``"extract"`` field of the JSON response on HTTP 200 (or a
        "summary not found" message when that field is absent). For any
        other status code, a network error, or an undecodable body, a
        "could not fetch from Wikipedia" message is returned instead.
    """
    from urllib.parse import quote

    not_found_message = "요약 정보를 찾을 수 없습니다."
    fetch_failed_message = "위키백과에서 정보를 가져올 수 없습니다."

    cleaned_query = extract_main_query(query)
    # extract_main_query can hand back the raw text untouched, so the title
    # may still contain "?", "#" or "/"; encode it so it stays one path
    # segment of the URL.
    encoded_title = quote(cleaned_query, safe='')
    url = f"https://ko.wikipedia.org/api/rest_v1/page/summary/{encoded_title}"

    try:
        # Without a timeout the call can block indefinitely.
        res = requests.get(url, timeout=10)
    except requests.RequestException:
        return fetch_failed_message

    if res.status_code != 200:
        return fetch_failed_message

    try:
        payload = res.json()
    except ValueError:
        # Body was not valid JSON.
        return fetch_failed_message
    return payload.get("extract", not_found_message)
def textrank_summarize(text, top_n=3):
    """Pick the ``top_n`` highest-ranked sentences of ``text``.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace;
    sentences of 10 characters or fewer are ignored. The remaining ones are
    vectorized with TF-IDF, compared by cosine similarity, and scored by a
    PageRank iteration over the similarity matrix. The best ``top_n``
    sentences are returned in their original order, joined by single spaces.

    Args:
        text: Text to summarize.
        top_n: Number of sentences to keep.

    Returns:
        The summary string, or ``text`` unchanged when it has no more than
        ``top_n`` usable sentences or when no TF-IDF vocabulary can be built.
    """
    stripped = (s.strip() for s in re.split(r'(?<=[.!?])\s+', text.strip()))
    sentences = [s for s in stripped if len(s) > 10]
    if len(sentences) <= top_n:
        return text

    def pagerank(matrix, damping=0.85, max_iter=100, tol=1e-4):
        """Score nodes of a weighted graph given as a square matrix.

        Each node's rank is spread over its row in proportion to the edge
        weights; iteration stops after ``max_iter`` rounds or once the rank
        vector moves by less than ``tol`` (Euclidean norm).
        """
        n = matrix.shape[0]
        ranks = np.ones(n) / n
        row_sums = np.sum(matrix, axis=1)
        # Rows with no outgoing weight would divide by zero; their entries
        # are all zero anyway, so any non-zero divisor gives the same result.
        row_sums[row_sums == 0] = 1
        for _ in range(max_iter):
            prev_ranks = ranks
            # Entry i of matrix.T @ v is sum_j matrix[j, i] * v[j], i.e. the
            # weight flowing into node i from every node j.
            ranks = (1 - damping) / n + damping * (matrix.T @ (prev_ranks / row_sums))
            if np.linalg.norm(ranks - prev_ranks) < tol:
                break
        return ranks

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no token survives tokenization (empty vocabulary);
        # there is nothing to rank, so hand the text back untouched.
        return text

    sim_matrix = cosine_similarity(tfidf_matrix)
    # A sentence must not vote for itself.
    np.fill_diagonal(sim_matrix, 0)

    scores = pagerank(sim_matrix)
    best_first = np.argsort(scores)[::-1]
    # Re-sort the winners by position so the summary reads in source order.
    chosen = sorted(best_first[:top_n])
    return ' '.join(sentences[i] for i in chosen)
def summarize_from_wikipedia(query, top_n=3):
    """Look up ``query`` on Wikipedia and condense the result.

    The text returned by ``get_wikipedia_summary`` (which may also be one of
    its failure messages) is passed through ``textrank_summarize`` twice with
    the same ``top_n``.

    Args:
        query: Raw user query string.
        top_n: Number of sentences to keep in each summarization pass.

    Returns:
        The output of the second summarization pass.
    """
    raw_summary = get_wikipedia_summary(query)
    first_summary = textrank_summarize(raw_summary, top_n=top_n)
    # NOTE(review): the first pass already yields at most top_n sentences, so
    # this second pass looks like a no-op for top_n >= 1 -- confirm before
    # removing it.
    second_summary = textrank_summarize(first_summary, top_n=top_n)
    return second_summary