Yuchan5386 committed on
Commit
5ed8573
·
verified ·
1 Parent(s): ced5c26

Create hist.txt

Browse files
Files changed (1) hide show
  1. hist.txt +56 -0
hist.txt ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # 위키 요약 관련 (Wikipedia summary helpers)
3
def extract_main_query(text):
    """Reduce free-form text to a bare search phrase taken from its last sentence.

    The text is split on ``.``, ``?`` and ``!``. The last non-empty sentence
    is kept, every character other than Hangul syllables, ASCII letters,
    digits and spaces is removed, and a fixed list of Korean particles is
    stripped from the end of each word.

    Args:
        text: Input string, e.g. a user question.

    Returns:
        The cleaned last sentence, or ``text`` unchanged when it contains no
        non-empty sentence.
    """
    sentences = [s.strip() for s in re.split(r'[.?!]\s*', text) if s.strip()]
    if not sentences:
        return text

    # Keep only Hangul syllables (U+AC00..U+D7A3), ASCII alphanumerics and
    # spaces. The range must be spelled with real Hangul characters: a
    # mis-encoded copy of this class is an invalid (descending) range and
    # makes re.sub raise re.error.
    last = re.sub(r'[^가-힣a-zA-Z0-9 ]', '', sentences[-1])

    # Applied one particle per pass, in this order, so a word carrying two
    # stacked particles (e.g. "...에서는") loses both: "는" first, then "에서".
    # `(\w+)` requires at least one character to remain, so a word that is
    # itself just a particle is left alone.
    particles = ('이', '가', '은', '는', '을', '를', '의', '에서', '에게', '한테', '보다')
    for particle in particles:
        last = re.sub(rf'\b(\w+){re.escape(particle)}\b', r'\1', last)
    return last.strip()
14
+
15
def get_wikipedia_summary(query, *, timeout=10):
    """Fetch the Korean Wikipedia summary extract for a query.

    The query is first reduced with ``extract_main_query`` and then looked up
    through the ``ko.wikipedia.org`` REST ``page/summary`` endpoint.

    Args:
        query: Free-form text; its cleaned last sentence is used as the title.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``"extract"`` field of the JSON response on HTTP 200. A Korean
        "summary not found" message is returned when that field is absent,
        and a Korean "could not fetch from Wikipedia" message on any
        non-200 status, network failure, or non-JSON body.
    """
    from urllib.parse import quote

    fetch_failed = "위키백과에서 정보를 가져올 수 없습니다."

    cleaned_query = extract_main_query(query)
    # Percent-encode the title, including "/", so characters such as "/",
    # "?" or "#" cannot change the request path. extract_main_query can
    # return its input untouched when it finds no sentence.
    title = quote(cleaned_query, safe="")
    url = f"https://ko.wikipedia.org/api/rest_v1/page/summary/{title}"

    try:
        # Without a timeout a stalled connection would block forever.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return fetch_failed

    if res.status_code != 200:
        return fetch_failed

    try:
        payload = res.json()
    except ValueError:
        # A 200 response whose body is not valid JSON.
        return fetch_failed
    return payload.get("extract", "요약 정보를 찾을 수 없습니다.")
23
+
24
def _pagerank(matrix, damping=0.85, max_iter=100, tol=1e-4):
    """Score nodes of a weighted graph by damped power iteration.

    Args:
        matrix: Square array; ``matrix[j, i]`` is the weight of edge j -> i.
        damping: Damping factor applied to the propagated rank.
        max_iter: Upper bound on the number of iterations.
        tol: Stop once the L2 change between iterations drops below this.

    Returns:
        1-D array of rank scores, one per node.
    """
    n = matrix.shape[0]
    row_sums = np.sum(matrix, axis=1)
    # Nodes with no outgoing weight: divide by 1 instead of 0.
    row_sums[row_sums == 0] = 1
    # Normalise each row once up front instead of on every iteration.
    transition = matrix / row_sums[:, np.newaxis]

    ranks = np.ones(n) / n
    for _ in range(max_iter):
        prev_ranks = ranks
        ranks = (1 - damping) / n + damping * (transition.T @ prev_ranks)
        if np.linalg.norm(ranks - prev_ranks) < tol:
            break
    return ranks


def textrank_summarize(text, top_n=3):
    """Pick the ``top_n`` highest-ranked sentences of ``text`` (TextRank).

    Sentences are obtained by splitting after ``.``, ``!`` or ``?`` followed
    by whitespace; sentences of 10 characters or fewer are ignored. The
    remaining sentences are ranked by PageRank over their TF-IDF cosine
    similarity graph, and the winners are joined with single spaces in their
    original order.

    Args:
        text: Text to summarise.
        top_n: Number of sentences to keep.

    Returns:
        The summary string. ``text`` is returned unchanged when it has no
        more than ``top_n`` usable sentences, or when no TF-IDF vocabulary
        can be built from them.
    """
    sentences = [
        s.strip()
        for s in re.split(r'(?<=[.!?])\s+', text.strip())
        if len(s.strip()) > 10
    ]
    if len(sentences) <= top_n:
        return text

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError when the sentences yield no tokens
        # (empty vocabulary); fall back the same way as for short input.
        return text

    sim_matrix = cosine_similarity(tfidf_matrix)
    # A sentence must not vote for itself.
    np.fill_diagonal(sim_matrix, 0)

    scores = _pagerank(sim_matrix)
    ranked_idx = np.argsort(scores)[::-1]
    # Restore document order among the selected sentences.
    selected_idx = sorted(ranked_idx[:top_n])
    return ' '.join(sentences[i] for i in selected_idx)
51
+
52
def summarize_from_wikipedia(query, top_n=3):
    """Fetch a Wikipedia summary for ``query`` and condense it with TextRank.

    The text returned by ``get_wikipedia_summary`` is passed through
    ``textrank_summarize`` twice, each time with the same ``top_n``.

    Args:
        query: Free-form text used to look up the Wikipedia page.
        top_n: Sentence budget forwarded to both summarisation passes.

    Returns:
        The result of the second summarisation pass.
    """
    # NOTE(review): for non-negative top_n the second pass looks like a
    # no-op, since the first pass already leaves at most top_n usable
    # sentences. Confirm before removing it.
    condensed = get_wikipedia_summary(query)
    for _ in range(2):
        condensed = textrank_summarize(condensed, top_n=top_n)
    return condensed