chenyingli committed on
Commit
b57a642
·
verified ·
1 Parent(s): 2127284

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -0
app.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import requests
4
+ import dotenv
5
+ from bs4 import BeautifulSoup
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import time
9
+
10
+ # 加载环境变量(其中需要包含 S2_API_KEY)
11
+ dotenv.load_dotenv()
12
+
13
def get_papers_since(url, since_year=2023):
    """
    Scrape a Google Scholar citations page and return papers published in
    `since_year` or later.

    Parameters
    ----------
    url : str
        A Google Scholar profile URL, e.g.
        "https://scholar.google.com/citations?user=...".
    since_year : int, optional
        Earliest publication year to keep (default 2023).

    Returns
    -------
    list[dict]
        One dict per paper: {"title": str, "year": int}.

    Notes
    -----
    Network errors and non-200 responses terminate pagination early and
    return whatever has been collected so far (best-effort).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    papers = []
    start = 0
    pagesize = 20  # Google Scholar typically shows 20 records per page

    while True:
        params = {"cstart": start, "pagesize": pagesize}
        try:
            # FIX: requests has no default timeout, so a stalled connection
            # would hang the scraper forever; also catch connection errors
            # instead of crashing mid-pagination.
            response = requests.get(url, params=params, headers=headers,
                                    timeout=15)
        except requests.RequestException:
            break
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        rows = soup.find_all("tr", class_="gsc_a_tr")
        if not rows:
            break

        for row in rows:
            title_tag = row.find("a", class_="gsc_a_at")
            title = title_tag.text.strip() if title_tag else "Unknown Title"
            year_tag = row.find("span", class_="gsc_a_h")
            try:
                year = int(year_tag.text.strip()) if year_tag else None
            except ValueError:
                # Year cell can be empty or non-numeric; treat as unknown.
                year = None

            if year and year >= since_year:
                papers.append({"title": title, "year": year})

        # NOTE(review): this relies on the "show more" button appearing in
        # the static HTML without a "disabled" attribute — the real button
        # is JavaScript-driven, so confirm pagination works as expected.
        next_button = soup.find("button", id="gsc_bpf_more")
        if next_button and "disabled" not in next_button.attrs:
            start += pagesize
        else:
            break

    return papers
57
+
58
def search_paper_by_title(paper_title: str):
    """
    Look up a paper on the Semantic Scholar "match" endpoint and return
    the top-1 match for the given title.

    Parameters
    ----------
    paper_title : str
        The title to match against.

    Returns
    -------
    dict | None
        {"paperId", "title", "abstract", "venue"} for the best match, or
        None when no paper is found (HTTP 404 or an empty result list).

    Raises
    ------
    Exception
        For any other non-200 HTTP status (callers catch broadly).
    """
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search/match"
    params = {
        "query": paper_title,
        "fields": "paperId,abstract,title,venue"
    }
    headers = {
        # Read from the environment; requests drops None-valued headers,
        # so an unset S2_API_KEY simply means an unauthenticated request.
        "x-api-key": os.getenv("S2_API_KEY")
    }

    # FIX: add a timeout so a stalled API call cannot hang the pipeline
    # (requests.get blocks indefinitely by default).
    response = requests.get(base_url, params=params, headers=headers,
                            timeout=15)

    if response.status_code == 404:
        # The match endpoint returns 404 when no title matches.
        return None
    elif response.status_code != 200:
        raise Exception(f"错误 {response.status_code}: {response.text}")

    data_list = response.json().get("data", [])
    if not data_list:
        return None

    paper_data = data_list[0]
    return {
        "paperId": paper_data.get("paperId"),
        "title": paper_data.get("title"),
        "abstract": paper_data.get("abstract"),
        "venue": paper_data.get("venue")
    }
94
+
95
def process_profiles(profiles_text, wechat, progress=None):
    """
    Streaming Gradio handler that collects papers and produces a CSV.

    Steps:
      1. Split the newline-separated Google Scholar profile links.
      2. Scrape each profile for papers from 2023 onward.
      3. Query Semantic Scholar per title; drop papers with no match.
      4. Deduplicate by paperId.
      5. Write the result to "<wechat>.csv" and yield its path.

    Parameters
    ----------
    profiles_text : str
        Newline-separated Google Scholar profile URLs.
    wechat : str
        WeChat ID; used as the CSV file name.
    progress : optional
        Unused; kept for interface compatibility.

    Yields
    ------
    tuple[str, str | None]
        (accumulated log text, CSV path — None while still running).
    """
    log_messages = ["开始处理..."]

    def update_log(message):
        # Accumulate messages so the Gradio textbox shows the full history.
        log_messages.append(message)
        return "\n".join(log_messages)

    # BUG FIX: this function is a generator (it contains `yield`), so a
    # plain `return value` only sets StopIteration.value — Gradio would
    # never display the error text. Errors must be yielded, then return.
    if not profiles_text.strip():
        yield update_log("错误: 未提供任何链接"), None
        return

    if not wechat.strip():
        yield update_log("错误: 未提供微信号"), None
        return

    # Split the user input on newlines, skipping blank lines.
    profile_urls = [line.strip() for line in profiles_text.splitlines() if line.strip()]

    message = f"已识别 {len(profile_urls)} 个档案链接,开始处理..."
    yield update_log(message), None

    all_papers = []
    for i, url in enumerate(profile_urls):
        message = f"处理第 {i+1}/{len(profile_urls)} 个档案: {url}"
        yield update_log(message), None

        papers = get_papers_since(url, 2023)
        all_papers.extend(papers)

        message = f"已从档案 {i+1}/{len(profile_urls)} 中收集 {len(papers)} 篇论文"
        yield update_log(message), None

    message = f"共收集到 {len(all_papers)} 篇论文,开始获取详细信息..."
    yield update_log(message), None

    paperid_map = {}
    total_papers = len(all_papers)

    for i, paper in enumerate(all_papers):
        title = paper["title"]
        year = paper["year"]

        # Report progress every 10 papers (and on the last one).
        if i % 10 == 0 or i == total_papers - 1:
            percent = round((i+1)/total_papers*100)
            message = f"正在处理论文 {i+1}/{total_papers} ({percent}% 完成)"
            yield update_log(message), None

        # Throttle Semantic Scholar calls to stay under rate limits.
        if i > 0 and i % 10 == 0:
            time.sleep(1)

        try:
            paper_info = search_paper_by_title(title)
        except Exception:
            # Best-effort: skip papers whose lookup failed.
            continue

        if not paper_info or not paper_info.get("paperId"):
            continue

        pid = paper_info["paperId"]
        # First occurrence wins — deduplicate by paperId.
        if pid not in paperid_map:
            paperid_map[pid] = {
                "paperId": pid,
                "title": paper_info["title"],
                "abstract": paper_info["abstract"],
                "venue": paper_info["venue"],
                "year": year
            }

    if not paperid_map:
        # BUG FIX (same as above): yield the error instead of returning it.
        yield update_log("错误: 未找到任何匹配的论文"), None
        return

    message = f"去重后共有 {len(paperid_map)} 篇论文,正在生成CSV..."
    yield update_log(message), None

    df = pd.DataFrame(list(paperid_map.values()))

    # Use the WeChat ID as the CSV file name.
    temp_csv_path = f"{wechat.strip()}.csv"
    df.to_csv(temp_csv_path, index=False, encoding='utf-8')

    message = f"任务完成! 已生成包含 {len(paperid_map)} 篇论文的CSV文件: {temp_csv_path}"
    yield update_log(message), temp_csv_path
183
+
184
def build_app():
    """
    Assemble the Gradio demo UI and wire the run button to the scraper.

    Returns
    -------
    gr.Blocks
        The constructed (not yet launched) demo app.
    """
    with gr.Blocks() as demo:
        # Title and usage instructions (component creation order defines
        # the page layout, so it must stay as-is).
        gr.Markdown("## Google Scholar & Semantic Scholar 信息整合工具")
        gr.Markdown("在下方输入任意多个 Google Scholar Profile 链接,每行一个,然后输入微信号,点击 **开始爬取**。")

        # Multi-line box for the profile links, one URL per line.
        profiles_box = gr.Textbox(
            lines=5,
            placeholder="粘贴或输入多个 Google Scholar 个人页面链接(每行一个,需要下面格式 'https://scholar.google.com/citations?user=NVii64oAAAAJ')"
        )
        # WeChat-ID input (used as the CSV file name by the handler).
        wechat_box = gr.Textbox(label="微信号", placeholder="请输入您的微信号")

        # Read-only log area that the streaming handler keeps appending to.
        log_box = gr.Textbox(
            label="进度",
            value="等待开始...",
            lines=10,
            interactive=False
        )

        # Download slot for the generated CSV.
        csv_file = gr.File(label="下载结果 CSV")

        start_btn = gr.Button("开始爬取")

        # Wire both inputs into the generator handler; it streams
        # (log_text, csv_path) tuples into the two outputs.
        start_btn.click(
            fn=process_profiles,
            inputs=[profiles_box, wechat_box],
            outputs=[log_box, csv_file]
        )

    return demo
221
+
222
+ if __name__ == "__main__":
223
+ app = build_app()
224
+ app.launch()