Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import io
|
3 |
+
import requests
|
4 |
+
import dotenv
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
import time
|
9 |
+
|
10 |
+
# Load environment variables (must include S2_API_KEY)
|
11 |
+
dotenv.load_dotenv()
|
12 |
+
|
13 |
+
def get_papers_since(url, since_year=2023):
    """Scrape a Google Scholar citations page for recent papers.

    Parameters
    ----------
    url : str
        Google Scholar profile URL, e.g.
        ``https://scholar.google.com/citations?user=XXXX``.
    since_year : int, optional
        Earliest publication year to keep (inclusive). Defaults to 2023.

    Returns
    -------
    list[dict]
        One ``{"title": str, "year": int}`` dict per paper published in
        ``since_year`` or later. Rows whose year cannot be parsed are
        dropped. If a page request fails, returns whatever was collected
        so far instead of raising.
    """
    headers = {
        # A desktop User-Agent lowers the chance of Scholar serving a
        # bot-detection page instead of the citation table.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    papers = []
    start = 0
    pagesize = 20  # Google Scholar typically shows 20 rows per page

    while True:
        params = {"cstart": start, "pagesize": pagesize}
        try:
            # FIX: add a timeout so a stalled connection cannot hang the
            # whole job, and turn network errors into a graceful stop.
            response = requests.get(url, params=params, headers=headers,
                                    timeout=30)
        except requests.RequestException:
            break  # network error: best effort, return what we have
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        rows = soup.find_all("tr", class_="gsc_a_tr")
        if not rows:
            break

        for row in rows:
            title_tag = row.find("a", class_="gsc_a_at")
            title = title_tag.text.strip() if title_tag else "Unknown Title"
            year_tag = row.find("span", class_="gsc_a_h")
            try:
                year = int(year_tag.text.strip()) if year_tag else None
            except ValueError:
                # Some rows carry no year or non-numeric text
                year = None

            if year and year >= since_year:
                papers.append({"title": title, "year": year})

        # The "Show more" button is disabled once every row is listed;
        # keep paging only while it is still enabled.
        next_button = soup.find("button", id="gsc_bpf_more")
        if next_button and "disabled" not in next_button.attrs:
            start += pagesize
        else:
            break

    return papers
|
57 |
+
|
58 |
+
def search_paper_by_title(paper_title: str):
    """Find the top-1 match for a title via Semantic Scholar.

    Queries the ``/graph/v1/paper/search/match`` endpoint.

    Parameters
    ----------
    paper_title : str
        The paper title to match against.

    Returns
    -------
    dict | None
        ``{"paperId", "title", "abstract", "venue"}`` for the best
        match, or ``None`` when nothing matches (HTTP 404 or an empty
        result list).

    Raises
    ------
    Exception
        For any other non-200 HTTP status (message includes status code
        and response body).
    """
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search/match"
    params = {
        "query": paper_title,
        "fields": "paperId,abstract,title,venue"
    }
    headers = {
        # S2_API_KEY is expected in the environment (loaded from .env);
        # NOTE(review): if it is unset, requests drops the None-valued
        # header and the call proceeds unauthenticated (rate-limited).
        "x-api-key": os.getenv("S2_API_KEY")
    }

    # FIX: timeout prevents an indefinite hang on a stalled connection.
    response = requests.get(base_url, params=params, headers=headers,
                            timeout=30)

    if response.status_code == 404:
        # The match endpoint answers 404 when no paper matches the title.
        return None
    elif response.status_code != 200:
        raise Exception(f"错误 {response.status_code}: {response.text}")

    data_list = response.json().get("data", [])
    if not data_list:
        return None

    paper_data = data_list[0]
    return {
        "paperId": paper_data.get("paperId"),
        "title": paper_data.get("title"),
        "abstract": paper_data.get("abstract"),
        "venue": paper_data.get("venue")
    }
|
94 |
+
|
95 |
+
def process_profiles(profiles_text, wechat, progress=None):
    """Generator driving the Gradio UI: scrape, match, de-dup, export CSV.

    Steps:
      1. Split the user-supplied Google Scholar profile links
         (one per line) into a list.
      2. Scrape each profile for papers from 2023 onwards.
      3. Resolve each paper via Semantic Scholar; drop papers with no
         match.
      4. De-duplicate by ``paperId``.
      5. Write a CSV named after the WeChat id and yield its path.

    Parameters
    ----------
    profiles_text : str
        Newline-separated Google Scholar profile URLs.
    wechat : str
        WeChat id; doubles as the output CSV file name.
    progress : optional
        Unused; kept for interface compatibility with the Gradio caller.

    Yields
    ------
    tuple[str, str | None]
        ``(log_text, csv_path_or_None)`` pairs consumed by Gradio as
        (progress textbox, download file) updates.
    """
    log_messages = ["开始处理..."]

    def update_log(message):
        # Accumulate messages so the textbox shows the full history.
        log_messages.append(message)
        return "\n".join(log_messages)

    # BUG FIX: this function is a generator (it yields below), so a bare
    # ``return value`` never reaches Gradio -- it only sets
    # StopIteration.value and the user sees nothing. Error messages must
    # be *yielded* before returning.
    if not profiles_text.strip():
        yield update_log("错误: 未提供任何链接"), None
        return

    if not wechat.strip():
        yield update_log("错误: 未提供微信号"), None
        return

    # One profile URL per non-blank line
    profile_urls = [line.strip() for line in profiles_text.splitlines() if line.strip()]

    yield update_log(f"已识别 {len(profile_urls)} 个档案链接,开始处理..."), None

    all_papers = []
    for i, url in enumerate(profile_urls):
        yield update_log(f"处理第 {i+1}/{len(profile_urls)} 个档案: {url}"), None

        papers = get_papers_since(url, 2023)
        all_papers.extend(papers)

        yield update_log(f"已从档案 {i+1}/{len(profile_urls)} 中收集 {len(papers)} 篇论文"), None

    yield update_log(f"共收集到 {len(all_papers)} 篇论文,开始获取详细信息..."), None

    paperid_map = {}
    total_papers = len(all_papers)

    for i, paper in enumerate(all_papers):
        title = paper["title"]
        year = paper["year"]

        # Report progress every 10 papers and on the final one.
        if i % 10 == 0 or i == total_papers - 1:
            percent = round((i + 1) / total_papers * 100)
            yield update_log(f"正在处理论文 {i+1}/{total_papers} ({percent}% 完成)"), None

        # Throttle roughly one pause per ten lookups to stay under the
        # Semantic Scholar rate limit.
        if i > 0 and i % 10 == 0:
            time.sleep(1)

        try:
            paper_info = search_paper_by_title(title)
        except Exception:
            # Best-effort: skip papers whose lookup errored out.
            continue

        if not paper_info or not paper_info.get("paperId"):
            continue

        pid = paper_info["paperId"]
        if pid not in paperid_map:
            paperid_map[pid] = {
                "paperId": pid,
                "title": paper_info["title"],
                "abstract": paper_info["abstract"],
                "venue": paper_info["venue"],
                "year": year
            }

    if not paperid_map:
        # Same generator caveat as above: yield the error, then stop.
        yield update_log("错误: 未找到任何匹配的论文"), None
        return

    yield update_log(f"去重后共有 {len(paperid_map)} 篇论文,正在生成CSV..."), None

    df = pd.DataFrame(list(paperid_map.values()))

    # The WeChat id doubles as the CSV file name.
    temp_csv_path = f"{wechat.strip()}.csv"
    df.to_csv(temp_csv_path, index=False, encoding='utf-8')

    yield update_log(f"任务完成! 已生成包含 {len(paperid_map)} 篇论文的CSV文件: {temp_csv_path}"), temp_csv_path
|
183 |
+
|
184 |
+
def build_app():
    """Assemble the Gradio demo UI and return the Blocks app."""
    with gr.Blocks() as demo:
        gr.Markdown("## Google Scholar & Semantic Scholar 信息整合工具")
        gr.Markdown("在下方输入任意多个 Google Scholar Profile 链接,每行一个,然后输入微信号,点击 **开始爬取**。")

        # Multi-line box: one Google Scholar profile URL per line.
        scholar_links_box = gr.Textbox(
            lines=5,
            placeholder="粘贴或输入多个 Google Scholar 个人页面链接(每行一个,需要下面格式 'https://scholar.google.com/citations?user=NVii64oAAAAJ')"
        )
        # WeChat id box; the id later doubles as the CSV file name.
        wechat_box = gr.Textbox(
            label="微信号",
            placeholder="请输入您的微信号"
        )

        # Read-only log area that process_profiles streams updates into.
        log_box = gr.Textbox(
            label="进度",
            value="等待开始...",
            lines=10,
            interactive=False
        )

        csv_file = gr.File(label="下载结果 CSV")

        start_btn = gr.Button("开始爬取")

        # Wire the button to the generator; it streams (log, file) pairs
        # into the two outputs.
        start_btn.click(
            fn=process_profiles,
            inputs=[scholar_links_box, wechat_box],
            outputs=[log_box, csv_file]
        )

    return demo
|
221 |
+
|
222 |
+
if __name__ == "__main__":
    # Launch the Gradio demo when executed as a script.
    build_app().launch()
|