P2P / start.py
suntao.0
init
04aed77
raw
history blame
1.94 kB
import logging
import os
import glob
from main import generate_paper_poster
from tqdm import tqdm
import concurrent.futures
def process_papers(input_dir, output_dir, url, model, max_workers=16):
    """Generate a poster for every paper directory under *input_dir*.

    Each immediate subdirectory of ``input_dir`` is treated as one paper and
    is expected to contain a file named ``paper.pdf`` (its existence is not
    checked here). For a subdirectory named ``<id>``, the poster is written
    to ``<output_dir>/<id>/poster.json``. Papers are processed concurrently
    in a thread pool while a tqdm progress bar tracks completion.

    Args:
        input_dir: Directory whose subdirectories each hold one ``paper.pdf``.
        output_dir: Root directory for results; created if missing.
        url: Passed through unchanged to ``generate_paper_poster``.
        model: Passed through to ``generate_paper_poster`` and shown in the
            progress bar description.
        max_workers: Number of worker threads used to process papers in
            parallel. Defaults to 16, the value previously hard-coded.

    Returns:
        None. Results are produced as files under ``output_dir``.

    Raises:
        OSError: If ``input_dir`` cannot be listed or ``output_dir`` cannot
            be created.
        ValueError: If ``max_workers`` is not a positive integer (raised by
            ``ThreadPoolExecutor``).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Only directories are considered; loose files in input_dir are ignored.
    pdf_files = [
        os.path.join(input_dir, entry, "paper.pdf")
        for entry in os.listdir(input_dir)
        if os.path.isdir(os.path.join(input_dir, entry))
    ]

    def process_single_pdf(pdf_file):
        """Generate the poster for one PDF, reporting (not raising) failures."""
        try:
            # The paper id is the name of the directory containing the PDF.
            file_id = os.path.basename(os.path.dirname(pdf_file))
            poster_dir = os.path.join(output_dir, file_id)
            os.makedirs(poster_dir, exist_ok=True)

            output_file = os.path.join(poster_dir, "poster.json")
            # NOTE(review): poster.png is presumably written next to
            # poster.json by generate_paper_poster -- confirm in main.py.
            output_png = os.path.join(poster_dir, "poster.png")

            # Resume support: a paper counts as done only when both
            # artifacts exist, so partial results are regenerated.
            if os.path.exists(output_file) and os.path.exists(output_png):
                print(f"跳过已存在的文件: {output_file}")
                return

            generate_paper_poster(
                url=url,
                pdf=pdf_file,
                model=model,
                output=output_file,
                text_prompt=" ",
                figures_prompt=" ",
            )
            print(f"成功生成: {output_file}")
        except Exception as e:
            # Deliberately broad: one failing paper must not abort the batch.
            print(f"处理文件 {pdf_file} 时出错: {e}")

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_single_pdf, pdf_file) for pdf_file in pdf_files
        ]
        # Draining as_completed only advances the progress bar; the workers
        # handle their own errors, so the futures carry no result to inspect.
        for _ in tqdm(
            concurrent.futures.as_completed(futures),
            total=len(futures),
            desc=f"处理文件 {model}",
        ):
            pass
if __name__ == "__main__":
    # Script entry point: run the batch once per configured model.
    # Both `url` and `models` are empty in this file as written, so the loop
    # below does nothing until they are filled in.
    url = ""
    input_dir = "eval/data"
    models = []

    for model in models:
        # "/" in a model name is replaced with "-" so that each model gets a
        # single flat directory under eval/temp-v2.
        process_papers(
            input_dir,
            f"eval/temp-v2/{model.replace('/', '-')}",
            url,
            model,
        )