Spaces:

ASC8384
/

P2P

Running

App Files Community

ASC8384 committed 25 days ago

Commit

d4bdb14

1 Parent(s): 42bb4dc

MultiThread

Browse files

Files changed (3) hide show

main.py +24 -24
poster/figures.py +16 -4
poster/poster.py +58 -12

main.py CHANGED Viewed

@@ -39,45 +39,45 @@ def generate_paper_poster(
     figures_cap_cache = f"{pdf_stem}_figures_cap.json"
     figures = []
-    figures_cap = []
     print("开始提取图片...")
     if os.path.exists(figures_cache) and os.path.exists(figures_cap_cache):
         print(f"使用缓存的图片: {figures_cache}")
         with open(figures_cache, "r") as f:
             figures = json.load(f)
-        with open(figures_cap_cache, "r") as f:
-            figures_cap = json.load(f)
     else:
         figures_img = extract_figures(url, pdf, task="figure")
         figures_table = extract_figures(url, pdf, task="table")
-        img_caption = extract_figures(url, pdf, task="figurecaption")
-        table_caption = extract_figures(url, pdf, task="tablecaption")
-        threshold = 0.85
-        while True:
-            figures = [
-                image
-                for image, score in figures_img + figures_table
-                if score >= threshold
-            ]
-            figures_cap = [
-                image
-                for image, score in img_caption + table_caption
-                if score >= threshold
-            ]
-            print(f"{threshold:.2f} 提取到 {len(figures)} / {len(figures_cap)} 张图像")
-            if len(figures) == len(figures_cap):
-                break
-            threshold -= 0.05
         with open(figures_cache, "w") as f:
             json.dump(figures, f, ensure_ascii=False)
-        with open(figures_cap_cache, "w") as f:
-            json.dump(figures_cap, f, ensure_ascii=False)
     while True:
         try:
             result = generate_poster_v3(
-                vendor, model, text_prompt, figures_prompt, pdf, figures_cap, figures
             )
             poster = result["image_based_poster"]

     figures_cap_cache = f"{pdf_stem}_figures_cap.json"
     figures = []
+    # figures_cap = []
     print("开始提取图片...")
     if os.path.exists(figures_cache) and os.path.exists(figures_cap_cache):
         print(f"使用缓存的图片: {figures_cache}")
         with open(figures_cache, "r") as f:
             figures = json.load(f)
+        # with open(figures_cap_cache, "r") as f:
+        #     figures_cap = json.load(f)
     else:
         figures_img = extract_figures(url, pdf, task="figure")
         figures_table = extract_figures(url, pdf, task="table")
+        # img_caption = extract_figures(url, pdf, task="figurecaption")
+        # table_caption = extract_figures(url, pdf, task="tablecaption")
+        threshold = 0.75
+        # while True:
+        figures = [
+            image
+            for image, score in figures_img + figures_table
+            if score >= threshold
+        ]
+            # figures_cap = [
+            #     image
+            #     for image, score in img_caption + table_caption
+            #     if score >= threshold
+            # ]
+            # print(f"{threshold:.2f} 提取到 {len(figures)} / {len(figures_cap)} 张图像")
+            # if len(figures) == len(figures_cap):
+            #     break
+            # threshold -= 0.05
         with open(figures_cache, "w") as f:
             json.dump(figures, f, ensure_ascii=False)
+        # with open(figures_cap_cache, "w") as f:
+        #     json.dump(figures_cap, f, ensure_ascii=False)
     while True:
         try:
             result = generate_poster_v3(
+                vendor, model, text_prompt, figures_prompt, pdf, figures, figures
             )
             poster = result["image_based_poster"]

poster/figures.py CHANGED Viewed

@@ -2,6 +2,7 @@ import base64
 import requests
 import os
 from pathlib import Path
 from io import BytesIO
 from PIL import Image
@@ -31,14 +32,25 @@ def _extract_figures(
 def extract_figures(
-    url: str, pdf: str, task: str = "figure"
 ) -> list[tuple[str, float]]:
     loader = ImagePDFLoader(pdf)
     images = loader.load()
     figures = []
-    for image in images:
-        figures.extend(_extract_figures(url, image, task))
     base64_figures = []
     for figure, score in figures:
@@ -52,7 +64,7 @@ def extract_figures(
 if __name__ == "__main__":
-    url = ""
     pdf = "1.pdf"
     output_dir = Path("output")

 import requests
 import os
 from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from io import BytesIO
 from PIL import Image
 def extract_figures(
+    url: str, pdf: str, task: str = "figure", max_workers: int = 4
 ) -> list[tuple[str, float]]:
     loader = ImagePDFLoader(pdf)
     images = loader.load()
     figures = []
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_image = {
+            executor.submit(_extract_figures, url, image, task): image
+            for image in images
+        }
+        for future in as_completed(future_to_image):
+            try:
+                result = future.result()
+                figures.extend(result)
+            except Exception as exc:
+                print(f'图像处理时发生错误: {exc}')
     base64_figures = []
     for figure, score in figures:
 if __name__ == "__main__":
+    url = "https://kr4t0n--yolo-layout-detection-temp-layoutdetection-predict.modal.run"
     pdf = "1.pdf"
     output_dir = Path("output")

poster/poster.py CHANGED Viewed

@@ -6,6 +6,7 @@ import re
 import subprocess
 import time
 import cairosvg
 from PIL import Image
 from pdf2image import convert_from_path
@@ -388,8 +389,12 @@ def generate_html_v2(vendor: str, model: str, poster: BaseModel, figures: list[s
             / poster_total_size
         )
-    max_attempts = 5
-    attempt = 1
     while True:
         body = re.search(r"```html\n(.*?)\n```", output, re.DOTALL).group(1)
@@ -401,6 +406,13 @@ def generate_html_v2(vendor: str, model: str, poster: BaseModel, figures: list[s
         section_sizes = get_sizes("section", html_with_figures)
         proportion = calculate_blank_proportion(poster_sizes, section_sizes)
         if proportion <= 0.1:
             print(
                 f"Attempted {attempt} times, remaining {proportion:.0%} blank spaces."
@@ -409,10 +421,16 @@ def generate_html_v2(vendor: str, model: str, poster: BaseModel, figures: list[s
         attempt += 1
         if attempt > max_attempts:
-            raise ValueError(f"Invalid blank spaces: {proportion:.0%}")
         react = [
-            # AIMessage(""),
             HumanMessage(
                 content=f"""# Previous Body
 {body}
@@ -514,10 +532,16 @@ def generate_poster_v3(
                 model=model,
                 temperature=1,
                 max_tokens=8000,
-                # model_kwargs={
-                #     "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}
-                # },
             )
     loader = PyMuPDFLoader(pdf)
     pages = loader.load()
     paper_content = "\n".join([page.page_content for page in pages])
@@ -629,16 +653,38 @@ Paper content:
             figures_with_descriptions = f.read()
     else:
         figure_chain = figures_description_prompt | (mllm if use_claude else llm)
-        for i, figure in enumerate(tqdm(figures, desc=f"处理图片 {pdf}")):
             figure_description_response = figure_chain.invoke({"image_data": figure})
             figures_with_descriptions += f"""
 <figure_{i}>
-{figure_description_response.content}
 </figure_{i}>
 """
-            figure_list.append(
-                {"figure": figure, "description": figure_description_response.content}
-            )
         if use_claude:
             with open(figures_description_cache, "w") as f:
                 f.write(figures_with_descriptions)

 import subprocess
 import time
 import cairosvg
+from concurrent.futures import ThreadPoolExecutor
 from PIL import Image
 from pdf2image import convert_from_path
             / poster_total_size
         )
+    max_attempts = 6
+    attempt = 0
+    min_proportion = float('inf')
+    min_html = None
+    min_html_with_figures = None
     while True:
         body = re.search(r"```html\n(.*?)\n```", output, re.DOTALL).group(1)
         section_sizes = get_sizes("section", html_with_figures)
         proportion = calculate_blank_proportion(poster_sizes, section_sizes)
+        print(f"当前比例: {proportion:.0%}")
+        if proportion < min_proportion:
+            min_proportion = proportion
+            min_html = html
+            min_html_with_figures = html_with_figures
         if proportion <= 0.1:
             print(
                 f"Attempted {attempt} times, remaining {proportion:.0%} blank spaces."
         attempt += 1
         if attempt > max_attempts:
+            if min_proportion <= 0.2:
+                print(
+                    f"Reached max attempts ({max_attempts}), returning best result with {min_proportion:.0%} blank spaces."
+                )
+                return {"html": min_html, "html_with_figures": min_html_with_figures}
+            else:
+                raise ValueError(f"Invalid blank spaces: {min_proportion:.0%}")
         react = [
             HumanMessage(
                 content=f"""# Previous Body
 {body}
                 model=model,
                 temperature=1,
                 max_tokens=8000,
             )
+    elif vendor == "azure":
+        llm = AzureChatOpenAI(
+            azure_deployment=model,
+            temperature=1,
+            max_tokens=8000,
+        )
+    else:
+        raise ValueError(f"Unsupported vendor: {vendor}")
     loader = PyMuPDFLoader(pdf)
     pages = loader.load()
     paper_content = "\n".join([page.page_content for page in pages])
             figures_with_descriptions = f.read()
     else:
         figure_chain = figures_description_prompt | (mllm if use_claude else llm)
+        def process_single_figure(figure_data):
+            figure, index = figure_data
             figure_description_response = figure_chain.invoke({"image_data": figure})
+            return {
+                "index": index,
+                "figure": figure,
+                "description": figure_description_response.content
+            }
+        figure_data_list = [(figure, i) for i, figure in enumerate(figures)]
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            results = list(tqdm(
+                executor.map(process_single_figure, figure_data_list),
+                total=len(figure_data_list),
+                desc=f"处理图片 {pdf}"
+            ))
+        for result in results:
+            i = result["index"]
+            print(f"处理图片 {i} 完成")
             figures_with_descriptions += f"""
 <figure_{i}>
+{result["description"]}
 </figure_{i}>
 """
+            figure_list.append({
+                "figure": result["figure"],
+                "description": result["description"]
+            })
         if use_claude:
             with open(figures_description_cache, "w") as f:
                 f.write(figures_with_descriptions)