#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Mistral OCR 匯出工具
本程式可將 PDF 文件自動化轉換為 Markdown 格式,包含以下流程:
1. 使用 Mistral OCR 模型辨識 PDF 內文與圖片
2. 將辨識結果組成含圖片的 Markdown 檔
3. 使用 Gemini 模型將英文內容翻譯為台灣繁體中文
4. 匯出 Markdown 檔(原文版 + 翻譯版)與對應圖片
新增功能:
- 處理過程中的檢查點,可以保存中間結果
- Gradio 介面,方便調整參數和選擇輸出格式
"""
# Standard libraries
import os
import json
import base64
import time
import tempfile  # Used to create per-run temporary output/checkpoint directories
from pathlib import Path
import pickle
import certifi
import shutil # Added for zipping images
os.environ["SSL_CERT_FILE"] = certifi.where()
# Third-party libraries
from IPython.display import Markdown, display
from pydantic import BaseModel
from dotenv import load_dotenv
import gradio as gr
# Mistral AI
from mistralai import Mistral
from mistralai.models import OCRResponse, ImageURLChunk, DocumentURLChunk, TextChunk
# Google Gemini
from google import genai
from google.genai import types
# OpenAI
# Import the library (add 'openai' to requirements.txt)
try:
from openai import OpenAI
except ImportError:
print("⚠️ OpenAI library not found. Please install it: pip install openai")
OpenAI = None # Set to None if import fails
# ===== Pydantic Models =====
class StructuredOCR(BaseModel):
file_name: str
topics: list[str]
languages: str
ocr_contents: dict
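# A minimal sketch (never called) of how a structured-OCR payload could be validated
# with the StructuredOCR model above; the payload values are hypothetical.
def _example_structured_ocr_validation():
    payload = {
        "file_name": "sample.png",
        "topics": ["invoice", "totals"],
        "languages": "English",
        "ocr_contents": {"title": "Invoice #42", "total": "100.00"},
    }
    parsed = StructuredOCR(**payload)  # Raises a pydantic ValidationError if fields are missing
    print(parsed.ocr_contents["title"])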
# ===== Utility Functions =====
def retry_with_backoff(func, retries=5, base_delay=1.5):
"""Retry a function with exponential backoff."""
for attempt in range(retries):
try:
return func()
except Exception as e:
if "429" in str(e):
wait_time = base_delay * (2 ** attempt)
print(f"⚠️ API rate limit hit. Retrying in {wait_time:.1f}s...")
time.sleep(wait_time)
else:
raise e
raise RuntimeError("❌ Failed after multiple retries.")
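# Usage sketch for retry_with_backoff: wrap a zero-argument callable (e.g. a lambda or
# closure) so 429 rate-limit errors are retried with exponential backoff. The OCR call
# shown here is illustrative only; the client and signed URL are assumed to exist.
def _example_retry_usage(mistral_client, signed_url):
    return retry_with_backoff(
        lambda: mistral_client.ocr.process(
            document=DocumentURLChunk(document_url=signed_url),
            model="mistral-ocr-latest",
        ),
        retries=4,
    )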
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Replace image placeholders in markdown with base64-encoded images."""
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str
def get_combined_markdown(ocr_response: OCRResponse) -> str:
"""Combine OCR text and images into a single markdown document."""
markdowns: list[str] = []
for page in ocr_response.pages:
image_data = {img.id: img.image_base64 for img in page.images}
markdowns.append(replace_images_in_markdown(page.markdown, image_data))
return "\n\n".join(markdowns)
def insert_ocr_below_images(markdown_str, ocr_img_map, page_idx):
    """Insert OCR results below images in markdown."""
    for img_id, ocr_text in ocr_img_map.get(page_idx, {}).items():
        markdown_str = markdown_str.replace(
            f"![{img_id}]({img_id})",
            f"![{img_id}]({img_id})\n\n> 📄 Image OCR Result:\n\n```json\n{ocr_text}\n```"
        )
    return markdown_str
def save_images_and_replace_links(markdown_str, images_dict, page_idx, image_folder="images"):
    """Save base64 images to files and update markdown links."""
    os.makedirs(image_folder, exist_ok=True)
    image_id_to_path = {}
    for i, (img_id, base64_str) in enumerate(images_dict.items()):
        img_bytes = base64.b64decode(base64_str.split(",")[-1])
        # Relative path kept in the markdown link: only the folder name and file name
        img_path = f"{os.path.basename(image_folder)}/page_{page_idx+1}_img_{i+1}.png"
        # Full path used for actually writing the file to disk
        full_img_path = os.path.join(image_folder, f"page_{page_idx+1}_img_{i+1}.png")
        with open(full_img_path, "wb") as f:
            f.write(img_bytes)
        image_id_to_path[img_id] = img_path
    for img_id, img_path in image_id_to_path.items():
        markdown_str = markdown_str.replace(
            f"![{img_id}]({img_id})", f"![{img_id}]({img_path})"
        )
    return markdown_str
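# A small, self-contained sketch of the image-saving helper above. The "image" here is
# placeholder bytes (not a real PNG), only to show how the markdown link is rewritten
# from the image id to a relative file path; names and folder are hypothetical.
def _example_save_images():
    fake_png = base64.b64encode(b"not really a png").decode()
    md = "Intro text\n\n![img-0.jpeg](img-0.jpeg)\n\nMore text"
    images = {"img-0.jpeg": f"data:image/png;base64,{fake_png}"}
    return save_images_and_replace_links(md, images, page_idx=0, image_folder="images_demo")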
# ===== Translation Functions =====
# Default translation system prompt
DEFAULT_TRANSLATION_SYSTEM_INSTRUCTION = """
你是一位專業的技術文件翻譯者。請將我提供的英文 Markdown 內容翻譯成**台灣繁體中文**。
**核心要求:**
1. **翻譯所有英文文字:** 你的主要工作是翻譯內容中的英文敘述性文字(段落、列表、表格等)。
2. **保持結構與程式碼不變:**
* **不要**更改任何 Markdown 標記(如 `#`, `*`, `-`, `[]()`, `![]()`, ``` ```, ` `` `, `---`)。
* **不要**翻譯或修改程式碼區塊 (``` ... ```) 和行內程式碼 (`code`) 裡的任何內容。
* 若有 JSON,**不要**更改鍵(key),僅翻譯字串值(value)。
3. **處理專有名詞:** 對於普遍接受的英文技術術語、縮寫或專有名詞(例如 API, SDK, CPU, Google, Python 等),傾向於**保留英文原文**。但請確保翻譯了其他所有非術語的常規英文文字。
4. **直接輸出結果:** 請直接回傳翻譯後的完整 Markdown 文件,不要添加任何額外說明。
"""
# The translation model name decides whether the Gemini or the OpenAI client is used.
def translate_markdown_pages(pages, gemini_client, openai_client, model="gemini-2.0-flash", system_instruction=None):
"""Translate markdown pages using the selected API (Gemini or OpenAI). Yields progress strings and translated page content."""
if system_instruction is None:
system_instruction = DEFAULT_TRANSLATION_SYSTEM_INSTRUCTION
# No longer collecting in a list here, will yield pages directly
total_pages = len(pages) # Get total pages for progress
for idx, page in enumerate(pages):
progress_message = f"🔁 正在翻譯第 {idx+1} / {total_pages} 頁..."
print(progress_message) # Print to console
yield progress_message # Yield progress string for Gradio log
try:
if model.startswith("gpt-"):
# --- OpenAI Translation Logic ---
if not openai_client:
error_msg = f"⚠️ OpenAI client not initialized for translation model {model}. Skipping page {idx+1}."
print(error_msg)
yield error_msg
yield f"--- ERROR: OpenAI Client Error for Page {idx+1} ---\n\n{page}"
continue # Skip to next page
print(f" - Translating using OpenAI model: {model}")
try:
# Construct messages for OpenAI translation
# Use the provided system_instruction as the system message
messages = [
{"role": "system", "content": system_instruction},
{"role": "user", "content": page}
]
response = openai_client.chat.completions.create(
model=model,
messages=messages,
temperature=0.1 # Lower temperature for more deterministic translation
)
translated_md = response.choices[0].message.content.strip()
except Exception as openai_e:
error_msg = f"⚠️ OpenAI 翻譯第 {idx+1} / {total_pages} 頁失敗:{openai_e}"
print(error_msg)
yield error_msg # Yield error string to Gradio log
yield f"--- ERROR: OpenAI Translation Failed for Page {idx+1} ---\n\n{page}"
continue # Skip to next page
elif model.startswith("gemini"):
# --- Gemini Translation Logic ---
print(f" - Translating using Gemini model: {model}")
response = gemini_client.models.generate_content(
model=model,
config=types.GenerateContentConfig(
system_instruction=system_instruction
),
contents=page
)
translated_md = response.text.strip()
else:
# --- Unsupported Model ---
error_msg = f"⚠️ Unsupported translation model: {model}. Skipping page {idx+1}."
print(error_msg)
yield error_msg
yield f"--- ERROR: Unsupported Translation Model for Page {idx+1} ---\n\n{page}"
continue # Skip to next page
# --- Yield successful translation ---
# translated_pages.append(translated_md) # Removed duplicate append
yield translated_md # Yield the actual translated page content
except Exception as e:
error_msg = f"⚠️ 翻譯第 {idx+1} / {total_pages} 頁失敗:{e}"
print(error_msg)
yield error_msg # Yield error string to Gradio log
# Yield error marker instead of translated content
yield f"--- ERROR: Translation Failed for Page {idx+1} ---\n\n{page}"
final_message = f"✅ 翻譯完成 {total_pages} 頁。"
yield final_message # Yield final translation status string
print(final_message) # Print final translation status
# No return needed for a generator yielding results
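# Sketch of consuming the translate_markdown_pages generator outside of Gradio. It yields
# both status strings (prefixed with 🔁/⚠️/✅) and translated page bodies, so the caller
# separates the two, mirroring what process_pdf_to_markdown does below.
def _example_collect_translations(pages, gemini_client):
    translated = []
    for item in translate_markdown_pages(pages, gemini_client, openai_client=None):
        if isinstance(item, str) and item.startswith(("🔁", "⚠️", "✅")):
            print(item)              # progress / status message
        else:
            translated.append(item)  # translated page (or error marker page)
    return translated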
# ===== PDF Processing Functions =====
def process_pdf_with_mistral_ocr(pdf_path, client, model="mistral-ocr-latest"):
"""Process PDF with Mistral OCR."""
pdf_file = Path(pdf_path)
# Upload to mistral
uploaded_file = client.files.upload(
file={
"file_name": pdf_file.stem,
"content": pdf_file.read_bytes(),
},
purpose="ocr"
)
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
# OCR analyze PDF
pdf_response = client.ocr.process(
document=DocumentURLChunk(document_url=signed_url.url),
model=model,
include_image_base64=True
)
return pdf_response
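# A minimal calling sketch, assuming a MISTRAL_API_KEY environment variable is set and
# "sample.pdf" exists locally; both names are placeholders.
def _example_pdf_ocr():
    client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
    response = process_pdf_with_mistral_ocr("sample.pdf", client)
    print(f"OCR returned {len(response.pages)} pages")
    return response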
# structure_text_only controls whether only the extracted OCR text (without the image itself) is sent to the structuring model.
def process_images_with_ocr(pdf_response, mistral_client, gemini_client, openai_client, structure_model="pixtral-12b-latest", structure_text_only=False):
"""Process images from PDF pages with OCR and structure using the specified model."""
image_ocr_results = {}
for page_idx, page in enumerate(pdf_response.pages):
for i, img in enumerate(page.images):
base64_data_url = img.image_base64
# Extract raw base64 data for Gemini
try:
# Handle potential variations in data URL prefix
if ',' in base64_data_url:
base64_content = base64_data_url.split(',', 1)[1]
else:
# Assume it's just the base64 content if no comma prefix
base64_content = base64_data_url
# Decode and re-encode to ensure it's valid base64 bytes for Gemini
image_bytes = base64.b64decode(base64_content)
except Exception as e:
print(f"⚠️ Error decoding base64 for page {page_idx+1}, image {i+1}: {e}. Skipping image.")
continue # Skip this image if base64 is invalid
def run_ocr_and_parse():
# Step 1: Basic OCR (always use Mistral OCR for initial text extraction)
print(f" - Performing basic OCR on page {page_idx+1}, image {i+1}...")
image_response = mistral_client.ocr.process(
document=ImageURLChunk(image_url=base64_data_url),
model="mistral-ocr-latest" # Use the dedicated OCR model here
)
image_ocr_markdown = image_response.pages[0].markdown
print(f" - Basic OCR text extracted.")
# Step 2: Structure the OCR markdown using the selected model
print(f" - Structuring OCR using: {structure_model}")
if structure_model == "pixtral-12b-latest":
print(f" - Using Mistral Pixtral...")
print(f" - Sending request to Pixtral API...") # Added print statement
structured = mistral_client.chat.parse(
model=structure_model, # Use the selected structure_model
messages=[
{
"role": "user",
"content": [
ImageURLChunk(image_url=base64_data_url),
TextChunk(text=(
f"This is the image's OCR in markdown:\n{image_ocr_markdown}\n. "
"Convert this into a structured JSON response with the OCR contents in a sensible dictionary."
))
]
}
],
response_format=StructuredOCR, # Use Pydantic model for expected structure
temperature=0
)
structured_data = structured.choices[0].message.parsed
pretty_text = json.dumps(structured_data.ocr_contents, indent=2, ensure_ascii=False)
elif structure_model.startswith("gemini"): # Handle gemini-flash-2.0 etc.
print(f" - Using Google Gemini ({structure_model})...")
# Define the base prompt text
base_prompt_text = f"""
You are an expert OCR structuring assistant. Your goal is to extract and structure the relevant content into a JSON object based on the provided information.
**Initial OCR Markdown:**
```markdown
{image_ocr_markdown}
```
**Task:**
Generate a JSON object containing the structured OCR content found in the image. Focus on extracting meaningful information and organizing it logically within the JSON. The JSON should represent the `ocr_contents` field.
**Output Format:**
Return ONLY the JSON object, without any surrounding text or markdown formatting. Example:
```json
{{
"title": "Example Title",
"sections": [
{{"header": "Section 1", "content": "Details..."}},
{{"header": "Section 2", "content": "More details..."}}
],
"key_value_pairs": {{
"key1": "value1",
"key2": "value2"
}}
}}
```
(Adapt the structure based on the image content.)
"""
# Prepare API call based on structure_text_only flag
gemini_contents = []
if structure_text_only:
print(" - Mode: Text-only structuring")
# Modify prompt slightly for text-only
gemini_prompt = base_prompt_text.replace(
"Analyze the provided image and the initial OCR text",
"Analyze the initial OCR text"
).replace(
"content from the image",
"content from the text"
)
gemini_contents.append(gemini_prompt)
else:
print(" - Mode: Image + Text structuring")
gemini_prompt = base_prompt_text # Use original prompt
# Prepare image part for Gemini using types.Part.from_bytes
# Assuming PNG, might need dynamic type detection in the future
# Pass the decoded image_bytes, not the base64_content string
try: # Corrected indentation
image_part = types.Part.from_bytes(
mime_type="image/png",
data=image_bytes
)
gemini_contents = [gemini_prompt, image_part] # Text prompt first, then image Part
except Exception as e:
print(f" - ⚠️ Error creating Gemini image Part: {e}. Skipping image structuring.")
# Fallback or re-raise depending on desired behavior
pretty_text = json.dumps({"error": "Failed to create Gemini image Part", "details": str(e)}, indent=2, ensure_ascii=False)
return pretty_text # Exit run_ocr_and_parse for this image
# Call Gemini API - Corrected to use gemini_client.models.generate_content
print(f" - Sending request to Gemini API ({structure_model})...") # Added print statement
try:
response = gemini_client.models.generate_content(
model=structure_model,
contents=gemini_contents # Pass the constructed list
)
except Exception as api_e:
print(f" - ⚠️ Error calling Gemini API: {api_e}")
# Fallback or re-raise
pretty_text = json.dumps({"error": "Failed to call Gemini API", "details": str(api_e)}, indent=2, ensure_ascii=False)
return pretty_text # Exit run_ocr_and_parse for this image
# Extract and clean the JSON response
raw_json_text = response.text.strip()
# Remove potential markdown code fences
if raw_json_text.startswith("```json"):
raw_json_text = raw_json_text[7:]
if raw_json_text.endswith("```"):
raw_json_text = raw_json_text[:-3]
raw_json_text = raw_json_text.strip()
# Validate and format the JSON
try:
parsed_json = json.loads(raw_json_text)
pretty_text = json.dumps(parsed_json, indent=2, ensure_ascii=False)
except json.JSONDecodeError as json_e:
print(f" - ⚠️ Gemini response was not valid JSON: {json_e}")
print(f" - Raw response: {raw_json_text}")
# Fallback: return the raw text wrapped in a basic JSON structure
pretty_text = json.dumps({"error": "Failed to parse Gemini JSON response", "raw_output": raw_json_text}, indent=2, ensure_ascii=False)
elif structure_model.startswith("gpt-"):
print(f" - Using OpenAI model: {structure_model}...")
if not openai_client:
print(" - ⚠️ OpenAI client not initialized. Skipping.")
return json.dumps({"error": "OpenAI client not initialized. Check API key and library installation."}, indent=2, ensure_ascii=False)
# Define the base prompt text for OpenAI
openai_base_prompt = f"""
You are an expert OCR structuring assistant. Your goal is to extract and structure the relevant content into a JSON object based on the provided information.
**Initial OCR Markdown:**
```markdown
{image_ocr_markdown}
```
**Task:**
Generate a JSON object containing the structured OCR content found in the image. Focus on extracting meaningful information and organizing it logically within the JSON. The JSON should represent the `ocr_contents` field.
**Output Format:**
Return ONLY the JSON object, without any surrounding text or markdown formatting. Example:
```json
{{
"title": "Example Title",
"sections": [
{{"header": "Section 1", "content": "Details..."}},
{{"header": "Section 2", "content": "More details..."}}
],
"key_value_pairs": {{
"key1": "value1",
"key2": "value2"
}}
}}
```
(Adapt the structure based on the image content. Ensure the output is valid JSON.)
"""
# Prepare payload for OpenAI vision based on structure_text_only
openai_content_list = []
if structure_text_only:
print(" - Mode: Text-only structuring")
# Modify prompt slightly for text-only
openai_prompt = openai_base_prompt.replace(
"Analyze the provided image and the initial OCR text",
"Analyze the initial OCR text"
).replace(
"content from the image",
"content from the text"
)
openai_content_list.append({"type": "text", "text": openai_prompt})
else:
print(" - Mode: Image + Text structuring")
openai_prompt = openai_base_prompt # Use original prompt
# Use the base64_content string directly for the data URL
# Assuming PNG, might need dynamic type detection
image_data_url = f"data:image/png;base64,{base64_content}" # Corrected indentation
openai_content_list.append({"type": "text", "text": openai_prompt})
openai_content_list.append({
"type": "image_url",
"image_url": {"url": image_data_url, "detail": "auto"},
})
print(f" - Sending request to OpenAI API ({structure_model})...")
try:
response = openai_client.chat.completions.create(
model=structure_model,
messages=[
{
"role": "user",
"content": openai_content_list, # Pass the constructed list
}
],
# Optionally add max_tokens if needed, but rely on prompt for JSON structure
# max_tokens=1000,
temperature=0.1 # Lower temperature for deterministic JSON
)
raw_json_text = response.choices[0].message.content.strip()
# Clean potential markdown fences
if raw_json_text.startswith("```json"):
raw_json_text = raw_json_text[7:]
if raw_json_text.endswith("```"):
raw_json_text = raw_json_text[:-3]
raw_json_text = raw_json_text.strip()
# Validate and format JSON
try:
parsed_json = json.loads(raw_json_text)
pretty_text = json.dumps(parsed_json, indent=2, ensure_ascii=False)
except json.JSONDecodeError as json_e:
print(f" - ⚠️ OpenAI response was not valid JSON: {json_e}")
print(f" - Raw response: {raw_json_text}")
pretty_text = json.dumps({"error": "Failed to parse OpenAI JSON response", "raw_output": raw_json_text}, indent=2, ensure_ascii=False)
except Exception as api_e:
print(f" - ⚠️ Error calling OpenAI API: {api_e}")
pretty_text = json.dumps({"error": "Failed to call OpenAI API", "details": str(api_e)}, indent=2, ensure_ascii=False)
                else:
print(f" - ⚠️ Unsupported structure model: {structure_model}. Skipping structuring.")
# Fallback: return the basic OCR markdown wrapped in JSON
pretty_text = json.dumps({"unstructured_ocr": image_ocr_markdown}, indent=2, ensure_ascii=False)
return pretty_text
try:
# Pass the actual structure model name to the inner function if needed,
# or rely on the outer scope variable 'structure_model' as done here.
result = retry_with_backoff(run_ocr_and_parse, retries=4)
image_ocr_results[(page_idx, img.id)] = result
except Exception as e:
print(f"❌ Failed at page {page_idx+1}, image {i+1}: {e}")
    # Reorganize results by page
    ocr_by_page = {}
    for (page_idx, img_id), ocr_text in image_ocr_results.items():
        ocr_by_page.setdefault(page_idx, {})[img_id] = ocr_text
        print(f" - Collected structured OCR for page {page_idx+1}, image {img_id} ({structure_model}).")
    return ocr_by_page
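# The returned ocr_by_page mapping has the shape {page_idx: {image_id: json_text}}.
# A small sketch (with made-up ids) of feeding it back into insert_ocr_below_images:
def _example_insert_image_ocr():
    ocr_by_page = {0: {"img-0.jpeg": '{"title": "Figure 1"}'}}
    page_md = "Some text\n\n![img-0.jpeg](img-0.jpeg)"
    return insert_ocr_below_images(page_md, ocr_by_page, page_idx=0)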
# ===== Checkpoint Functions =====
def save_checkpoint(data, filename, console_output=None):
"""Save data to a checkpoint file."""
with open(filename, 'wb') as f:
pickle.dump(data, f)
message = f"✅ 已儲存檢查點:{filename}"
    print(message)
    return message
def load_checkpoint(filename, console_output=None):
"""Load data from a checkpoint file."""
if os.path.exists(filename):
with open(filename, 'rb') as f:
data = pickle.load(f)
message = f"✅ 已載入檢查點:{filename}"
print(message)
            return data, message
    return None, None
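# Checkpoint round-trip sketch: any picklable object can be saved and restored.
# The filename is a placeholder.
def _example_checkpoint_roundtrip():
    data = {"pages": ["# Page 1", "# Page 2"]}
    save_checkpoint(data, "demo_checkpoint.pkl")
    restored, msg = load_checkpoint("demo_checkpoint.pkl")
    assert restored == data
    return msg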
# ===== Main Processing Function =====
# Orchestrates OCR, image structuring, translation, and file export; yields log messages and a final result dict.
def process_pdf_to_markdown(
pdf_path,
mistral_client,
gemini_client,
openai_client,
ocr_model="mistral-ocr-latest",
structure_model="pixtral-12b-latest",
structure_text_only=False, # Added structure_text_only
translation_model="gemini-2.0-flash",
translation_system_prompt=None,
process_images=True,
output_formats_selected=None, # New parameter for selected formats
output_dir=None,
checkpoint_dir=None,
use_existing_checkpoints=True
):
"""Main function to process PDF to markdown with translation. Yields log messages."""
if output_formats_selected is None:
output_formats_selected = ["中文翻譯", "英文原文"] # Default if not provided
pdf_file = Path(pdf_path)
filename_stem = pdf_file.stem
# Sanitize the filename stem here as well
sanitized_stem = filename_stem.replace(" ", "_")
print(f"--- 開始處理檔案: {pdf_file.name} (Sanitized Stem: {sanitized_stem}) ---") # Console print
# Output and checkpoint directories are now expected to be set by the caller (Gradio function)
# os.makedirs(output_dir, exist_ok=True) # Ensure caller created it
# os.makedirs(checkpoint_dir, exist_ok=True) # Ensure caller created it
# Checkpoint files - Use sanitized_stem
pdf_ocr_checkpoint = os.path.join(checkpoint_dir, f"{sanitized_stem}_pdf_ocr.pkl")
image_ocr_checkpoint = os.path.join(checkpoint_dir, f"{sanitized_stem}_image_ocr.pkl")
# Checkpoint for raw page data (list of tuples: (raw_markdown_text, images_dict))
raw_page_data_checkpoint = os.path.join(checkpoint_dir, f"{sanitized_stem}_raw_page_data.pkl")
# Step 1: Process PDF with OCR (with checkpoint)
pdf_response = None
load_msg = None
if use_existing_checkpoints:
pdf_response, load_msg = load_checkpoint(pdf_ocr_checkpoint) # Get message
if load_msg: yield load_msg # Yield message
if pdf_response is None:
msg = "🔍 正在處理 PDF OCR..."
yield msg
print(msg) # Console print
pdf_response = process_pdf_with_mistral_ocr(pdf_path, mistral_client, model=ocr_model)
save_msg = save_checkpoint(pdf_response, pdf_ocr_checkpoint) # save_checkpoint already prints
if save_msg: yield save_msg # Yield message
else:
print("ℹ️ 使用現有 PDF OCR 檢查點。")
# Step 2: Process images with OCR (with checkpoint)
ocr_by_page = {}
if process_images:
load_msg = None
if use_existing_checkpoints:
ocr_by_page, load_msg = load_checkpoint(image_ocr_checkpoint) # Get message
if load_msg: yield load_msg # Yield message
if ocr_by_page is None or not ocr_by_page: # Check if empty dict from checkpoint or explicitly empty
msg = f"🖼️ 正在使用 '{structure_model}' 處理圖片 OCR 與結構化..."
yield msg
print(msg) # Console print
# Pass gemini_client and correct structure_model parameter name
ocr_by_page = process_images_with_ocr(
pdf_response,
mistral_client,
gemini_client,
openai_client,
structure_model=structure_model,
structure_text_only=structure_text_only # Pass the text-only flag
)
save_msg = save_checkpoint(ocr_by_page, image_ocr_checkpoint) # save_checkpoint already prints
if save_msg: yield save_msg # Yield message
else:
print("ℹ️ 使用現有圖片 OCR 檢查點。")
else:
print("ℹ️ 跳過圖片 OCR 處理。") # process_images was False
# Step 3: Create or load RAW page data (markdown text + image dicts)
raw_page_data = None # List of tuples: (raw_markdown_text, images_dict)
load_msg = None
if use_existing_checkpoints:
# Try loading the raw page data checkpoint
raw_page_data, load_msg = load_checkpoint(raw_page_data_checkpoint)
if load_msg: yield load_msg
if raw_page_data is None:
msg = "📝 正在建立原始頁面資料 (Markdown + 圖片資訊)..."
yield msg
print(msg)
raw_page_data = []
for page_idx, page in enumerate(pdf_response.pages):
images_dict = {img.id: img.image_base64 for img in page.images}
            raw_md_text = page.markdown # Raw page text, still containing ![image-id](image-id) placeholders
raw_page_data.append((raw_md_text, images_dict)) # Store as tuple
# Save the RAW page data checkpoint
save_msg = save_checkpoint(raw_page_data, raw_page_data_checkpoint)
if save_msg: yield save_msg
else:
print("ℹ️ 使用現有原始頁面資料檢查點。")
# Step 3.5: Conditionally insert image OCR results based on CURRENT UI selection
pages_after_ocr_insertion = [] # List to hold markdown strings after potential OCR insertion
if process_images and ocr_by_page: # Check if UI wants OCR AND if OCR results exist
msg = "✍️ 根據目前設定,正在將圖片 OCR 結果插入 Markdown..."
yield msg
print(msg)
for page_idx, (raw_md, _) in enumerate(raw_page_data): # Iterate through raw data
# Insert OCR results into the raw markdown text BEFORE replacing links
md_with_ocr = insert_ocr_below_images(raw_md, ocr_by_page, page_idx)
pages_after_ocr_insertion.append(md_with_ocr)
else:
# If not inserting OCR, just use the raw markdown text
if process_images and not ocr_by_page:
msg = "ℹ️ 已勾選處理圖片 OCR,但無圖片 OCR 結果可插入 (可能需要重新執行圖片 OCR)。"
yield msg
print(msg)
elif not process_images:
msg = "ℹ️ 未勾選處理圖片 OCR,跳過插入步驟。"
yield msg
print(msg)
# Use the raw markdown text directly
pages_after_ocr_insertion = [raw_md for raw_md, _ in raw_page_data]
# Step 3.6: Save images and replace links in the (potentially modified) markdown
final_markdown_pages = [] # This list will have final file paths as links
# Use sanitized_stem for image folder name
image_folder_name = os.path.join(output_dir, f"images_{sanitized_stem}")
msg = f"🖼️ 正在儲存圖片並更新 Markdown 連結至 '{os.path.basename(image_folder_name)}'..."
yield msg
print(msg)
# Iterate using the pages_after_ocr_insertion list and the original image dicts from raw_page_data
for page_idx, (md_to_link, (_, images_dict)) in enumerate(zip(pages_after_ocr_insertion, raw_page_data)):
# Now save images and replace links on the processed markdown (which might have OCR inserted)
final_md = save_images_and_replace_links(md_to_link, images_dict, page_idx, image_folder=image_folder_name)
final_markdown_pages.append(final_md)
# Step 4: Translate the final markdown pages
translated_markdown_pages = None # Initialize
need_translation = "中文翻譯" in output_formats_selected
if need_translation:
# Translate the final list with correct image links, passing both clients
translation_generator = translate_markdown_pages(
final_markdown_pages,
gemini_client,
openai_client, # Pass openai_client
model=translation_model,
system_instruction=translation_system_prompt
)
# Collect yielded pages from the translation generator
translated_markdown_pages = [] # Initialize list to store results
for item in translation_generator:
# Check if it's a progress string or actual content/error
# Simple check: assume non-empty strings starting with specific emojis are progress/status
if isinstance(item, str) and (item.startswith("🔁") or item.startswith("⚠️") or item.startswith("✅")):
yield item # Forward progress/status string
else:
# Assume it's translated content or an error marker page
translated_markdown_pages.append(item)
else:
yield "ℹ️ 跳過翻譯步驟 (未勾選中文翻譯)。"
print("ℹ️ 跳過翻譯步驟 (未勾選中文翻譯)。")
translated_markdown_pages = None # Ensure it's None if skipped
# Step 5: Combine pages into complete markdown strings
# The "original" output now correctly reflects the final state before translation
final_markdown_original = "\n\n---\n\n".join(final_markdown_pages) # Use the final pages with links
final_markdown_translated = "\n\n---\n\n".join(translated_markdown_pages) if translated_markdown_pages else None
# Step 6: Save files based on selection - Use sanitized_stem
saved_files = {}
if "英文原文" in output_formats_selected:
original_md_name = os.path.join(output_dir, f"{sanitized_stem}_original.md")
try:
with open(original_md_name, "w", encoding="utf-8") as f:
f.write(final_markdown_original)
msg = f"✅ 已儲存原文版:{original_md_name}"
yield msg
print(msg) # Console print
saved_files["original_file"] = original_md_name
except Exception as e:
msg = f"❌ 儲存原文版失敗: {e}"
yield msg
print(msg)
if "中文翻譯" in output_formats_selected and final_markdown_translated:
translated_md_name = os.path.join(output_dir, f"{sanitized_stem}_translated.md")
try:
with open(translated_md_name, "w", encoding="utf-8") as f:
f.write(final_markdown_translated)
msg = f"✅ 已儲存翻譯版:{translated_md_name}"
yield msg
print(msg) # Console print
saved_files["translated_file"] = translated_md_name
except Exception as e:
msg = f"❌ 儲存翻譯版失敗: {e}"
yield msg
print(msg)
# Always report image folder path if it was created (i.e., if images existed and were saved)
# The folder creation happens in save_images_and_replace_links
image_folder_name = os.path.join(output_dir, f"images_{sanitized_stem}")
if os.path.isdir(image_folder_name): # Check if the folder actually exists
msg = f"✅ 圖片資料夾:{image_folder_name}"
yield msg
print(msg) # Console print
saved_files["image_folder"] = image_folder_name
# else: # Optional: Log if folder wasn't created (e.g., PDF had no images)
# msg = f"ℹ️ PDF 文件不包含圖片,未建立圖片資料夾。"
# yield msg
# print(msg)
print(f"--- 完成處理檔案: {pdf_file.name} ---") # Console print
# Return the final result dictionary for Gradio UI update
yield {
"saved_files": saved_files, # Dictionary of saved file paths
"translated_content": final_markdown_translated,
"original_content": final_markdown_original,
"output_formats_selected": output_formats_selected # Pass back selections
}
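# A headless driving sketch for process_pdf_to_markdown (a generator that yields log
# strings and, at the end, a result dict). The environment variable names used here are
# assumptions; the Gradio UI below passes the keys in directly instead.
def _example_headless_run(pdf_path, output_dir, checkpoint_dir):
    mistral_client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
    gemini_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)
    result = None
    for item in process_pdf_to_markdown(
        pdf_path,
        mistral_client,
        gemini_client,
        openai_client=None,
        output_dir=output_dir,
        checkpoint_dir=checkpoint_dir,
    ):
        if isinstance(item, dict):
            result = item  # final result dict with saved file paths and contents
        else:
            print(item)    # progress / status message
    return result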
# ===== Gradio Interface =====
def create_gradio_interface():
"""Create a Gradio interface for the PDF to Markdown tool."""
    # API clients are initialized inside process_pdf, using the keys entered in the UI
# Define processing function for Gradio
def process_pdf( # Updated signature to accept API keys and return file paths + log
pdf_file,
# API Keys from UI
mistral_api_key_input,
gemini_api_key_input,
openai_api_key_input,
# Other parameters
ocr_model,
structure_model,
translation_model,
translation_system_prompt,
process_images,
output_format, # CheckboxGroup list
use_existing_checkpoints,
structure_text_only
    ):  # yields: (original_md_path, translated_md_path, images_zip_path, original_preview, translated_preview, log)
# Accumulate logs for console output
log_accumulator = ""
mistral_client = None
gemini_client = None
openai_client = None
print("\n--- Gradio 處理請求開始 ---") # Console print
# Placeholders for file outputs and log
output_original_md_path = None
output_translated_md_path = None
output_images_zip_path = None
# --- Early Exit Checks ---
if pdf_file is None:
log_accumulator += "❌ 請先上傳 PDF 檔案\n"
print("❌ 錯誤:未上傳 PDF 檔案")
# Return Nones for files/previews and the error log (6 values total)
yield None, None, None, None, None, "❌ 錯誤:未上傳 PDF 檔案\n" + log_accumulator
return
# --- API Key and Client Initialization ---
log_accumulator += "🔑 正在初始化 API Clients...\n"
# Yield updates for the log output only (6 values total)
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
# Mistral (Required)
if not mistral_api_key_input:
log_accumulator += "❌ 錯誤:請務必提供 Mistral API Key。\n"
print("❌ 錯誤:未提供 Mistral API Key")
# Yield Nones for files/previews and the error log (6 values total)
yield None, None, None, None, None, log_accumulator
return
try:
mistral_client = Mistral(api_key=mistral_api_key_input)
log_accumulator += "✅ Mistral Client 初始化成功。\n"
print("✅ Mistral Client initialized.")
except Exception as e:
log_accumulator += f"❌ 初始化 Mistral Client 失敗: {e}\n"
print(f"❌ Error initializing Mistral Client: {e}")
# Yield Nones for files/previews and the error log (6 values total)
yield None, None, None, None, None, log_accumulator
return
# Gemini (Optional, depends on model selection later)
if gemini_api_key_input:
try:
gemini_client = genai.Client(api_key=gemini_api_key_input)
log_accumulator += "✅ Gemini Client 初始化成功。\n"
print("✅ Gemini Client initialized.")
except Exception as e:
log_accumulator += f"⚠️ 初始化 Gemini Client 失敗 (若未使用 Gemini 模型可忽略): {e}\n"
print(f"⚠️ Error initializing Gemini Client (ignore if not using Gemini models): {e}")
gemini_client = None # Ensure it's None if init fails
else:
log_accumulator += "ℹ️ 未提供 Gemini API Key,將無法使用 Gemini 模型。\n"
print("ℹ️ Gemini API Key not provided.")
gemini_client = None
# OpenAI (Optional, depends on model selection later)
if openai_api_key_input and OpenAI:
try:
openai_client = OpenAI(api_key=openai_api_key_input)
log_accumulator += "✅ OpenAI Client 初始化成功。\n"
print("✅ OpenAI Client initialized.")
except Exception as e:
log_accumulator += f"⚠️ 初始化 OpenAI Client 失敗 (若未使用 OpenAI 模型可忽略): {e}\n"
print(f"⚠️ Error initializing OpenAI Client (ignore if not using OpenAI models): {e}")
openai_client = None # Ensure it's None if init fails
elif not OpenAI:
log_accumulator += "ℹ️ OpenAI library 未安裝,無法使用 OpenAI 模型。\n"
print("ℹ️ OpenAI library not installed.")
openai_client = None
else:
log_accumulator += "ℹ️ 未提供 OpenAI API Key,將無法使用 OpenAI 模型。\n"
print("ℹ️ OpenAI API Key not provided.")
openai_client = None
# --- End API Key and Client Initialization ---
if not output_format:
log_accumulator += "❌ 請至少選擇一種輸出格式(中文翻譯 或 英文原文)\n"
print("❌ 錯誤:未選擇輸出格式")
# Yield Nones for files/previews and the error log (6 values total)
yield None, None, None, None, None, "❌ 錯誤:未選擇輸出格式\n" + log_accumulator
return
pdf_path_obj = Path(pdf_file.name) # Use pdf_file.name for Path object with temp files
filename_stem = pdf_path_obj.stem
# Sanitize the filename stem (replace spaces with underscores)
sanitized_stem = filename_stem.replace(" ", "_")
print(f"收到檔案: {pdf_path_obj.name} (Sanitized Stem: {sanitized_stem})") # Console print
print(f"選擇的輸出格式: {output_format}")
# --- Output Directory Logic (Using Temp Dir for Gradio Compatibility) ---
try:
# Create a unique temporary directory for this run's outputs
# This directory will be inside Gradio's allowed paths (/tmp)
temp_base_dir = tempfile.mkdtemp()
output_dir = os.path.join(temp_base_dir, "outputs") # Subdir for final files
checkpoint_dir = os.path.join(temp_base_dir, f"checkpoints_{sanitized_stem}") # Subdir for checkpoints
os.makedirs(output_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)
log_accumulator += f"📂 使用暫存輸出目錄: {output_dir}\n"
log_accumulator += f"💾 使用暫存檢查點目錄: {checkpoint_dir}\n"
print(f"Using temporary output directory: {output_dir}")
print(f"Using temporary checkpoint directory: {checkpoint_dir}")
except Exception as e:
error_msg = f"❌ 無法建立暫存目錄: {e}"
log_accumulator += f"{error_msg}\n"
print(f"❌ 錯誤:{error_msg}")
# Yield Nones for files/previews and the error log (6 values total)
yield None, None, None, None, None, f"❌ 錯誤:{error_msg}\n" + log_accumulator
return
# --- End Output Directory Logic ---
# --- Initial Log Messages ---
# Yield updates for the log output only (6 values total)
log_accumulator += f"🚀 開始處理 PDF: {pdf_path_obj.name}\n"
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
# Log the temp dirs being used
log_accumulator += f"📂 使用暫存輸出目錄: {output_dir}\n" # Added log message back
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
log_accumulator += f"💾 使用暫存檢查點目錄: {checkpoint_dir}\n" # Added log message back
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
# Determine if translation is needed based on CheckboxGroup selection
# The 'translate' checkbox is now less relevant, primary control is output_format
need_translation_for_processing = "中文翻譯" in output_format
log_accumulator += "✅ 將產生中文翻譯\n" if need_translation_for_processing else "ℹ️ 不產生中文翻譯 (未勾選)\n"
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
log_accumulator += "✅ 使用現有檢查點(如果存在)\n" if use_existing_checkpoints else "🔄 重新處理所有步驟(不使用現有檢查點)\n"
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
print(f"需要翻譯: {need_translation_for_processing}, 使用檢查點: {use_existing_checkpoints}")
# --- Main Processing ---
try:
# process_pdf_to_markdown is a generator, iterate through its yields
processor = process_pdf_to_markdown(
                pdf_path=pdf_path_obj,  # Pass the resolved file path (the Gradio upload object itself is not a path)
mistral_client=mistral_client,
gemini_client=gemini_client,
openai_client=openai_client,
ocr_model=ocr_model,
structure_model=structure_model,
structure_text_only=structure_text_only, # Pass text-only flag
translation_model=translation_model,
translation_system_prompt=translation_system_prompt if translation_system_prompt.strip() else None,
process_images=process_images,
output_formats_selected=output_format, # Pass selected formats
output_dir=output_dir,
checkpoint_dir=checkpoint_dir,
use_existing_checkpoints=use_existing_checkpoints
)
result_data = None
# Iterate through the generator from process_pdf_to_markdown
for item in processor:
if isinstance(item, dict): # Check if it's the final result dict
result_data = item
# Don't yield the dict itself to the log
elif isinstance(item, str):
# Append and yield intermediate logs (6 values total)
log_accumulator += f"{item}\n"
# Yield updates for the log output only
yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), log_accumulator
# Handle potential other types if necessary, otherwise ignore
# --- Process Final Result for UI ---
# This part runs after the processor generator is exhausted
if result_data:
saved_files_dict = result_data.get("saved_files", {})
output_original_md_path = saved_files_dict.get("original_file")
output_translated_md_path = saved_files_dict.get("translated_file")
image_folder_path = saved_files_dict.get("image_folder") # Gets the folder path
# Zip the image folder only if the path exists and it's a directory
if image_folder_path and os.path.isdir(image_folder_path):
log_accumulator += f"ℹ️ 找到圖片資料夾: {image_folder_path},嘗試壓縮...\n"
print(f"ℹ️ Found image folder: {image_folder_path}, attempting to zip...")
zip_base_name = image_folder_path # Use folder name as base for zip path
try:
# Ensure the target zip path doesn't conflict if run multiple times in same temp dir context (though mkdtemp should prevent this)
output_images_zip_path = shutil.make_archive(zip_base_name, 'zip', root_dir=os.path.dirname(image_folder_path), base_dir=os.path.basename(image_folder_path))
log_accumulator += f"✅ 已成功壓縮圖片資料夾:{output_images_zip_path}\n"
print(f"✅ Successfully zipped images: {output_images_zip_path}")
except Exception as zip_e:
error_msg = f"⚠️ 壓縮圖片資料夾 '{image_folder_path}' 失敗: {zip_e}"
log_accumulator += f"{error_msg}\n"
print(error_msg)
output_images_zip_path = None # Ensure it's None if zipping failed
else:
# Explicitly log if image folder wasn't found or isn't a directory
if image_folder_path: # Path exists but not a dir
log_accumulator += f"ℹ️ 找到圖片資料夾路徑,但 '{image_folder_path}' 不是有效的資料夾。無法壓縮。\n"
print(f"ℹ️ Image folder path found but not a directory: {image_folder_path}. Cannot zip.")
else: # Path not found in saved_files (likely no images in PDF or folder wasn't saved)
log_accumulator += f"ℹ️ 未找到圖片資料夾路徑 (可能 PDF 無圖片或未儲存)。無法壓縮。\n"
print(f"ℹ️ Image folder path not found in saved_files (likely no images in PDF or folder not saved). Cannot zip.")
output_images_zip_path = None # Ensure it's None
final_log_message = "✅ 處理完成!請查看預覽視窗,或至下載檔案視窗下載檔案。" # Updated message
log_accumulator += f"{final_log_message}\n"
print(f"--- Gradio 處理請求完成 ---")
else:
final_log_message = "⚠️ 處理完成,但未收到預期的結果字典。"
log_accumulator += f"{final_log_message}\n"
print(f"⚠️ 警告:{final_log_message}")
# Final yield: provide paths for file outputs, markdown content for previews, and the final log
            yield (
                output_original_md_path,
                output_translated_md_path,
                output_images_zip_path,
                result_data.get("original_content", "無原文內容可預覽") if result_data else "無原文內容可預覽",  # Content for original preview
                result_data.get("translated_content", "無翻譯內容可預覽") if result_data else "無翻譯內容可預覽",  # Content for translated preview
                log_accumulator
            )
except Exception as e:
error_message = f"❌ Gradio 處理過程中發生未預期錯誤: {str(e)}"
log_accumulator += f"{error_message}\n"
print(f"❌ 嚴重錯誤:{error_message}")
import traceback
traceback.print_exc() # Print full traceback to console
# Final yield in case of error: provide Nones for files/previews and the error log (6 values total)
yield None, None, None, None, None, log_accumulator
# Create Gradio interface
with gr.Blocks(title="Mistral OCR & Translation Tool") as demo:
gr.Markdown("""
# Mistral OCR & 翻譯工具
Convert PDF files to Markdown with OCR and English-to-Chinese translation, powered by Mistral, Gemini, and OpenAI.
將 PDF 文件轉為 Markdown 格式,支援圖片 OCR 和英文到繁體中文翻譯,使用 Mistral、Gemini 和 OpenAI 模型。
""")
with gr.Row():
with gr.Column(scale=1):
pdf_file = gr.File(label="上傳 PDF 檔案", file_types=[".pdf"])
with gr.Accordion("基本設定", open=True):
# Define default path for placeholder clarity
default_output_path_display = os.path.join("桌面", "MistralOCR_Output") # Simplified for display
# Output directory is now handled internally using tempfile, remove UI element
# output_dir = gr.Textbox(
# label="輸出目錄 (請貼上完整路徑)",
# placeholder=f"留空預設儲存至:{default_output_path_display}",
# info="將所有輸出檔案 (Markdown, 圖片, 檢查點) 儲存於此目錄。",
# value="" # Default logic remains in process_pdf
# )
use_existing_checkpoints = gr.Checkbox(
label="使用現有檢查點(如果存在)",
value=True,
info="啟用後,如果檢查點存在,將跳過已完成的步驟。"
)
output_format = gr.CheckboxGroup(
label="輸出格式 (可多選)",
choices=["中文翻譯", "英文原文"],
value=["中文翻譯", "英文原文"], # Default to both
info="選擇您需要儲存的 Markdown 檔案格式。"
)
with gr.Accordion("API Keys (請自行填入)", open=True):
mistral_api_key_input = gr.Textbox(
label="Mistral API Key",
type="password",
placeholder="請貼上你的 Mistral API Key",
info="(必要) 用於 PDF 和圖片 OCR。請從 https://console.mistral.ai/ 獲取。此金鑰僅用於本次處理,不會儲存。"
)
gemini_api_key_input = gr.Textbox(
label="Gemini API Key",
type="password",
placeholder="請貼上你的 Gemini API Key",
info="(推薦) 若選擇 Gemini 模型進行翻譯或結構化,則需要。請從 https://aistudio.google.com/app/apikey 獲取。此金鑰僅用於本次處理,不會儲存。"
)
openai_api_key_input = gr.Textbox(
label="OpenAI API Key",
type="password",
placeholder="請貼上你的 OpenAI API Key",
info="(可選) 若選擇 GPT 模型進行翻譯或結構化,則需要。請從 https://platform.openai.com/api-keys 獲取。此金鑰僅用於本次處理,不會儲存。"
)
with gr.Accordion("處理選項", open=False):
process_images = gr.Checkbox(
label="處理圖片 OCR",
value=True,
info="啟用後,將對 PDF 中的圖片額外進行 OCR 辨識"
)
with gr.Accordion("模型設定", open=True):
ocr_model = gr.Dropdown(
label="OCR 模型",
choices=["mistral-ocr-latest"],
value="mistral-ocr-latest"
)
structure_model = gr.Dropdown(
label="結構化模型 (用於圖片 OCR)",
choices=[
("pixtral-12b-latest (Recommend)", "pixtral-12b-latest"),
("gemini-2.0-flash (Recommend)", "gemini-2.0-flash"),
("gpt-4o-mini", "gpt-4o-mini"),
("gpt-4o", "gpt-4o"),
("gpt-4.1-nano (Not Recommend)", "gpt-4.1-nano"),
("gpt-4.1-mini", "gpt-4.1-mini"),
("gpt-4.1", "gpt-4.1")
],
value="gemini-2.0-flash",
info="選擇用於結構化圖片 OCR 結果的模型。需要對應的 API Key。"
)
structure_text_only = gr.Checkbox(
label="僅用文字進行結構化 (節省 Token)",
value=False,
info="勾選後,僅將圖片的初步 OCR 文字傳送給 Gemini 或 OpenAI 進行結構化,不傳送圖片本身。對 Pixtral 無效。⚠️注意:缺少圖片視覺資訊可能導致結構化效果不佳,建議僅在 OCR 文字已足夠清晰時使用。"
)
translation_model = gr.Dropdown(
label="翻譯模型",
choices=[
("gemini-2.0-flash (Recommend)", "gemini-2.0-flash"),
("gemini-2.5-pro-exp-03-25", "gemini-2.5-pro-exp-03-25"),
("gemini-2.0-flash-lite", "gemini-2.0-flash-lite"),
("gpt-4o", "gpt-4o"),
("gpt-4o-mini", "gpt-4o-mini"),
("gpt-4.1-nano (Not Recommend)", "gpt-4.1-nano"),
("gpt-4.1-mini", "gpt-4.1-mini"),
("gpt-4.1", "gpt-4.1")
],
value="gemini-2.0-flash",
info="選擇用於翻譯的模型。需要對應的 API Key。"
)
with gr.Accordion("進階設定", open=False):
translation_system_prompt = gr.Textbox(
label="翻譯系統提示詞",
value=DEFAULT_TRANSLATION_SYSTEM_INSTRUCTION,
lines=10
)
process_button = gr.Button("開始處理", variant="primary")
with gr.Column(scale=2):
with gr.Tab("處理日誌"):
console_output = gr.Textbox(
label="處理進度",
lines=20,
max_lines=50,
interactive=False,
autoscroll=True
)
with gr.Tab("使用說明"):
gr.Markdown("""
# 使用說明
1. 上傳 PDF 檔案(可拖曳或點擊上傳)
2. 輸入 Mistral API 金鑰(必要)及 Gemini/OpenAI 金鑰(可選)
3. 基本設定:
- 選擇是否使用現有檢查點(預設啟用)
- 選擇輸出格式(中文翻譯、英文原文,可多選)
4. 處理選項:
- 選擇是否處理圖片 OCR(預設啟用)
5. 模型與進階設定(可選):
- 選擇 OCR、結構化、翻譯模型
- 修改翻譯提示詞(若需其他語言)
6. 點擊「開始處理」按鈕
7. 於「處理日誌」標籤查看進度,完成後從「下載檔案」標籤下載結果
## 檢查點說明
- **PDF OCR 檢查點**:儲存 PDF 的 OCR 結果
- **圖片 OCR 檢查點**:儲存圖片的 OCR 結構化結果
- 若需重新處理,可取消勾選「使用現有檢查點」
## 輸出檔案
- `[檔名]_original.md`:英文原文 Markdown
- `[檔名]_translated.md`:繁體中文翻譯 Markdown
- `images_[檔名].zip`:PDF 中提取的圖片
## API 使用量參考(粗略估計)
以下為兩個實際測試場景的 API 使用情況,可供預估大致耗用量:
### 測試場景一(Gemini 全流程)
- **PDF 範例**:Jones & Bergen (2025) 論文前 3 頁(含 1 張圖片)
- **Mistral OCR**:消耗約 **4 Pages**(含圖片額外一次處理)
- **Gemini 2.0 Flash**:
- 結構化 + 翻譯(單模型)
- 輸入 Token 約 **7,300 Tokens**
### 測試場景二(分開處理:Gemini 結構化 + GPT-4o Mini 翻譯)
- **PDF 範例**:同一份 3 頁英文文件(含圖片)
- **Mistral OCR**:消耗約 **4 Pages**
- **Gemini 2.0 Flash**(僅做結構化):
- 輸入 Token 約 **2,357 Tokens**
- **GPT-4o Mini**(做翻譯):
- 輸入 Token 約 **4,440 Tokens**
> **注意**:實際耗用量會根據 PDF 頁數、內容密度、圖片比例與翻譯範圍有所不同,以上數據僅供參考。
測試樣本之一引用:
Jones, C. R., & Bergen, B. K. (2025). *Large Language Models Pass the Turing Test*. *arXiv preprint* [arXiv:2503.23674](https://arxiv.org/abs/2503.23674)
本測試僅借用該論文前 3 頁作為輸入範例進行處理流程測試,未轉載、修改或散佈其內容。
""")
with gr.Tab("預覽原文"): # New Tab for Original Preview
preview_original_md = gr.Markdown(label="預覽原文 Markdown")
with gr.Tab("預覽翻譯"): # New Tab for Translated Preview
preview_translated_md = gr.Markdown(label="預覽翻譯 Markdown")
with gr.Tab("下載檔案"): # Changed Tab name
# Add File output components for downloads
output_original_md = gr.File(label="下載原文 Markdown (.md)")
output_translated_md = gr.File(label="下載翻譯 Markdown (.md)")
output_images_zip = gr.File(label="下載圖片 (.zip)")
with gr.Tab("關於"): # 新增標籤
gr.Markdown("""
## 關於 Mistral OCR 翻譯工具
本工具由 **David Chang** 開發,旨在將 PDF 文件轉換為 Markdown 格式,支援圖片 OCR 和英文到繁體中文的翻譯。整合以下技術:
- **Mistral AI**:PDF 和圖片 OCR
- **Google Gemini / OpenAI**:翻譯與結構化
- **Gradio**:互動式網頁介面
### 版權與授權
- **作者**:David Chang
- **版權**:© 2025 David Chang
- **授權**:MIT 授權,詳見 [LICENSE](https://github.com/dodo13114arch/mistralocr-pdf2md-translator/blob/main/LICENSE)
- **GitHub**:https://github.com/dodo13114arch/mistralocr-pdf2md-translator
### 感謝
感謝 Mistral AI、Google Gemini、OpenAI 和 Gradio 提供的技術支持,以及 Mistral 官方範例的啟發 ([Colab Notebook](https://colab.research.google.com/github/mistralai/cookbook/blob/main/mistral/ocr/structured_ocr.ipynb))。
### 聯繫與反饋
歡迎在 GitHub 上提交問題或建議!
""")
# Define outputs for the click event
# Order must match the final yield in process_pdf:
# file_orig, file_trans, file_zip, preview_orig, preview_trans, console_log
outputs_list = [
output_original_md,
output_translated_md,
output_images_zip,
preview_original_md, # Added output for original preview
preview_translated_md, # Added output for translated preview
console_output
]
# Define inputs for the click event (remove console_output)
inputs_list=[
pdf_file,
# API Key Inputs
mistral_api_key_input,
gemini_api_key_input,
openai_api_key_input,
# Other parameters
ocr_model,
structure_model,
translation_model,
translation_system_prompt,
process_images,
# translate, # Removed
output_format, # CheckboxGroup list
use_existing_checkpoints,
structure_text_only
]
# Use process_button.click with the generator function
process_button.click(
fn=process_pdf,
inputs=inputs_list,
outputs=outputs_list
)
        # Optional: exit the script when the UI is closed/unloaded
        # demo.unload(fn=lambda: os._exit(0))
gr.Markdown("""
---
**免責聲明**
本工具僅供學習與研究用途,整合 Mistral、Google Gemini 和 OpenAI API。請確保:
- 您擁有合法的 API 金鑰,並遵守各服務條款([Mistral](https://mistral.ai/terms)、[Gemini](https://ai.google.dev/terms)、[OpenAI](https://openai.com/policies))。
- 上傳的 PDF 文件符合版權法規,您有權進行處理。
- 翻譯結果可能有誤,請自行驗證。
本工具不儲存任何上傳檔案或 API 金鑰,所有處理均在暫存環境中完成。
**版權資訊**
Copyright © 2025 David Chang. 根據 MIT 授權發布,詳見 [LICENSE](https://github.com/dodo13114arch/mistralocr-pdf2md-translator/blob/main/LICENSE)。
GitHub: https://github.com/dodo13114arch/mistralocr-pdf2md-translator
""")
return demo
# ===== Main Execution =====
if __name__ == "__main__":
# Create and launch Gradio interface
demo = create_gradio_interface()
demo.launch()