wony617 committed on
Commit
ef58ab4
·
1 Parent(s): 159b6fa

Add open PR checking & Add prompt injection in translation step

Browse files
agent/handler.py CHANGED
@@ -13,6 +13,7 @@ from agent.workflow import (
13
  generate_github_pr,
14
  )
15
  from pr_generator.searcher import find_reference_pr_simple_stream
 
16
 
17
 
18
  # State management
@@ -22,6 +23,7 @@ class ChatState:
22
  self.target_language = "ko"
23
  self.k_files = 10
24
  self.files_to_translate = []
 
25
  self.current_file_content = {"translated": ""}
26
  self.pr_result = None # Store PR creation result
27
  # GitHub configuration
@@ -127,8 +129,8 @@ def start_translation_process():
127
 
128
  # Call translation function (simplified for demo)
129
  try:
130
- status, translated = translate_docs_interactive(
131
- state.target_language, [[current_file]]
132
  )
133
 
134
  state.current_file_content = {"translated": translated}
@@ -152,10 +154,15 @@ def start_translation_process():
152
  ""
153
  f"{original_file_link}\n"
154
  "**🌐 Translated Content:**\n"
155
- f"\n```\n\n{_extract_content_for_display(translated)}\n```"
 
 
 
156
  # f"{status}\n"
157
  # "✅ Translation completed. The code block will be added when generating PR."
158
  )
 
 
159
 
160
  except Exception as e:
161
  response = f"❌ Translation failed: {str(e)}"
@@ -211,12 +218,14 @@ def handle_user_message(message, history):
211
  # User wants to start translation
212
  if state.files_to_translate:
213
  state.step = "translate"
214
- response = start_translation_process()
 
 
 
215
  else:
216
  response = (
217
  "❌ No files available for translation. Please search for files first."
218
  )
219
-
220
  # Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
221
  else:
222
  # General response
@@ -308,15 +317,43 @@ def update_github_config(token, owner, repo, reference_pr_url):
308
  return f"✅ GitHub configuration updated: {owner}/{repo}"
309
 
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  def send_message(message, history):
312
  new_history, cleared_input = handle_user_message(message, history)
313
  return new_history, cleared_input, update_status()
314
 
315
 
316
  # Button handlers with tab switching
317
- def start_translate_handler(history, anthropic_key, file_to_translate):
318
  os.environ["ANTHROPIC_API_KEY"] = anthropic_key
319
-
 
320
  state.files_to_translate = [file_to_translate]
321
  new_hist, cleared_input = handle_user_message("start translation", history)
322
  selected_tabs = 2 if state.current_file_content["translated"] else 0
 
13
  generate_github_pr,
14
  )
15
  from pr_generator.searcher import find_reference_pr_simple_stream
16
+ from translator.content import get_full_prompt, get_content, preprocess_content
17
 
18
 
19
  # State management
 
23
  self.target_language = "ko"
24
  self.k_files = 10
25
  self.files_to_translate = []
26
+ self.additional_instruction = ""
27
  self.current_file_content = {"translated": ""}
28
  self.pr_result = None # Store PR creation result
29
  # GitHub configuration
 
129
 
130
  # Call translation function (simplified for demo)
131
  try:
132
+ translated = translate_docs_interactive(
133
+ state.target_language, [[current_file]], state.additional_instruction
134
  )
135
 
136
  state.current_file_content = {"translated": translated}
 
154
  ""
155
  f"{original_file_link}\n"
156
  "**🌐 Translated Content:**\n"
157
+ # f"\n```\n\n{_extract_content_for_display(translated)}\n```"
158
+ # "\n```\n\n"
159
+ # f"\n{translated}\n"
160
+ # f"```"
161
  # f"{status}\n"
162
  # "✅ Translation completed. The code block will be added when generating PR."
163
  )
164
+ return response, translated
165
+
166
 
167
  except Exception as e:
168
  response = f"❌ Translation failed: {str(e)}"
 
218
  # User wants to start translation
219
  if state.files_to_translate:
220
  state.step = "translate"
221
+ response, translated = start_translation_process()
222
+ history.append([message, response])
223
+ history.append(["", translated])
224
+ return history, ""
225
  else:
226
  response = (
227
  "❌ No files available for translation. Please search for files first."
228
  )
 
229
  # Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
230
  else:
231
  # General response
 
317
  return f"✅ GitHub configuration updated: {owner}/{repo}"
318
 
319
 
320
def update_prompt_preview(language, file_path, additional_instruction):
    """Build a preview of the translation prompt for the current settings.

    Args:
        language: Target language code. "ko" is expanded to "Korean"; any
            other value is passed to the prompt unchanged.
        file_path: Path of the document to preview. ``None``, empty, or
            whitespace-only values yield a placeholder message.
        additional_instruction: Optional extra instructions appended to the
            prompt. ``None`` is treated as an empty string.

    Returns:
        The generated prompt, a placeholder when no file is selected, or an
        error message string if building the prompt fails.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise before the try block below could turn it into a message.
    # NOTE(review): presumably a cleared UI component can deliver None - confirm.
    if not file_path or not file_path.strip():
        return "Select a file to see the prompt preview..."

    try:
        translation_lang = "Korean" if language == "ko" else language

        content = get_content(file_path)
        to_translate = preprocess_content(content)

        # Only the first 500 characters are shown so the preview stays short.
        sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")

        return get_full_prompt(
            translation_lang, sample_content, additional_instruction or ""
        )
    except Exception as e:
        # UI boundary: report the failure in the preview box instead of raising.
        return f"Error generating prompt preview: {str(e)}"
345
+
346
+
347
  def send_message(message, history):
348
  new_history, cleared_input = handle_user_message(message, history)
349
  return new_history, cleared_input, update_status()
350
 
351
 
352
  # Button handlers with tab switching
353
+ def start_translate_handler(history, anthropic_key, file_to_translate, additional_instruction=""):
354
  os.environ["ANTHROPIC_API_KEY"] = anthropic_key
355
+
356
+ state.additional_instruction = additional_instruction
357
  state.files_to_translate = [file_to_translate]
358
  new_hist, cleared_input = handle_user_message("start translation", history)
359
  selected_tabs = 2 if state.current_file_content["translated"] else 0
agent/workflow.py CHANGED
@@ -51,7 +51,7 @@ def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[s
51
  return status_report, docs
52
 
53
 
54
- def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
55
  """Translate documentation."""
56
  # step 1. Get content from file path
57
  content = get_content(file_path)
@@ -60,7 +60,7 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
60
  # step 2. Prepare prompt with docs content
61
  if lang == "ko":
62
  translation_lang = "Korean"
63
- to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)
64
 
65
  print("to_translate_with_prompt:\n", to_translate_with_prompt)
66
 
@@ -77,8 +77,8 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
77
 
78
 
79
  def translate_docs_interactive(
80
- translate_lang: str, selected_files: list[list[str]]
81
- ) -> tuple[str, str, str]:
82
  """Interactive translation function that processes files one by one.
83
 
84
  Args:
@@ -87,27 +87,17 @@ def translate_docs_interactive(
87
  """
88
  # Extract file paths from the dataframe format
89
  file_paths = [row[0] for row in selected_files if row and len(row) > 0]
90
- if not file_paths:
91
- return (
92
- "No files selected for translation.",
93
- gr.update(visible=False),
94
- gr.update(visible=False),
95
- gr.update(visible=False),
96
- [],
97
- 0,
98
- )
99
 
100
  # Start with the first file
101
  current_file = file_paths[0]
102
 
103
  status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
104
- callback_result, translated_content = translate_docs(translate_lang, current_file)
105
  status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
106
 
107
- if len(file_paths) > 1:
108
- status += f"\n### 📝 Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version"
109
 
110
- return status, translated_content
111
 
112
 
113
  def generate_github_pr(
 
51
  return status_report, docs
52
 
53
 
54
+ def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
55
  """Translate documentation."""
56
  # step 1. Get content from file path
57
  content = get_content(file_path)
 
60
  # step 2. Prepare prompt with docs content
61
  if lang == "ko":
62
  translation_lang = "Korean"
63
+ to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
64
 
65
  print("to_translate_with_prompt:\n", to_translate_with_prompt)
66
 
 
77
 
78
 
79
  def translate_docs_interactive(
80
+ translate_lang: str, selected_files: list[list[str]], additional_instruction: str = ""
81
+ ) -> tuple[str, str]:
82
  """Interactive translation function that processes files one by one.
83
 
84
  Args:
 
87
  """
88
  # Extract file paths from the dataframe format
89
  file_paths = [row[0] for row in selected_files if row and len(row) > 0]
 
 
 
 
 
 
 
 
 
90
 
91
  # Start with the first file
92
  current_file = file_paths[0]
93
 
94
  status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
95
+ callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction)
96
  status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
97
 
98
+ print(status)
 
99
 
100
+ return translated_content
101
 
102
 
103
  def generate_github_pr(
app.py CHANGED
@@ -14,6 +14,7 @@ from agent.handler import (
14
  send_message,
15
  start_translate_handler,
16
  sync_language_displays,
 
17
  update_status,
18
  update_github_config,
19
  )
@@ -30,7 +31,7 @@ css = """
30
  background: rgba(255, 255, 180, 0.25);
31
  border-radius: 18px;
32
  box-shadow: 0 4px 24px rgba(0,0,0,0.08);
33
- padding: 1.5em;
34
  backdrop-filter: blur(8px);
35
  border: 1px solid rgba(255,255,180,0.25);
36
  width: 100%;
@@ -40,7 +41,7 @@ css = """
40
  background: rgba(255, 255, 180, 0.25);
41
  border-radius: 18px;
42
  box-shadow: 0 4px 24px rgba(0,0,0,0.08);
43
- padding: 1.5em;
44
  backdrop-filter: blur(8px);
45
  border: 1px solid rgba(255,255,180,0.25);
46
  width: 100%;
@@ -106,11 +107,12 @@ with gr.Blocks(
106
  # Content
107
  with gr.Row():
108
  # Chat interface
109
- with gr.Column(scale=4, elem_classes=["chat-container"]):
110
  gr.Markdown("### 🌐 Hugging Face i18n Agent")
111
 
112
  chatbot = gr.Chatbot(
113
- value=[[None, get_welcome_message()]], scale=1, height=585
 
114
  )
115
 
116
  # Controller interface
@@ -144,7 +146,7 @@ with gr.Blocks(
144
  choices=[],
145
  label="📄 Select a file to translate",
146
  interactive=True,
147
- value=[],
148
  )
149
  file_to_translate_input = gr.Textbox(
150
  label="🌍 Select in the dropdown or write the file path to translate",
@@ -161,6 +163,21 @@ with gr.Blocks(
161
  label="🔑 Anthropic API key for translation generation",
162
  type="password",
163
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  start_translate_btn = gr.Button(
165
  "🚀 Start Translation", elem_classes="action-button"
166
  )
@@ -230,7 +247,7 @@ with gr.Blocks(
230
  # Button event handlers
231
  start_translate_btn.click(
232
  fn=start_translate_handler,
233
- inputs=[chatbot, anthropic_key, file_to_translate_input],
234
  outputs=[chatbot, msg_input, status_display, control_tabs],
235
  )
236
 
@@ -265,5 +282,13 @@ with gr.Blocks(
265
  outputs=[chatbot, msg_input, status_display],
266
  )
267
 
 
 
 
 
 
 
 
 
268
  root_path = os.environ.get("GRADIO_ROOT_PATH")
269
  demo.launch(root_path=root_path)
 
14
  send_message,
15
  start_translate_handler,
16
  sync_language_displays,
17
+ update_prompt_preview,
18
  update_status,
19
  update_github_config,
20
  )
 
31
  background: rgba(255, 255, 180, 0.25);
32
  border-radius: 18px;
33
  box-shadow: 0 4px 24px rgba(0,0,0,0.08);
34
+ padding: 1.0em;
35
  backdrop-filter: blur(8px);
36
  border: 1px solid rgba(255,255,180,0.25);
37
  width: 100%;
 
41
  background: rgba(255, 255, 180, 0.25);
42
  border-radius: 18px;
43
  box-shadow: 0 4px 24px rgba(0,0,0,0.08);
44
+ padding: 1.0em;
45
  backdrop-filter: blur(8px);
46
  border: 1px solid rgba(255,255,180,0.25);
47
  width: 100%;
 
107
  # Content
108
  with gr.Row():
109
  # Chat interface
110
+ with gr.Column(scale=3, elem_classes=["chat-container"]):
111
  gr.Markdown("### 🌐 Hugging Face i18n Agent")
112
 
113
  chatbot = gr.Chatbot(
114
+ value=[[None, get_welcome_message()]], scale=1, height=585,
115
+ show_copy_button=True
116
  )
117
 
118
  # Controller interface
 
146
  choices=[],
147
  label="📄 Select a file to translate",
148
  interactive=True,
149
+ value=None,
150
  )
151
  file_to_translate_input = gr.Textbox(
152
  label="🌍 Select in the dropdown or write the file path to translate",
 
163
  label="🔑 Anthropic API key for translation generation",
164
  type="password",
165
  )
166
+ additional_instruction = gr.Textbox(
167
+ label="📝 Additional instructions (Optional - e.g., custom glossary)",
168
+ placeholder="Example: Translate 'model' as '모델' consistently",
169
+ lines=2,
170
+ )
171
+
172
+ with gr.Accordion("🔍 Preview Prompt", open=False):
173
+ prompt_preview = gr.Textbox(
174
+ label="Current Translation Prompt",
175
+ lines=8,
176
+ interactive=False,
177
+ placeholder="Select a file and language to see the prompt preview...",
178
+ show_copy_button=True,
179
+ )
180
+
181
  start_translate_btn = gr.Button(
182
  "🚀 Start Translation", elem_classes="action-button"
183
  )
 
247
  # Button event handlers
248
  start_translate_btn.click(
249
  fn=start_translate_handler,
250
+ inputs=[chatbot, anthropic_key, file_to_translate_input, additional_instruction],
251
  outputs=[chatbot, msg_input, status_display, control_tabs],
252
  )
253
 
 
282
  outputs=[chatbot, msg_input, status_display],
283
  )
284
 
285
+ # Update prompt preview when inputs change
286
+ for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
287
+ input_component.change(
288
+ fn=update_prompt_preview,
289
+ inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
290
+ outputs=[prompt_preview],
291
+ )
292
+
293
  root_path = os.environ.get("GRADIO_ROOT_PATH")
294
  demo.launch(root_path=root_path)
pr_generator/agent.py CHANGED
@@ -518,7 +518,7 @@ Please return only the commit message. No other explanation is needed."""
518
  "status": "partial_success",
519
  "branch": branch_name,
520
  "file_path": target_filepath,
521
- "message": f"File was saved but PR creation failed: {pr_result}",
522
  "error_details": pr_result,
523
  }
524
  elif "successful" in pr_result and "http" in pr_result:
 
518
  "status": "partial_success",
519
  "branch": branch_name,
520
  "file_path": target_filepath,
521
+ "message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
522
  "error_details": pr_result,
523
  }
524
  elif "successful" in pr_result and "http" in pr_result:
translator/content.py CHANGED
@@ -9,6 +9,9 @@ from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
9
 
10
 
11
  def get_content(filepath: str) -> str:
 
 
 
12
  url = string.Template(
13
  "https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
14
  ).safe_substitute(filepath=filepath)
@@ -26,25 +29,31 @@ def preprocess_content(content: str) -> str:
26
  ## ignore top license comment
27
  to_translate = content[content.find("#") :]
28
  ## remove code blocks from text
29
- to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
30
  ## remove markdown tables from text
31
- to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
32
  ## remove empty lines from text
33
  to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
34
-
35
  return to_translate
36
 
37
 
38
- def get_full_prompt(language: str, to_translate: str) -> str:
39
- prompt = string.Template(
40
  "What do these sentences about Hugging Face Transformers "
41
  "(a machine learning library) mean in $language? "
42
  "Please do not translate the word after a 🤗 emoji "
43
  "as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
44
- "No explanations or extras—only the translated markdown"
45
- "\n\n```md"
46
  ).safe_substitute(language=language)
47
- return "\n".join([prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
 
 
 
 
 
 
 
 
48
 
49
 
50
  def split_markdown_sections(markdown: str) -> list:
@@ -72,36 +81,84 @@ def make_scaffold(content: str, to_translate: str) -> string.Template:
72
  return string.Template(scaffold)
73
 
74
 
 
 
 
 
 
 
 
75
  def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
76
  scaffold = make_scaffold(content, to_translate)
77
  print("scaffold:")
78
  print(scaffold.template)
 
 
 
 
 
79
  divided = split_markdown_sections(to_translate)
80
  print("divided:")
81
  print(divided)
82
  anchors = get_anchors(divided)
83
-
84
- translated = split_markdown_sections(translated)
85
- print("translated:")
86
- print(translated)
87
-
88
- translated[1::3] = [
89
- f"{korean_title} {anchors[i]}"
90
- for i, korean_title in enumerate(translated[1::3])
91
- ]
92
- translated = "".join(
93
- ["".join(translated[i * 3 : i * 3 + 3]) for i in range(len(translated) // 3)]
94
- ).split("\n\n")
95
- if newlines := scaffold.template.count("$hf_i18n_placeholder") - len(translated):
96
- return str(
97
- [
98
- f"Please {'recover' if newlines > 0 else 'remove'} "
99
- f"{abs(newlines)} incorrectly inserted double newlines."
100
- ]
101
- )
102
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  translated_doc = scaffold.safe_substitute(
104
- {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated)}
105
  )
106
  return translated_doc
107
 
 
9
 
10
 
11
  def get_content(filepath: str) -> str:
12
+ if filepath == "":
13
+ raise ValueError("No files selected for translation.")
14
+
15
  url = string.Template(
16
  "https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
17
  ).safe_substitute(filepath=filepath)
 
29
  ## ignore top license comment
30
  to_translate = content[content.find("#") :]
31
  ## remove code blocks from text
32
+ # to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
33
  ## remove markdown tables from text
34
+ # to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
35
  ## remove empty lines from text
36
  to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
 
37
  return to_translate
38
 
39
 
40
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
    """Assemble the full translation prompt sent to the model.

    The prompt is laid out as: the base instruction, an opening ```md fence,
    the stripped text to translate, a closing fence, the shared glossary
    prompt, and finally the optional additional instructions.

    Args:
        language: Human-readable target language name substituted into the
            base instruction (e.g. "Korean").
        to_translate: Markdown text to translate.
        additional_instruction: Optional user-supplied instructions. Blank
            or ``None`` values add nothing to the prompt.

    Returns:
        The complete prompt string.
    """
    base_prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        # The trailing ". " keeps this sentence from fusing with the next
        # literal (previously rendered as "intactNo explanations").
        "as it is a product name. Output the complete markdown file**, with prose translated and all other content intact. "
        "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
    ).safe_substitute(language=language)

    base_prompt += "\n\n```md"

    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])

    # Tolerate None so a cleared input does not crash on .strip().
    extra_instruction = (additional_instruction or "").strip()
    if extra_instruction:
        full_prompt += f"\n\n🗒️ Additional instructions: {extra_instruction}"

    return full_prompt
57
 
58
 
59
  def split_markdown_sections(markdown: str) -> list:
 
81
  return string.Template(scaffold)
82
 
83
 
84
def is_in_code_block(text: str, position: int) -> bool:
    """Return True if ``position`` in ``text`` falls inside a fenced code block.

    The check counts the triple-backtick markers that appear before
    ``position``; an odd count means a fence has been opened but not closed.

    Args:
        text: The full text to inspect.
        position: Character offset into ``text``. Negative offsets (such as
            the -1 that ``str.find`` returns for "not found") are treated as
            "not inside a code block".

    Returns:
        True when an odd number of fence markers precede ``position``.
    """
    # A negative offset would make text[:position] slice from the end of the
    # string and count unrelated fences, so reject it explicitly.
    if position < 0:
        return False
    fences_before = text.count("```", 0, position)
    return fences_before % 2 == 1
89
+
90
+
91
  def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
92
  scaffold = make_scaffold(content, to_translate)
93
  print("scaffold:")
94
  print(scaffold.template)
95
+
96
+ # Get original text sections to maintain structure
97
+ original_sections = to_translate.split("\n\n")
98
+
99
+ # Split markdown sections to get headers and anchors
100
  divided = split_markdown_sections(to_translate)
101
  print("divided:")
102
  print(divided)
103
  anchors = get_anchors(divided)
104
+
105
+ # Split translated content by markdown sections
106
+ translated_divided = split_markdown_sections(translated)
107
+ print("translated divided:")
108
+ print(translated_divided)
109
+
110
+ # Ensure we have the same number of headers as the original
111
+ if len(translated_divided[1::3]) != len(anchors):
112
+ print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
113
+ # Adjust anchors list to match translated headers
114
+ if len(translated_divided[1::3]) < len(anchors):
115
+ anchors = anchors[:len(translated_divided[1::3])]
116
+ else:
117
+ # Add empty anchors for extra headers
118
+ anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))
119
+
120
+ # Add anchors to translated headers only if they're not in code blocks
121
+ for i, korean_title in enumerate(translated_divided[1::3]):
122
+ if i < len(anchors):
123
+ # Find the position of this header in the original translated text
124
+ header_pos = translated.find(korean_title.strip())
125
+ if header_pos != -1 and not is_in_code_block(translated, header_pos):
126
+ translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
127
+ else:
128
+ translated_divided[1 + i * 3] = korean_title
129
+
130
+ # Reconstruct translated content with proper structure
131
+ reconstructed_translated = "".join([
132
+ "".join(translated_divided[i * 3 : i * 3 + 3])
133
+ for i in range(len(translated_divided) // 3)
134
+ ])
135
+
136
+ # Split by double newlines to match original structure
137
+ translated_sections = reconstructed_translated.split("\n\n")
138
+
139
+ print("scaffold template count:")
140
+ print(scaffold.template.count("$hf_i18n_placeholder"))
141
+ print("original sections length:")
142
+ print(len(original_sections))
143
+ print("translated sections length:")
144
+ print(len(translated_sections))
145
+
146
+ # Ensure section counts match
147
+ placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
148
+
149
+ if len(translated_sections) < placeholder_count:
150
+ # Add empty sections if translated has fewer sections
151
+ translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
152
+ elif len(translated_sections) > placeholder_count:
153
+ # Truncate if translated has more sections
154
+ translated_sections = translated_sections[:placeholder_count]
155
+
156
+ # Final check
157
+ if len(translated_sections) != placeholder_count:
158
+ return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"
159
+
160
  translated_doc = scaffold.safe_substitute(
161
+ {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
162
  )
163
  return translated_doc
164
 
translator/retriever.py CHANGED
@@ -37,18 +37,33 @@ def get_github_issue_open_pr(lang: str = "ko"):
37
  "No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
38
  )
39
 
40
- url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open"
41
-
42
  headers = {
43
  "Accept": "application/vnd.github+json",
44
  }
45
- response = requests.get(url, headers=headers)
46
-
47
- if response.status_code != 200:
48
- raise Exception(f"GitHub API error: {response.status_code} {response.text}")
49
-
50
- open_prs = response.json()
51
- filtered_prs = [pr for pr in open_prs if pr["title"].startswith("🌐 [i18n-KO]")]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  pattern = re.compile(r"`([^`]+\.md)`")
54
 
 
37
  "No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
38
  )
39
 
 
 
40
  headers = {
41
  "Accept": "application/vnd.github+json",
42
  }
43
+
44
+ all_open_prs = []
45
+ page = 1
46
+ per_page = 100 # Maximum allowed by GitHub API
47
+
48
+ while True:
49
+ url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open&page={page}&per_page={per_page}"
50
+ response = requests.get(url, headers=headers)
51
+
52
+ if response.status_code != 200:
53
+ raise Exception(f"GitHub API error: {response.status_code} {response.text}")
54
+
55
+ page_prs = response.json()
56
+ if not page_prs: # No more PRs
57
+ break
58
+
59
+ all_open_prs.extend(page_prs)
60
+ page += 1
61
+
62
+ # Break if we got less than per_page results (last page)
63
+ if len(page_prs) < per_page:
64
+ break
65
+
66
+ filtered_prs = [pr for pr in all_open_prs if pr["title"].startswith("🌐 [i18n-KO]")]
67
 
68
  pattern = re.compile(r"`([^`]+\.md)`")
69