wony617
committed on
Commit
·
ef58ab4
1
Parent(s):
159b6fa
Add open PR checking & Add prompt injection in translation step
Browse files- agent/handler.py +44 -7
- agent/workflow.py +7 -17
- app.py +31 -6
- pr_generator/agent.py +1 -1
- translator/content.py +86 -29
- translator/retriever.py +24 -9
agent/handler.py
CHANGED
@@ -13,6 +13,7 @@ from agent.workflow import (
|
|
13 |
generate_github_pr,
|
14 |
)
|
15 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
|
|
16 |
|
17 |
|
18 |
# State management
|
@@ -22,6 +23,7 @@ class ChatState:
|
|
22 |
self.target_language = "ko"
|
23 |
self.k_files = 10
|
24 |
self.files_to_translate = []
|
|
|
25 |
self.current_file_content = {"translated": ""}
|
26 |
self.pr_result = None # Store PR creation result
|
27 |
# GitHub configuration
|
@@ -127,8 +129,8 @@ def start_translation_process():
|
|
127 |
|
128 |
# Call translation function (simplified for demo)
|
129 |
try:
|
130 |
-
|
131 |
-
state.target_language, [[current_file]]
|
132 |
)
|
133 |
|
134 |
state.current_file_content = {"translated": translated}
|
@@ -152,10 +154,15 @@ def start_translation_process():
|
|
152 |
""
|
153 |
f"{original_file_link}\n"
|
154 |
"**🌐 Translated Content:**\n"
|
155 |
-
f"\n```\n\n{_extract_content_for_display(translated)}\n```"
|
|
|
|
|
|
|
156 |
# f"{status}\n"
|
157 |
# "✅ Translation completed. The code block will be added when generating PR."
|
158 |
)
|
|
|
|
|
159 |
|
160 |
except Exception as e:
|
161 |
response = f"❌ Translation failed: {str(e)}"
|
@@ -211,12 +218,14 @@ def handle_user_message(message, history):
|
|
211 |
# User wants to start translation
|
212 |
if state.files_to_translate:
|
213 |
state.step = "translate"
|
214 |
-
response = start_translation_process()
|
|
|
|
|
|
|
215 |
else:
|
216 |
response = (
|
217 |
"❌ No files available for translation. Please search for files first."
|
218 |
)
|
219 |
-
|
220 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
221 |
else:
|
222 |
# General response
|
@@ -308,15 +317,43 @@ def update_github_config(token, owner, repo, reference_pr_url):
|
|
308 |
return f"✅ GitHub configuration updated: {owner}/{repo}"
|
309 |
|
310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
def send_message(message, history):
|
312 |
new_history, cleared_input = handle_user_message(message, history)
|
313 |
return new_history, cleared_input, update_status()
|
314 |
|
315 |
|
316 |
# Button handlers with tab switching
|
317 |
-
def start_translate_handler(history, anthropic_key, file_to_translate):
|
318 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
319 |
-
|
|
|
320 |
state.files_to_translate = [file_to_translate]
|
321 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
322 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
|
|
13 |
generate_github_pr,
|
14 |
)
|
15 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
16 |
+
from translator.content import get_full_prompt, get_content, preprocess_content
|
17 |
|
18 |
|
19 |
# State management
|
|
|
23 |
self.target_language = "ko"
|
24 |
self.k_files = 10
|
25 |
self.files_to_translate = []
|
26 |
+
self.additional_instruction = ""
|
27 |
self.current_file_content = {"translated": ""}
|
28 |
self.pr_result = None # Store PR creation result
|
29 |
# GitHub configuration
|
|
|
129 |
|
130 |
# Call translation function (simplified for demo)
|
131 |
try:
|
132 |
+
translated = translate_docs_interactive(
|
133 |
+
state.target_language, [[current_file]], state.additional_instruction
|
134 |
)
|
135 |
|
136 |
state.current_file_content = {"translated": translated}
|
|
|
154 |
""
|
155 |
f"{original_file_link}\n"
|
156 |
"**🌐 Translated Content:**\n"
|
157 |
+
# f"\n```\n\n{_extract_content_for_display(translated)}\n```"
|
158 |
+
# "\n```\n\n"
|
159 |
+
# f"\n{translated}\n"
|
160 |
+
# f"```"
|
161 |
# f"{status}\n"
|
162 |
# "✅ Translation completed. The code block will be added when generating PR."
|
163 |
)
|
164 |
+
return response, translated
|
165 |
+
|
166 |
|
167 |
except Exception as e:
|
168 |
response = f"❌ Translation failed: {str(e)}"
|
|
|
218 |
# User wants to start translation
|
219 |
if state.files_to_translate:
|
220 |
state.step = "translate"
|
221 |
+
response, translated = start_translation_process()
|
222 |
+
history.append([message, response])
|
223 |
+
history.append(["", translated])
|
224 |
+
return history, ""
|
225 |
else:
|
226 |
response = (
|
227 |
"❌ No files available for translation. Please search for files first."
|
228 |
)
|
|
|
229 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
230 |
else:
|
231 |
# General response
|
|
|
317 |
return f"✅ GitHub configuration updated: {owner}/{repo}"
|
318 |
|
319 |
|
320 |
+
def update_prompt_preview(language, file_path, additional_instruction):
    """Build the translation prompt shown in the prompt preview box.

    Args:
        language: Target language code. ``"ko"`` is displayed as ``"Korean"``;
            any other value is passed through unchanged.
        file_path: Path of the document to preview. ``None``, an empty string
            or a whitespace-only string yields the placeholder message.
        additional_instruction: Optional extra instructions forwarded to
            ``get_full_prompt``. ``None`` is treated as an empty string.

    Returns:
        The generated prompt, the placeholder message when no file is
        selected, or an error message string when generation fails.
    """
    # Guard against None as well as blank input: this check runs outside the
    # try block, so an AttributeError here would escape to the UI callback.
    if not file_path or not file_path.strip():
        return "Select a file to see the prompt preview..."

    translation_lang = "Korean" if language == "ko" else language

    try:
        content = get_content(file_path)
        to_translate = preprocess_content(content)

        # Only the first 500 characters are shown; mark the cut with an ellipsis.
        sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")

        return get_full_prompt(
            translation_lang, sample_content, additional_instruction or ""
        )
    except Exception as e:
        # UI boundary: report the failure in the preview box instead of raising.
        return f"Error generating prompt preview: {str(e)}"
|
345 |
+
|
346 |
+
|
347 |
def send_message(message, history):
    """Process a chat message and return the UI outputs for it.

    Delegates to ``handle_user_message`` and appends the value of
    ``update_status()`` to its two results.

    Returns:
        A 3-tuple: the history and input value returned by
        ``handle_user_message``, followed by the result of ``update_status()``.
    """
    handled = handle_user_message(message, history)
    updated_history, input_value = handled
    return updated_history, input_value, update_status()
|
350 |
|
351 |
|
352 |
# Button handlers with tab switching
|
353 |
+
def start_translate_handler(history, anthropic_key, file_to_translate, additional_instruction=""):
|
354 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
355 |
+
|
356 |
+
state.additional_instruction = additional_instruction
|
357 |
state.files_to_translate = [file_to_translate]
|
358 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
359 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
agent/workflow.py
CHANGED
@@ -51,7 +51,7 @@ def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[s
|
|
51 |
return status_report, docs
|
52 |
|
53 |
|
54 |
-
def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
|
55 |
"""Translate documentation."""
|
56 |
# step 1. Get content from file path
|
57 |
content = get_content(file_path)
|
@@ -60,7 +60,7 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
|
|
60 |
# step 2. Prepare prompt with docs content
|
61 |
if lang == "ko":
|
62 |
translation_lang = "Korean"
|
63 |
-
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)
|
64 |
|
65 |
print("to_translate_with_prompt:\n", to_translate_with_prompt)
|
66 |
|
@@ -77,8 +77,8 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
|
|
77 |
|
78 |
|
79 |
def translate_docs_interactive(
|
80 |
-
translate_lang: str, selected_files: list[list[str]]
|
81 |
-
) -> tuple[str, str
|
82 |
"""Interactive translation function that processes files one by one.
|
83 |
|
84 |
Args:
|
@@ -87,27 +87,17 @@ def translate_docs_interactive(
|
|
87 |
"""
|
88 |
# Extract file paths from the dataframe format
|
89 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
90 |
-
if not file_paths:
|
91 |
-
return (
|
92 |
-
"No files selected for translation.",
|
93 |
-
gr.update(visible=False),
|
94 |
-
gr.update(visible=False),
|
95 |
-
gr.update(visible=False),
|
96 |
-
[],
|
97 |
-
0,
|
98 |
-
)
|
99 |
|
100 |
# Start with the first file
|
101 |
current_file = file_paths[0]
|
102 |
|
103 |
status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
|
104 |
-
callback_result, translated_content = translate_docs(translate_lang, current_file)
|
105 |
status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
|
106 |
|
107 |
-
|
108 |
-
status += f"\n### 📝 Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version"
|
109 |
|
110 |
-
return
|
111 |
|
112 |
|
113 |
def generate_github_pr(
|
|
|
51 |
return status_report, docs
|
52 |
|
53 |
|
54 |
+
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
55 |
"""Translate documentation."""
|
56 |
# step 1. Get content from file path
|
57 |
content = get_content(file_path)
|
|
|
60 |
# step 2. Prepare prompt with docs content
|
61 |
if lang == "ko":
|
62 |
translation_lang = "Korean"
|
63 |
+
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
|
64 |
|
65 |
print("to_translate_with_prompt:\n", to_translate_with_prompt)
|
66 |
|
|
|
77 |
|
78 |
|
79 |
def translate_docs_interactive(
|
80 |
+
translate_lang: str, selected_files: list[list[str]], additional_instruction: str = ""
|
81 |
+
) -> tuple[str, str]:
|
82 |
"""Interactive translation function that processes files one by one.
|
83 |
|
84 |
Args:
|
|
|
87 |
"""
|
88 |
# Extract file paths from the dataframe format
|
89 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
# Start with the first file
|
92 |
current_file = file_paths[0]
|
93 |
|
94 |
status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
|
95 |
+
callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction)
|
96 |
status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
|
97 |
|
98 |
+
print(status)
|
|
|
99 |
|
100 |
+
return translated_content
|
101 |
|
102 |
|
103 |
def generate_github_pr(
|
app.py
CHANGED
@@ -14,6 +14,7 @@ from agent.handler import (
|
|
14 |
send_message,
|
15 |
start_translate_handler,
|
16 |
sync_language_displays,
|
|
|
17 |
update_status,
|
18 |
update_github_config,
|
19 |
)
|
@@ -30,7 +31,7 @@ css = """
|
|
30 |
background: rgba(255, 255, 180, 0.25);
|
31 |
border-radius: 18px;
|
32 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
33 |
-
padding: 1.
|
34 |
backdrop-filter: blur(8px);
|
35 |
border: 1px solid rgba(255,255,180,0.25);
|
36 |
width: 100%;
|
@@ -40,7 +41,7 @@ css = """
|
|
40 |
background: rgba(255, 255, 180, 0.25);
|
41 |
border-radius: 18px;
|
42 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
43 |
-
padding: 1.
|
44 |
backdrop-filter: blur(8px);
|
45 |
border: 1px solid rgba(255,255,180,0.25);
|
46 |
width: 100%;
|
@@ -106,11 +107,12 @@ with gr.Blocks(
|
|
106 |
# Content
|
107 |
with gr.Row():
|
108 |
# Chat interface
|
109 |
-
with gr.Column(scale=
|
110 |
gr.Markdown("### 🌐 Hugging Face i18n Agent")
|
111 |
|
112 |
chatbot = gr.Chatbot(
|
113 |
-
value=[[None, get_welcome_message()]], scale=1, height=585
|
|
|
114 |
)
|
115 |
|
116 |
# Controller interface
|
@@ -144,7 +146,7 @@ with gr.Blocks(
|
|
144 |
choices=[],
|
145 |
label="📄 Select a file to translate",
|
146 |
interactive=True,
|
147 |
-
value=
|
148 |
)
|
149 |
file_to_translate_input = gr.Textbox(
|
150 |
label="🌍 Select in the dropdown or write the file path to translate",
|
@@ -161,6 +163,21 @@ with gr.Blocks(
|
|
161 |
label="🔑 Anthropic API key for translation generation",
|
162 |
type="password",
|
163 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
start_translate_btn = gr.Button(
|
165 |
"🚀 Start Translation", elem_classes="action-button"
|
166 |
)
|
@@ -230,7 +247,7 @@ with gr.Blocks(
|
|
230 |
# Button event handlers
|
231 |
start_translate_btn.click(
|
232 |
fn=start_translate_handler,
|
233 |
-
inputs=[chatbot, anthropic_key, file_to_translate_input],
|
234 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
235 |
)
|
236 |
|
@@ -265,5 +282,13 @@ with gr.Blocks(
|
|
265 |
outputs=[chatbot, msg_input, status_display],
|
266 |
)
|
267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
269 |
demo.launch(root_path=root_path)
|
|
|
14 |
send_message,
|
15 |
start_translate_handler,
|
16 |
sync_language_displays,
|
17 |
+
update_prompt_preview,
|
18 |
update_status,
|
19 |
update_github_config,
|
20 |
)
|
|
|
31 |
background: rgba(255, 255, 180, 0.25);
|
32 |
border-radius: 18px;
|
33 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
34 |
+
padding: 1.0em;
|
35 |
backdrop-filter: blur(8px);
|
36 |
border: 1px solid rgba(255,255,180,0.25);
|
37 |
width: 100%;
|
|
|
41 |
background: rgba(255, 255, 180, 0.25);
|
42 |
border-radius: 18px;
|
43 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
44 |
+
padding: 1.0em;
|
45 |
backdrop-filter: blur(8px);
|
46 |
border: 1px solid rgba(255,255,180,0.25);
|
47 |
width: 100%;
|
|
|
107 |
# Content
|
108 |
with gr.Row():
|
109 |
# Chat interface
|
110 |
+
with gr.Column(scale=3, elem_classes=["chat-container"]):
|
111 |
gr.Markdown("### 🌐 Hugging Face i18n Agent")
|
112 |
|
113 |
chatbot = gr.Chatbot(
|
114 |
+
value=[[None, get_welcome_message()]], scale=1, height=585,
|
115 |
+
show_copy_button=True
|
116 |
)
|
117 |
|
118 |
# Controller interface
|
|
|
146 |
choices=[],
|
147 |
label="📄 Select a file to translate",
|
148 |
interactive=True,
|
149 |
+
value=None,
|
150 |
)
|
151 |
file_to_translate_input = gr.Textbox(
|
152 |
label="🌍 Select in the dropdown or write the file path to translate",
|
|
|
163 |
label="🔑 Anthropic API key for translation generation",
|
164 |
type="password",
|
165 |
)
|
166 |
+
additional_instruction = gr.Textbox(
|
167 |
+
label="📝 Additional instructions (Optional - e.g., custom glossary)",
|
168 |
+
placeholder="Example: Translate 'model' as '모델' consistently",
|
169 |
+
lines=2,
|
170 |
+
)
|
171 |
+
|
172 |
+
with gr.Accordion("🔍 Preview Prompt", open=False):
|
173 |
+
prompt_preview = gr.Textbox(
|
174 |
+
label="Current Translation Prompt",
|
175 |
+
lines=8,
|
176 |
+
interactive=False,
|
177 |
+
placeholder="Select a file and language to see the prompt preview...",
|
178 |
+
show_copy_button=True,
|
179 |
+
)
|
180 |
+
|
181 |
start_translate_btn = gr.Button(
|
182 |
"🚀 Start Translation", elem_classes="action-button"
|
183 |
)
|
|
|
247 |
# Button event handlers
|
248 |
start_translate_btn.click(
|
249 |
fn=start_translate_handler,
|
250 |
+
inputs=[chatbot, anthropic_key, file_to_translate_input, additional_instruction],
|
251 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
252 |
)
|
253 |
|
|
|
282 |
outputs=[chatbot, msg_input, status_display],
|
283 |
)
|
284 |
|
285 |
+
# Update prompt preview when inputs change
|
286 |
+
for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
|
287 |
+
input_component.change(
|
288 |
+
fn=update_prompt_preview,
|
289 |
+
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
290 |
+
outputs=[prompt_preview],
|
291 |
+
)
|
292 |
+
|
293 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
294 |
demo.launch(root_path=root_path)
|
pr_generator/agent.py
CHANGED
@@ -518,7 +518,7 @@ Please return only the commit message. No other explanation is needed."""
|
|
518 |
"status": "partial_success",
|
519 |
"branch": branch_name,
|
520 |
"file_path": target_filepath,
|
521 |
-
"message": f"File was saved
|
522 |
"error_details": pr_result,
|
523 |
}
|
524 |
elif "successful" in pr_result and "http" in pr_result:
|
|
|
518 |
"status": "partial_success",
|
519 |
"branch": branch_name,
|
520 |
"file_path": target_filepath,
|
521 |
+
"message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
|
522 |
"error_details": pr_result,
|
523 |
}
|
524 |
elif "successful" in pr_result and "http" in pr_result:
|
translator/content.py
CHANGED
@@ -9,6 +9,9 @@ from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
|
|
9 |
|
10 |
|
11 |
def get_content(filepath: str) -> str:
|
|
|
|
|
|
|
12 |
url = string.Template(
|
13 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
14 |
).safe_substitute(filepath=filepath)
|
@@ -26,25 +29,31 @@ def preprocess_content(content: str) -> str:
|
|
26 |
## ignore top license comment
|
27 |
to_translate = content[content.find("#") :]
|
28 |
## remove code blocks from text
|
29 |
-
to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
30 |
## remove markdown tables from text
|
31 |
-
to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
32 |
## remove empty lines from text
|
33 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
34 |
-
|
35 |
return to_translate
|
36 |
|
37 |
|
38 |
-
def get_full_prompt(language: str, to_translate: str) -> str:
|
39 |
-
|
40 |
"What do these sentences about Hugging Face Transformers "
|
41 |
"(a machine learning library) mean in $language? "
|
42 |
"Please do not translate the word after a 🤗 emoji "
|
43 |
"as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
|
44 |
-
"No explanations or extras—only the translated markdown"
|
45 |
-
"\n\n```md"
|
46 |
).safe_substitute(language=language)
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
|
50 |
def split_markdown_sections(markdown: str) -> list:
|
@@ -72,36 +81,84 @@ def make_scaffold(content: str, to_translate: str) -> string.Template:
|
|
72 |
return string.Template(scaffold)
|
73 |
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
|
76 |
scaffold = make_scaffold(content, to_translate)
|
77 |
print("scaffold:")
|
78 |
print(scaffold.template)
|
|
|
|
|
|
|
|
|
|
|
79 |
divided = split_markdown_sections(to_translate)
|
80 |
print("divided:")
|
81 |
print(divided)
|
82 |
anchors = get_anchors(divided)
|
83 |
-
|
84 |
-
translated
|
85 |
-
|
86 |
-
print(translated)
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
[
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
)
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
translated_doc = scaffold.safe_substitute(
|
104 |
-
{f"hf_i18n_placeholder{i}": text for i, text in enumerate(
|
105 |
)
|
106 |
return translated_doc
|
107 |
|
|
|
9 |
|
10 |
|
11 |
def get_content(filepath: str) -> str:
|
12 |
+
if filepath == "":
|
13 |
+
raise ValueError("No files selected for translation.")
|
14 |
+
|
15 |
url = string.Template(
|
16 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
17 |
).safe_substitute(filepath=filepath)
|
|
|
29 |
## ignore top license comment
|
30 |
to_translate = content[content.find("#") :]
|
31 |
## remove code blocks from text
|
32 |
+
# to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
33 |
## remove markdown tables from text
|
34 |
+
# to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
35 |
## remove empty lines from text
|
36 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
|
|
37 |
return to_translate
|
38 |
|
39 |
|
40 |
+
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
    """Assemble the complete translation prompt.

    The prompt consists of, in order: the fixed instructions, the document
    wrapped in an ``md`` code fence, ``PROMPT_WITH_GLOSSARY``, and, when given,
    the additional instructions.

    Args:
        language: Name of the target language substituted into the instructions.
        to_translate: Markdown text to translate; surrounding whitespace is
            stripped before it is placed inside the fence.
        additional_instruction: Optional extra instructions. Blank or ``None``
            values add nothing to the prompt.

    Returns:
        The full prompt string.
    """
    # Adjacent literals are concatenated verbatim, so each sentence must carry
    # its own trailing ". " — previously "intact" ran straight into "No".
    base_prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output the complete markdown file, with prose translated and all other content intact. "
        "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
    ).safe_substitute(language=language)

    # The template is applied to the instructions only, so "$" characters in
    # the document itself are never treated as placeholders.
    base_prompt += "\n\n```md"

    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])

    extra = (additional_instruction or "").strip()
    if extra:
        full_prompt += f"\n\n🗒️ Additional instructions: {extra}"

    return full_prompt
|
57 |
|
58 |
|
59 |
def split_markdown_sections(markdown: str) -> list:
|
|
|
81 |
return string.Template(scaffold)
|
82 |
|
83 |
|
84 |
+
def is_in_code_block(text: str, position: int) -> bool:
    """Return True when ``position`` falls inside a fenced Markdown code block.

    The text before ``position`` is scanned line by line and the currently
    open fence is tracked. Compared with counting every triple-backtick
    substring, this ignores backticks that do not start a line, recognises
    ``~~~`` fences, and lets a longer fence wrap shorter ones.

    Args:
        text: The Markdown document.
        position: Character offset into ``text``. Values ``<= 0`` (including
            the ``-1`` returned by ``str.find`` on a miss) are never inside a
            block.

    Returns:
        True if a fence opened before ``position`` has not been closed yet.
    """
    if position <= 0:
        return False

    open_fence = None  # (marker character, run length) of the fence currently open
    for line in text[:position].splitlines():
        # Fences may be indented (list items) or quoted (blockquotes).
        candidate = line.lstrip(" \t>")
        marker = candidate[:1]
        if marker not in ("`", "~"):
            continue

        run = len(candidate) - len(candidate.lstrip(marker))
        if run < 3:
            continue

        info = candidate[run:].strip()
        if open_fence is None:
            # A backtick fence cannot have backticks in its info string; such
            # a line is inline code, not an opening fence.
            if marker == "`" and "`" in info:
                continue
            open_fence = (marker, run)
        elif marker == open_fence[0] and run >= open_fence[1] and not info:
            # Closing fence: same marker, at least as long, nothing after it.
            open_fence = None

    return open_fence is not None
|
89 |
+
|
90 |
+
|
91 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    """Insert the translated text into the scaffold built from the original document.

    Steps:
      1. Build the scaffold template with ``make_scaffold``.
      2. Compute anchors from the original headers and append them to the
         translated headers that are not inside a code block.
      3. Split the re-joined translation on blank lines and substitute each
         piece for the matching ``hf_i18n_placeholder<i>`` in the scaffold.

    Args:
        content: The original document.
        to_translate: The preprocessed text that was sent for translation.
        translated: The translated text.

    Returns:
        The scaffold with placeholders replaced by translated sections. When
        the section counts differ, the translated sections are padded with
        empty strings or truncated to the number of placeholders.
    """
    scaffold = make_scaffold(content, to_translate)
    print("scaffold:")
    print(scaffold.template)

    # Kept only for the diagnostic output below.
    original_sections = to_translate.split("\n\n")

    divided = split_markdown_sections(to_translate)
    print("divided:")
    print(divided)
    anchors = get_anchors(divided)

    translated_divided = split_markdown_sections(translated)
    print("translated divided:")
    print(translated_divided)

    # Header titles sit at every third element, starting at index 1.
    titles = translated_divided[1::3]
    if len(titles) != len(anchors):
        print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(titles)}")
        # Drop surplus anchors, or pad with empty ones for extra headers.
        anchors = anchors[: len(titles)] + [""] * (len(titles) - len(anchors))

    # Locate each header after the previous one so a repeated title does not
    # resolve to an earlier occurrence in the text.
    search_from = 0
    for i, (title, anchor) in enumerate(zip(titles, anchors)):
        needle = title.strip()
        header_pos = translated.find(needle, search_from)
        if header_pos == -1:
            # Fall back to a whole-text search rather than giving up.
            header_pos = translated.find(needle)
        else:
            search_from = header_pos + len(needle)

        # An empty (padded) anchor is skipped so no trailing space is added.
        if anchor and header_pos != -1 and not is_in_code_block(translated, header_pos):
            translated_divided[1 + i * 3] = f"{title} {anchor}"

    # Re-join complete triples only; a trailing partial group is dropped.
    reconstructed_translated = "".join(
        "".join(translated_divided[i * 3 : i * 3 + 3])
        for i in range(len(translated_divided) // 3)
    )
    translated_sections = reconstructed_translated.split("\n\n")

    placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
    print("scaffold template count:")
    print(placeholder_count)
    print("original sections length:")
    print(len(original_sections))
    print("translated sections length:")
    print(len(translated_sections))

    # Pad or truncate so there is exactly one section per placeholder. The
    # lengths are equal afterwards by construction, so no further check is needed.
    shortfall = placeholder_count - len(translated_sections)
    if shortfall > 0:
        translated_sections.extend([""] * shortfall)
    elif shortfall < 0:
        translated_sections = translated_sections[:placeholder_count]

    return scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
    )
|
164 |
|
translator/retriever.py
CHANGED
@@ -37,18 +37,33 @@ def get_github_issue_open_pr(lang: str = "ko"):
|
|
37 |
"No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
|
38 |
)
|
39 |
|
40 |
-
url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open"
|
41 |
-
|
42 |
headers = {
|
43 |
"Accept": "application/vnd.github+json",
|
44 |
}
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
pattern = re.compile(r"`([^`]+\.md)`")
|
54 |
|
|
|
37 |
"No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
|
38 |
)
|
39 |
|
|
|
|
|
40 |
headers = {
|
41 |
"Accept": "application/vnd.github+json",
|
42 |
}
|
43 |
+
|
44 |
+
all_open_prs = []
|
45 |
+
page = 1
|
46 |
+
per_page = 100 # Maximum allowed by GitHub API
|
47 |
+
|
48 |
+
while True:
|
49 |
+
url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open&page={page}&per_page={per_page}"
|
50 |
+
response = requests.get(url, headers=headers)
|
51 |
+
|
52 |
+
if response.status_code != 200:
|
53 |
+
raise Exception(f"GitHub API error: {response.status_code} {response.text}")
|
54 |
+
|
55 |
+
page_prs = response.json()
|
56 |
+
if not page_prs: # No more PRs
|
57 |
+
break
|
58 |
+
|
59 |
+
all_open_prs.extend(page_prs)
|
60 |
+
page += 1
|
61 |
+
|
62 |
+
# Break if we got less than per_page results (last page)
|
63 |
+
if len(page_prs) < per_page:
|
64 |
+
break
|
65 |
+
|
66 |
+
filtered_prs = [pr for pr in all_open_prs if pr["title"].startswith("🌐 [i18n-KO]")]
|
67 |
|
68 |
pattern = re.compile(r"`([^`]+\.md)`")
|
69 |
|