wony617 committed on
Commit
7e4dd83
·
1 Parent(s): fe6c90f

Initial support for smolagents translation

Browse files
agent/handler.py CHANGED
@@ -13,12 +13,14 @@ from agent.workflow import (
13
  )
14
  from pr_generator.searcher import find_reference_pr_simple_stream
15
  from translator.content import get_full_prompt, get_content, preprocess_content
 
16
 
17
 
18
  # State management
19
  class ChatState:
20
  def __init__(self):
21
- self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
 
22
  self.target_language = "ko"
23
  self.k_files = 10
24
  self.files_to_translate = []
@@ -53,25 +55,26 @@ def _extract_content_for_display(content: str) -> str:
53
 
54
 
55
  def get_welcome_message():
56
- """Initial welcome message with file finding controls"""
57
  return """**πŸ‘‹ Welcome to 🌐 Hugging Face i18n Translation Agent!**
58
 
59
  I'll help you find files that need translation and translate them in a streamlined workflow.
60
 
61
- **πŸ”Ž Let's start by finding files that need translation.**
62
 
63
- Use the **`Quick Controls`** on the right or **ask me `what`, `how`, or `help`** to get started.
64
  """
65
 
66
 
67
- def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
68
  """Process file search request and update Gradio UI components."""
69
  global state
 
70
  state.target_language = lang
71
  state.k_files = k
72
  state.step = "find_files"
73
 
74
- status_report, files_list = report_translation_target_files(lang, k)
75
  state.files_to_translate = (
76
  [file[0] for file in files_list]
77
  if files_list
@@ -87,8 +90,10 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
87
  """
88
 
89
  if state.files_to_translate:
 
90
  for i, file in enumerate(state.files_to_translate, 1):
91
- response += f"\n{i}. `{file}`"
 
92
 
93
  # if len(state.files_to_translate) > 5:
94
  # response += f"\n... and {len(state.files_to_translate) - 5} more files"
@@ -138,9 +143,8 @@ def start_translation_process():
138
  p.parent.mkdir(parents=True, exist_ok=True)
139
  p.write_text(translated, encoding="utf-8")
140
 
141
- original_file_link = (
142
- "https://github.com/huggingface/transformers/blob/main/" + current_file
143
- )
144
  print("Compeleted translation:\n")
145
  print(translated)
146
  print("----------------------------")
@@ -226,12 +230,12 @@ def handle_user_message(message, history):
226
 
227
  def update_status():
228
  if state.step == "welcome":
229
- return """
230
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
231
  <div><strong>πŸ”„ Step:</strong> Welcome</div>
 
232
  <div><strong>πŸ“ Files:</strong> 0</div>
233
- <div><strong>🌍 Language:</strong> ko</div>
234
- <div><strong>⏳ Progress:</strong> Ready</div>
235
  </div>
236
  """
237
 
@@ -267,6 +271,7 @@ def update_status():
267
  status_html = f"""
268
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
269
  <div><strong>πŸ”„ Step:</strong> {step_map.get(state.step, state.step)}</div>
 
270
  <div><strong>πŸ“ Files:</strong> {len(state.files_to_translate)}</div>
271
  <div><strong>🌍 Language:</strong> {state.target_language}</div>
272
  <div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
@@ -292,14 +297,18 @@ def update_github_config(token, owner, repo, reference_pr_url):
292
  if token:
293
  os.environ["GITHUB_TOKEN"] = token
294
 
 
 
 
 
 
295
  # Save GitHub configuration to state
296
  state.github_config.update(
297
  {
298
  "token": token,
299
  "owner": owner,
300
  "repo_name": repo,
301
- "reference_pr_url": reference_pr_url
302
- or state.github_config["reference_pr_url"],
303
  }
304
  )
305
 
 
13
  )
14
  from pr_generator.searcher import find_reference_pr_simple_stream
15
  from translator.content import get_full_prompt, get_content, preprocess_content
16
+ from translator.project_config import get_available_projects, get_project_config
17
 
18
 
19
  # State management
20
  class ChatState:
21
  def __init__(self):
22
+ self.step = "welcome" # welcome -> select_project -> find_files -> translate -> create_github_pr
23
+ self.selected_project = "transformers" # Default project
24
  self.target_language = "ko"
25
  self.k_files = 10
26
  self.files_to_translate = []
 
55
 
56
 
57
  def get_welcome_message():
58
+ """Initial welcome message with project selection"""
59
  return """**πŸ‘‹ Welcome to 🌐 Hugging Face i18n Translation Agent!**
60
 
61
  I'll help you find files that need translation and translate them in a streamlined workflow.
62
 
63
+ **🎯 First, select which project you want to translate:**
64
 
65
+ Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.
66
  """
67
 
68
 
69
+ def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
70
  """Process file search request and update Gradio UI components."""
71
  global state
72
+ state.selected_project = project
73
  state.target_language = lang
74
  state.k_files = k
75
  state.step = "find_files"
76
 
77
+ status_report, files_list = report_translation_target_files(project, lang, k)
78
  state.files_to_translate = (
79
  [file[0] for file in files_list]
80
  if files_list
 
90
  """
91
 
92
  if state.files_to_translate:
93
+ config = get_project_config(state.selected_project)
94
  for i, file in enumerate(state.files_to_translate, 1):
95
+ file_link = f"{config.repo_url}/blob/main/{file}"
96
+ response += f"\n{i}. [`{file}`]({file_link})"
97
 
98
  # if len(state.files_to_translate) > 5:
99
  # response += f"\n... and {len(state.files_to_translate) - 5} more files"
 
143
  p.parent.mkdir(parents=True, exist_ok=True)
144
  p.write_text(translated, encoding="utf-8")
145
 
146
+ config = get_project_config(state.selected_project)
147
+ original_file_link = f"{config.repo_url}/blob/main/{current_file}"
 
148
  print("Compeleted translation:\n")
149
  print(translated)
150
  print("----------------------------")
 
230
 
231
  def update_status():
232
  if state.step == "welcome":
233
+ return f"""
234
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
235
  <div><strong>πŸ”„ Step:</strong> Welcome</div>
236
+ <div><strong>🎯 Project:</strong> {state.selected_project}</div>
237
  <div><strong>πŸ“ Files:</strong> 0</div>
238
+ <div><strong>🌍 Language:</strong> {state.target_language}</div>
 
239
  </div>
240
  """
241
 
 
271
  status_html = f"""
272
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
273
  <div><strong>πŸ”„ Step:</strong> {step_map.get(state.step, state.step)}</div>
274
+ <div><strong>🎯 Project:</strong> {state.selected_project}</div>
275
  <div><strong>πŸ“ Files:</strong> {len(state.files_to_translate)}</div>
276
  <div><strong>🌍 Language:</strong> {state.target_language}</div>
277
  <div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
 
297
  if token:
298
  os.environ["GITHUB_TOKEN"] = token
299
 
300
+ # Get default reference PR URL from project config if not provided
301
+ if not reference_pr_url:
302
+ config = get_project_config(state.selected_project)
303
+ reference_pr_url = config.reference_pr_url
304
+
305
  # Save GitHub configuration to state
306
  state.github_config.update(
307
  {
308
  "token": token,
309
  "owner": owner,
310
  "repo_name": repo,
311
+ "reference_pr_url": reference_pr_url,
 
312
  }
313
  )
314
 
agent/workflow.py CHANGED
@@ -26,19 +26,20 @@ from logger.github_logger import GitHubLogger
26
 
27
 
28
  def report_translation_target_files(
29
- translate_lang: str, top_k: int = 1
30
  ) -> tuple[str, list[list[str]]]:
31
  """Return the top-k files that need translation, excluding files already in progress.
32
 
33
  Args:
 
34
  translate_lang: Target language to translate
35
  top_k: Number of top-first files to return for translation. (Default 1)
36
  """
37
  # Get files in progress
38
- docs_in_progress, pr_info_list = get_github_issue_open_pr(translate_lang)
39
 
40
  # Get all available files for translation
41
- all_status_report, all_filepath_list = report(translate_lang, top_k * 2) # Get more to account for filtering
42
 
43
  # Filter out files that are already in progress
44
  available_files = [f for f in all_filepath_list if f not in docs_in_progress]
@@ -52,7 +53,7 @@ def report_translation_target_files(
52
  if docs_in_progress:
53
  status_report += f"\n\nπŸ€– Found {len(docs_in_progress)} files in progress for translation:"
54
  for i, file in enumerate(docs_in_progress):
55
- status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
56
  status_report += f"\n\nπŸ“‹ Showing {len(filepath_list)} available files (excluding in-progress):"
57
 
58
  return status_report, [[file] for file in filepath_list]
 
26
 
27
 
28
  def report_translation_target_files(
29
+ project: str, translate_lang: str, top_k: int = 1
30
  ) -> tuple[str, list[list[str]]]:
31
  """Return the top-k files that need translation, excluding files already in progress.
32
 
33
  Args:
34
+ project: Project to translate (e.g., "transformers", "smolagents")
35
  translate_lang: Target language to translate
36
  top_k: Number of top-first files to return for translation. (Default 1)
37
  """
38
  # Get files in progress
39
+ docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang)
40
 
41
  # Get all available files for translation
42
+ all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2) # Get more to account for filtering
43
 
44
  # Filter out files that are already in progress
45
  available_files = [f for f in all_filepath_list if f not in docs_in_progress]
 
53
  if docs_in_progress:
54
  status_report += f"\n\nπŸ€– Found {len(docs_in_progress)} files in progress for translation:"
55
  for i, file in enumerate(docs_in_progress):
56
+ status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
57
  status_report += f"\n\nπŸ“‹ Showing {len(filepath_list)} available files (excluding in-progress):"
58
 
59
  return status_report, [[file] for file in filepath_list]
app.py CHANGED
@@ -19,6 +19,7 @@ from agent.handler import (
19
  update_github_config,
20
  )
21
  from translator.model import Languages
 
22
 
23
  load_dotenv()
24
 
@@ -125,6 +126,11 @@ with gr.Blocks(
125
  with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
126
  with gr.TabItem("1. Find Files", id=0):
127
  with gr.Group():
 
 
 
 
 
128
  lang_dropdown = gr.Radio(
129
  choices=[language.value for language in Languages],
130
  label="🌍 Translate To",
@@ -226,7 +232,7 @@ with gr.Blocks(
226
 
227
  find_btn.click(
228
  fn=process_file_search_handler,
229
- inputs=[lang_dropdown, k_input, chatbot],
230
  outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
231
  )
232
 
 
19
  update_github_config,
20
  )
21
  from translator.model import Languages
22
+ from translator.project_config import get_available_projects
23
 
24
  load_dotenv()
25
 
 
126
  with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
127
  with gr.TabItem("1. Find Files", id=0):
128
  with gr.Group():
129
+ project_dropdown = gr.Radio(
130
+ choices=get_available_projects(),
131
+ label="🎯 Select Project",
132
+ value="transformers",
133
+ )
134
  lang_dropdown = gr.Radio(
135
  choices=[language.value for language in Languages],
136
  label="🌍 Translate To",
 
232
 
233
  find_btn.click(
234
  fn=process_file_search_handler,
235
+ inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
236
  outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
237
  )
238
 
example.env CHANGED
@@ -13,6 +13,5 @@ HF_SPACE_NAME=
13
 
14
  # Secrets for logging to Github
15
  LOG_REPO=
16
- LOG_GITHUB_TOKEN=
17
  LOG_BRANCH=
18
  LOG_FILE_PATH=
 
13
 
14
  # Secrets for logging to Github
15
  LOG_REPO=
 
16
  LOG_BRANCH=
17
  LOG_FILE_PATH=
logger/github_logger.py CHANGED
@@ -12,7 +12,7 @@ class GitHubLogger:
12
  """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.
13
 
14
  Env vars:
15
- - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
16
  - LOG_REPO (format: owner/repo)
17
  - LOG_BRANCH (default: 'log_event')
18
  - LOG_FILE_PATH (default: 'pr_success.log')
@@ -21,9 +21,9 @@ class GitHubLogger:
21
  def __init__(self):
22
  if not LIBS_OK:
23
  raise ImportError("PyGithub not installed. Please install PyGithub.")
24
- token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
25
  if not token:
26
- raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
27
  self._client = Github(token)
28
 
29
  repo_spec = os.environ.get("LOG_REPO")
 
12
  """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.
13
 
14
  Env vars:
15
+ - GITHUB_TOKEN
16
  - LOG_REPO (format: owner/repo)
17
  - LOG_BRANCH (default: 'log_event')
18
  - LOG_FILE_PATH (default: 'pr_success.log')
 
21
  def __init__(self):
22
  if not LIBS_OK:
23
  raise ImportError("PyGithub not installed. Please install PyGithub.")
24
+ token = os.environ.get("GITHUB_TOKEN")
25
  if not token:
26
+ raise ValueError("Missing GITHUB_TOKEN for logging")
27
  self._client = Github(token)
28
 
29
  repo_spec = os.environ.get("LOG_REPO")
translator/project_config.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Project configuration for different HuggingFace repositories."""

from dataclasses import dataclass
from typing import Dict


@dataclass(frozen=True)
class ProjectConfig:
    """Configuration for a specific HuggingFace project.

    Instances live in the module-level ``PROJECTS`` registry and are returned
    by reference from ``get_project_config``. The dataclass is frozen so that
    a caller cannot accidentally rebind a field and silently change the
    configuration seen by every other caller.
    """

    # Project name as written in the registry entry (e.g. "Transformers").
    name: str
    # Base GitHub repository URL, without a trailing slash.
    repo_url: str
    # GitHub API URL of the repository's recursive git tree on ``main``.
    api_url: str
    # Root of the documentation sources inside the repository.
    docs_path: str
    # language -> issue_id
    github_issues: Dict[str, str]
    # Default reference pull request URL for the project.
    reference_pr_url: str


# Project configurations, keyed by the project key used throughout the app.
# Insertion order is the order returned by ``get_available_projects``.
PROJECTS: Dict[str, ProjectConfig] = {
    "transformers": ProjectConfig(
        name="Transformers",
        repo_url="https://github.com/huggingface/transformers",
        api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
        docs_path="docs/source",
        github_issues={"ko": "20179"},
        reference_pr_url="https://github.com/huggingface/transformers/pull/24968",
    ),
    "smolagents": ProjectConfig(
        name="SmolAgents",
        repo_url="https://github.com/huggingface/smolagents",
        api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
        docs_path="docs/source",
        # To be filled when issue is created.
        # NOTE(review): "20179" is the same id as the transformers entry above,
        # so it looks like a placeholder -- confirm and replace with the real
        # smolagents issue id.
        github_issues={"ko": "20179"},
        # To be filled with actual PR URL.
        reference_pr_url="https://github.com/huggingface/smolagents/pull/1581",
    ),
}


def get_project_config(project_key: str) -> ProjectConfig:
    """Get project configuration by key.

    Args:
        project_key: Key of the project in ``PROJECTS`` (e.g. "transformers").

    Returns:
        The registered ``ProjectConfig`` for ``project_key``.

    Raises:
        ValueError: If ``project_key`` is not a registered project. The message
            lists the available keys.
    """
    try:
        return PROJECTS[project_key]
    except KeyError:
        # Re-raise as ValueError (the documented contract) and drop the
        # KeyError context, which adds nothing to the message below.
        raise ValueError(
            f"Unknown project: {project_key}. Available: {list(PROJECTS)}"
        ) from None


def get_available_projects() -> list[str]:
    """Get list of available project keys, in registry order."""
    return list(PROJECTS)
translator/retriever.py CHANGED
@@ -5,15 +5,22 @@ from pathlib import Path
5
  import requests
6
 
7
  from .model import Languages, Summary, TranslationDoc
 
8
 
9
- URL = "https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1"
10
 
11
-
12
- def get_github_repo_files():
13
  """
14
  Get github repo files
15
  """
16
- response = requests.get(URL)
 
 
 
 
 
 
 
 
17
 
18
  data = response.json()
19
  all_items = data.get("tree", [])
@@ -26,27 +33,33 @@ def get_github_repo_files():
26
  return file_paths
27
 
28
 
29
- def get_github_issue_open_pr(lang: str = "ko"):
30
  """
31
- Get open PR in the github issue, filtered by title starting with '🌐 [i18n-KO]'.
32
  """
33
- if lang == "ko":
34
- issue_id = "20179"
35
- else:
36
- raise ValueError(
37
- "No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
38
- )
39
 
40
  headers = {
41
  "Accept": "application/vnd.github+json",
42
  }
43
 
 
 
 
 
 
44
  all_open_prs = []
45
  page = 1
46
  per_page = 100 # Maximum allowed by GitHub API
47
 
48
  while True:
49
- url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open&page={page}&per_page={per_page}"
 
50
  response = requests.get(url, headers=headers)
51
 
52
  if response.status_code != 200:
@@ -63,17 +76,20 @@ def get_github_issue_open_pr(lang: str = "ko"):
63
  if len(page_prs) < per_page:
64
  break
65
 
66
- filtered_prs = [pr for pr in all_open_prs if pr["title"].startswith("🌐 [i18n-KO]")]
67
 
68
- pattern = re.compile(r"`([^`]+\.md)`")
 
69
 
70
- filenames = [
71
- "docs/source/en/" + match.group(1)
72
- for pr in filtered_prs
73
- if (match := pattern.search(pr["title"]))
74
- ]
 
 
75
  pr_info_list = [
76
- f"https://github.com/huggingface/transformers/pull/{pr['url'].rstrip('/').split('/')[-1]}"
77
  for pr in filtered_prs
78
  ]
79
  return filenames, pr_info_list
@@ -99,11 +115,11 @@ def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
99
  return report, first_missing_docs
100
 
101
 
102
- def report(target_lang: str, top_k: int = 1) -> tuple[str, list[str]]:
103
  """
104
  Generate a report for the translated docs
105
  """
106
- docs_file = get_github_repo_files()
107
 
108
  base_docs_path = Path("docs/source")
109
  en_docs_path = Path("docs/source/en")
 
5
  import requests
6
 
7
  from .model import Languages, Summary, TranslationDoc
8
+ from .project_config import get_project_config
9
 
 
10
 
11
+ def get_github_repo_files(project: str = "transformers"):
 
12
  """
13
  Get github repo files
14
  """
15
+ config = get_project_config(project)
16
+
17
+ # Add GitHub token if available to avoid rate limiting
18
+ headers = {}
19
+ github_token = os.environ.get("GITHUB_TOKEN")
20
+ if github_token:
21
+ headers["Authorization"] = f"token {github_token}"
22
+
23
+ response = requests.get(config.api_url, headers=headers)
24
 
25
  data = response.json()
26
  all_items = data.get("tree", [])
 
33
  return file_paths
34
 
35
 
36
+ def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
37
  """
38
+ Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
39
  """
40
+ config = get_project_config(project)
41
+ issue_id = config.github_issues.get(lang)
42
+
43
+ # Fail fast when no GitHub issue is registered for this project/language
44
+ if not issue_id:
45
+ raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
46
 
47
  headers = {
48
  "Accept": "application/vnd.github+json",
49
  }
50
 
51
+ # Add GitHub token if available to avoid rate limiting
52
+ github_token = os.environ.get("GITHUB_TOKEN")
53
+ if github_token:
54
+ headers["Authorization"] = f"token {github_token}"
55
+
56
  all_open_prs = []
57
  page = 1
58
  per_page = 100 # Maximum allowed by GitHub API
59
 
60
  while True:
61
+ repo_path = config.repo_url.replace("https://github.com/", "")
62
+ url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
63
  response = requests.get(url, headers=headers)
64
 
65
  if response.status_code != 200:
 
76
  if len(page_prs) < per_page:
77
  break
78
 
79
+ filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
80
 
81
+ # Pattern to match both `filename.md` and filename.md formats
82
+ pattern = re.compile(r"(?:`([^`]+\.md)`|(\w+\.md))")
83
 
84
+ filenames = []
85
+ for pr in filtered_prs:
86
+ match = pattern.search(pr["title"])
87
+ if match:
88
+ # Use group 1 (with backticks) or group 2 (without backticks)
89
+ filename = match.group(1) or match.group(2)
90
+ filenames.append("docs/source/en/" + filename)
91
  pr_info_list = [
92
+ f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}"
93
  for pr in filtered_prs
94
  ]
95
  return filenames, pr_info_list
 
115
  return report, first_missing_docs
116
 
117
 
118
+ def report(project: str, target_lang: str, top_k: int = 1) -> tuple[str, list[str]]:
119
  """
120
  Generate a report for the translated docs
121
  """
122
+ docs_file = get_github_repo_files(project)
123
 
124
  base_docs_path = Path("docs/source")
125
  en_docs_path = Path("docs/source/en")