wony617
committed on
Commit
·
1b1c0d8
1
Parent(s):
a487d1c
fix the docs path of the open pr list
Browse files
- agent/workflow.py +9 -6
- translator/retriever.py +54 -10
agent/workflow.py
CHANGED
@@ -11,7 +11,7 @@ from translator.content import (
|
|
11 |
llm_translate,
|
12 |
preprocess_content,
|
13 |
)
|
14 |
-
from translator.retriever import report, get_github_issue_open_pr
|
15 |
# GitHub PR Agent import
|
16 |
try:
|
17 |
from pr_generator.agent import GitHubPRAgent
|
@@ -35,11 +35,14 @@ def report_translation_target_files(
|
|
35 |
translate_lang: Target language to translate
|
36 |
top_k: Number of top-first files to return for translation. (Default 1)
|
37 |
"""
|
38 |
-
# Get files
|
39 |
-
|
40 |
-
|
41 |
-
# Get all available files for translation
|
42 |
-
all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2) # Get more to account for filtering
|
|
|
|
|
|
|
43 |
|
44 |
# Filter out files that are already in progress
|
45 |
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
|
|
11 |
llm_translate,
|
12 |
preprocess_content,
|
13 |
)
|
14 |
+
from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
|
15 |
# GitHub PR Agent import
|
16 |
try:
|
17 |
from pr_generator.agent import GitHubPRAgent
|
|
|
35 |
translate_lang: Target language to translate
|
36 |
top_k: Number of top-first files to return for translation. (Default 1)
|
37 |
"""
|
38 |
+
# Get repo files once to avoid duplicate API calls
|
39 |
+
all_repo_files = get_github_repo_files(project)
|
40 |
+
|
41 |
+
# Get all available files for translation using the file list
|
42 |
+
all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
|
43 |
+
|
44 |
+
# Get files in progress using the same file list
|
45 |
+
docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
|
46 |
|
47 |
# Filter out files that are already in progress
|
48 |
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
translator/retriever.py
CHANGED
@@ -37,7 +37,7 @@ def get_github_repo_files(project: str = "transformers"):
|
|
37 |
return file_paths
|
38 |
|
39 |
|
40 |
-
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
|
41 |
"""
|
42 |
Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
|
43 |
"""
|
@@ -48,6 +48,10 @@ def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
|
|
48 |
if not issue_id:
|
49 |
raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
|
50 |
|
|
|
|
|
|
|
|
|
51 |
headers = {
|
52 |
"Accept": "application/vnd.github+json",
|
53 |
}
|
@@ -84,20 +88,59 @@ def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
|
|
84 |
|
85 |
filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
|
86 |
|
87 |
-
# Pattern to match
|
88 |
-
pattern = re.compile(r"(?:`([^`]
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
filenames = []
|
|
|
|
|
91 |
for pr in filtered_prs:
|
92 |
match = pattern.search(pr["title"])
|
93 |
if match:
|
94 |
# Use group 1 (with backticks) or group 2 (without backticks)
|
95 |
filename = match.group(1) or match.group(2)
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
return filenames, pr_info_list
|
102 |
|
103 |
|
@@ -121,11 +164,12 @@ def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
|
121 |
return report, first_missing_docs
|
122 |
|
123 |
|
124 |
-
def report(project: str, target_lang: str, top_k: int = 1) -> tuple[str, list[str]]:
|
125 |
"""
|
126 |
Generate a report for the translated docs
|
127 |
"""
|
128 |
-
docs_file
|
|
|
129 |
|
130 |
base_docs_path = Path("docs/source")
|
131 |
en_docs_path = Path("docs/source/en")
|
|
|
37 |
return file_paths
|
38 |
|
39 |
|
40 |
+
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
|
41 |
"""
|
42 |
Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
|
43 |
"""
|
|
|
48 |
if not issue_id:
|
49 |
raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
|
50 |
|
51 |
+
# Require all_files parameter
|
52 |
+
if all_files is None:
|
53 |
+
raise ValueError("Repository file list must be provided")
|
54 |
+
|
55 |
headers = {
|
56 |
"Accept": "application/vnd.github+json",
|
57 |
}
|
|
|
88 |
|
89 |
filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
|
90 |
|
91 |
+
# Pattern to match filenames after "Translated" keyword
|
92 |
+
pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
|
93 |
|
94 |
+
def find_original_file_path(filename_from_title, all_files):
|
95 |
+
"""Find the exact file path from repo files by matching filename"""
|
96 |
+
if not filename_from_title:
|
97 |
+
return None
|
98 |
+
|
99 |
+
# Remove .md extension for matching
|
100 |
+
base_name = filename_from_title.replace('.md', '')
|
101 |
+
|
102 |
+
# Look for exact matches in repo files
|
103 |
+
for file_path in all_files:
|
104 |
+
if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
|
105 |
+
file_base = file_path.split("/")[-1].replace('.md', '')
|
106 |
+
if file_base == base_name:
|
107 |
+
return file_path
|
108 |
+
|
109 |
+
# If no exact match, fallback to simple path
|
110 |
+
return f"docs/source/en/{filename_from_title}"
|
111 |
+
|
112 |
filenames = []
|
113 |
+
pr_info_list = []
|
114 |
+
|
115 |
for pr in filtered_prs:
|
116 |
match = pattern.search(pr["title"])
|
117 |
if match:
|
118 |
# Use group 1 (with backticks) or group 2 (without backticks)
|
119 |
filename = match.group(1) or match.group(2)
|
120 |
+
# Add .md extension if not present
|
121 |
+
if not filename.endswith('.md'):
|
122 |
+
filename += '.md'
|
123 |
+
|
124 |
+
# Find the correct file path by matching filename
|
125 |
+
correct_path = None
|
126 |
+
if filename:
|
127 |
+
# Remove .md extension for matching
|
128 |
+
base_name = filename.replace('.md', '')
|
129 |
+
|
130 |
+
# Look for exact matches in repo files
|
131 |
+
for file_path in all_files:
|
132 |
+
if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
|
133 |
+
file_base = file_path.split("/")[-1].replace('.md', '')
|
134 |
+
if file_base == base_name:
|
135 |
+
correct_path = file_path
|
136 |
+
break
|
137 |
+
|
138 |
+
# If no exact match, fallback to simple path
|
139 |
+
if not correct_path:
|
140 |
+
correct_path = f"docs/source/en/{filename}"
|
141 |
+
if correct_path:
|
142 |
+
filenames.append(correct_path)
|
143 |
+
pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
|
144 |
return filenames, pr_info_list
|
145 |
|
146 |
|
|
|
164 |
return report, first_missing_docs
|
165 |
|
166 |
|
167 |
+
def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
|
168 |
"""
|
169 |
Generate a report for the translated docs
|
170 |
"""
|
171 |
+
if docs_file is None:
|
172 |
+
raise ValueError("Repository file list must be provided")
|
173 |
|
174 |
base_docs_path = Path("docs/source")
|
175 |
en_docs_path = Path("docs/source/en")
|