# Commit 1b1c0d8 by wony617: fix the docs path of the open PR list
import re
import os
from pathlib import Path
import requests
from .model import Languages, Summary, TranslationDoc
from .project_config import get_project_config


def get_github_repo_files(project: str = "transformers"):
    """
    Return the docs file paths found in the project's GitHub repository tree.
    """
    config = get_project_config(project)

    # Add GitHub token if available to avoid rate limiting (optional)
    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    response = requests.get(config.api_url, headers=headers)

    # Handle rate limit with a helpful message
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(
            "GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your "
            f"environment or provide a GitHub token in the UI. Details: {response.text}"
        )

    data = response.json()
    all_items = data.get("tree", [])
    file_paths = [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]
    return file_paths
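
# Illustrative note (not in the original module): for the transformers repo the returned
# list is expected to contain entries like "docs/source/en/index.md" and
# "docs/source/ko/index.md"; the exact contents depend on config.api_url, which should
# point at GitHub's git/trees API with recursive listing enabled.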


def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
    """
    Get the open PRs tracked for translations, filtered by titles containing '[i18n-KO]'.
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)

    # Projects without a registered GitHub issue for this language are not supported
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")

    # The repository file list is required so PR titles can be mapped to real doc paths
    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = {
        "Accept": "application/vnd.github+json",
    }
    # Add GitHub token if available to avoid rate limiting (optional)
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by the GitHub API

    while True:
        repo_path = config.repo_url.replace("https://github.com/", "")
        url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
        response = requests.get(url, headers=headers)

        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise Exception(
                "GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your "
                f"environment or provide a GitHub token in the UI. Details: {response.text}"
            )
        elif response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        if not page_prs:  # No more PRs
            break

        all_open_prs.extend(page_prs)
        page += 1

        # Stop if we got fewer than per_page results (last page)
        if len(page_prs) < per_page:
            break

    # Keep only PRs whose title carries the Korean i18n tag
    filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]

    # Pattern to match filenames after the "Translated" keyword in PR titles
    pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")

    def find_original_file_path(filename_from_title, all_files):
        """Find the exact file path among the repo files by matching the bare filename"""
        if not filename_from_title:
            return None
        # Remove the .md extension for matching
        base_name = filename_from_title.replace(".md", "")
        # Look for an exact filename match among the English docs files
        for file_path in all_files:
            if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
                file_base = file_path.split("/")[-1].replace(".md", "")
                if file_base == base_name:
                    return file_path
        # If no exact match, fall back to the flat docs/source/en path
        return f"docs/source/en/{filename_from_title}"

    filenames = []
    pr_info_list = []
    for pr in filtered_prs:
        match = pattern.search(pr["title"])
        if match:
            # Use group 1 (with backticks) or group 2 (without backticks)
            filename = match.group(1) or match.group(2)
            # Add the .md extension if not present
            if not filename.endswith(".md"):
                filename += ".md"
            # Resolve the actual docs path (including nested directories) from the repo files
            correct_path = find_original_file_path(filename, all_files)
            if correct_path:
                filenames.append(correct_path)
                pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")

    return filenames, pr_info_list
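
# Illustrative example (assumption, not from the original code): a PR titled
# "[i18n-KO] Translated `perf_train_gpu_one.md` to Korean" would be matched by the
# pattern above and resolved to the path of perf_train_gpu_one.md under docs/source/en/,
# with the matching pull-request URL collected alongside it.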


def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Render a summary report and return the first docs that are missing a translation.
    """
    report = f"""
| Item | Count | Percentage |
|------|-------|------------|
| 📂 Hugging Face docs | {summary.files_analyzed} | - |
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
"""
    print(report)

    first_missing_docs = list()
    for file in summary.first_missing_translation_files(table_size):
        first_missing_docs.append(file.original_file)
    print(first_missing_docs)

    return report, first_missing_docs


def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
    """
    Generate a report for the translated docs
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    for file in docs_file:
        if file.endswith(".md"):
            try:
                file_relative_path = Path(file).relative_to(en_docs_path)
            except ValueError:
                continue
            translated_path = os.path.join(
                base_docs_path, lang.value, file_relative_path
            )
            translation_exists = translated_path in docs_file
            doc = TranslationDoc(
                translation_lang=lang.value,
                original_file=file,
                translation_file=translated_path,
                translation_exists=translation_exists,
            )
            summary.append_file(doc)

    return retrieve(summary, top_k)
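

# Minimal usage sketch (illustrative assumption, not part of the original module). It
# assumes network access to the GitHub API and that the Languages enum defines a member
# named "ko"; setting GITHUB_TOKEN in the environment is optional but avoids rate limits.
if __name__ == "__main__":
    docs_files = get_github_repo_files("transformers")

    # Summarize which English docs still lack a Korean translation
    summary_report, missing_docs = report(
        "transformers", target_lang="ko", top_k=10, docs_file=docs_files
    )

    # Collect docs that already have an open translation PR, plus the PR links
    in_progress_files, pr_links = get_github_issue_open_pr(
        "transformers", lang="ko", all_files=docs_files
    )
    print(in_progress_files)
    print(pr_links)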