# Commit 1b1c0d8 by wony617: fix the docs path of the open PR list
import re
import os
from pathlib import Path
import requests
from .model import Languages, Summary, TranslationDoc
from .project_config import get_project_config


def get_github_repo_files(project: str = "transformers"):
    """
    Return the docs file paths found in the project's GitHub repository tree.
    """
    config = get_project_config(project)

    # Add GitHub token if available to avoid rate limiting (optional)
    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    response = requests.get(config.api_url, headers=headers)

    # Handle rate limit with a helpful message
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(
            "GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your "
            f"environment or provide a GitHub token in the UI. Details: {response.text}"
        )

    data = response.json()
    all_items = data.get("tree", [])
    file_paths = [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]
    return file_paths
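
# Illustrative note (not in the original module): for the transformers repo the returned
# list is expected to contain entries like "docs/source/en/index.md" and
# "docs/source/ko/index.md"; the exact contents depend on config.api_url, which should
# point at GitHub's git/trees API with recursive listing enabled.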


def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
    """
    Get the open PRs tracked for translations, filtered by titles containing '[i18n-KO]'.
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)

    # Projects without a registered GitHub issue for this language are not supported
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")

    # The repository file list is required so PR titles can be mapped to real doc paths
    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = {
        "Accept": "application/vnd.github+json",
    }
    # Add GitHub token if available to avoid rate limiting (optional)
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by the GitHub API

    while True:
        repo_path = config.repo_url.replace("https://github.com/", "")
        url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
        response = requests.get(url, headers=headers)

        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise Exception(
                "GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your "
                f"environment or provide a GitHub token in the UI. Details: {response.text}"
            )
        elif response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        if not page_prs:  # No more PRs
            break

        all_open_prs.extend(page_prs)
        page += 1

        # Stop if we got fewer than per_page results (last page)
        if len(page_prs) < per_page:
            break

    # Keep only PRs whose title carries the Korean i18n tag
    filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]

    # Pattern to match filenames after the "Translated" keyword in PR titles
    pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")

    def find_original_file_path(filename_from_title, all_files):
        """Find the exact file path among the repo files by matching the bare filename"""
        if not filename_from_title:
            return None
        # Remove the .md extension for matching
        base_name = filename_from_title.replace(".md", "")
        # Look for an exact filename match among the English docs files
        for file_path in all_files:
            if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
                file_base = file_path.split("/")[-1].replace(".md", "")
                if file_base == base_name:
                    return file_path
        # If no exact match, fall back to the flat docs/source/en path
        return f"docs/source/en/{filename_from_title}"

    filenames = []
    pr_info_list = []
    for pr in filtered_prs:
        match = pattern.search(pr["title"])
        if match:
            # Use group 1 (with backticks) or group 2 (without backticks)
            filename = match.group(1) or match.group(2)
            # Add the .md extension if not present
            if not filename.endswith(".md"):
                filename += ".md"
            # Resolve the actual docs path (including nested directories) from the repo files
            correct_path = find_original_file_path(filename, all_files)
            if correct_path:
                filenames.append(correct_path)
                pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")

    return filenames, pr_info_list
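
# Illustrative example (assumption, not from the original code): a PR titled
# "[i18n-KO] Translated `perf_train_gpu_one.md` to Korean" would be matched by the
# pattern above and resolved to the path of perf_train_gpu_one.md under docs/source/en/,
# with the matching pull-request URL collected alongside it.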


def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Render a summary report and return the first docs that are missing a translation.
    """
    report = f"""
| Item | Count | Percentage |
|------|-------|------------|
| 📂 Hugging Face docs | {summary.files_analyzed} | - |
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
"""
    print(report)

    first_missing_docs = list()
    for file in summary.first_missing_translation_files(table_size):
        first_missing_docs.append(file.original_file)
    print(first_missing_docs)

    return report, first_missing_docs


def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
    """
    Generate a report for the translated docs
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    for file in docs_file:
        if file.endswith(".md"):
            try:
                file_relative_path = Path(file).relative_to(en_docs_path)
            except ValueError:
                continue
            translated_path = os.path.join(
                base_docs_path, lang.value, file_relative_path
            )
            translation_exists = translated_path in docs_file
            doc = TranslationDoc(
                translation_lang=lang.value,
                original_file=file,
                translation_file=translated_path,
                translation_exists=translation_exists,
            )
            summary.append_file(doc)

    return retrieve(summary, top_k)
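

# Minimal usage sketch (illustrative assumption, not part of the original module). It
# assumes network access to the GitHub API and that the Languages enum defines a member
# named "ko"; setting GITHUB_TOKEN in the environment is optional but avoids rate limits.
if __name__ == "__main__":
    docs_files = get_github_repo_files("transformers")

    # Summarize which English docs still lack a Korean translation
    summary_report, missing_docs = report(
        "transformers", target_lang="ko", top_k=10, docs_file=docs_files
    )

    # Collect docs that already have an open translation PR, plus the PR links
    in_progress_files, pr_links = get_github_issue_open_pr(
        "transformers", lang="ko", all_files=docs_files
    )
    print(in_progress_files)
    print(pr_links)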