|
|
|
import gradio as gr |
|
import os |
|
import pandas as pd
import polars as pl  # required by convert_parquet_to_jsonl_polars
|
import requests |
|
from pathlib import Path |
|
import ctranslate2 |
|
import time |
|
import logging |
|
import transformers |
|
import json |
|
import io |
|
from tqdm import tqdm |
|
import subprocess |
|
from huggingface_hub import snapshot_download, upload_file, HfApi, create_repo |
|
|
|
|
|
def download_parquet(url, local_path): |
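    """Streams a Parquet file from `url` to `local_path` in 1 KB chunks."""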
|
response = requests.get(url, stream=True) |
|
if response.status_code == 200: |
|
with open(local_path, 'wb') as file: |
|
for chunk in response.iter_content(chunk_size=1024): |
|
file.write(chunk) |
|
print("File downloaded successfully.") |
|
else: |
|
print(f"Failed to download file, status code: {response.status_code}") |
|
|
|
|
|
def convert_parquet_to_jsonl_polars(input_file, output_dir, override=False): |
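    """Converts a Parquet file to JSONL with polars, skipping the conversion if the output already exists (unless override is set)."""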
|
output_dir_path = Path(output_dir) |
|
output_dir_path.mkdir(parents=True, exist_ok=True) |
|
|
|
input_path = Path(input_file) |
|
output_file_path = output_dir_path / input_path.with_suffix(".jsonl").name |
|
|
|
if output_file_path.exists() and not override: |
|
print(f"Skipping because output exists already: {output_file_path}") |
|
else: |
|
df = pl.read_parquet(input_path) |
|
df.write_ndjson(output_file_path) |
|
print(f"Data written to {output_file_path}") |
|
|
|
def convert_parquet_to_jsonl(parquet_filename, jsonl_filename): |
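    """
    Converts a Parquet file to JSONL with pandas. Note that `jsonl_filename` is treated as an
    output directory: the converted data is written to train.jsonl inside it.
    """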
|
try: |
|
|
|
df = pd.read_parquet(parquet_filename) |
|
logger.info(f"Read Parquet file {parquet_filename} successfully.") |
|
|
|
|
|
json_str = df.to_json(orient='records', lines=True, force_ascii=False) |
|
logger.info(f"Converted Parquet file to JSON string.") |
|
|
|
|
|
json_str = json_str.replace('\\/', '/') |
|
|
|
|
|
        jsonl_filename = os.path.join(jsonl_filename, 'train.jsonl')
|
logger.info(f"Attempting to save to {jsonl_filename}") |
|
with open(jsonl_filename, 'w', encoding='utf-8') as file: |
|
file.write(json_str) |
|
logger.info(f"Data saved to {jsonl_filename}") |
|
except Exception as e: |
|
logger.error(f"Failed to convert Parquet to JSONL: {e}") |
|
raise |
|
|
|
|
|
def count_lines_in_jsonl(file_path): |
|
with open(file_path, 'r', encoding='utf-8') as file: |
|
line_count = sum(1 for _ in file) |
|
return line_count |
|
|
|
def parse_range_specification(range_specification, file_length): |
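    """
    Parses a 1-based range specification such as "1-100,150,200-" into a list of 0-based line indices.
    Open-ended ranges ("200-" or "-100") run to the end or from the start of the file; out-of-bounds
    entries are logged and skipped.
    """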
|
line_indices = [] |
|
ranges = range_specification.split(',') |
|
for r in ranges: |
|
if '-' in r: |
|
parts = r.split('-') |
|
start = int(parts[0]) - 1 if parts[0] else 0 |
|
end = int(parts[1]) - 1 if parts[1] else file_length - 1 |
|
if start < 0 or end >= file_length: |
|
logging.error(f"Range {r} is out of bounds.") |
|
continue |
|
line_indices.extend(range(start, end + 1)) |
|
else: |
|
single_line = int(r) - 1 |
|
if single_line < 0 or single_line >= file_length: |
|
logging.error(f"Line number {r} is out of bounds.") |
|
continue |
|
line_indices.append(single_line) |
|
return line_indices |
|
|
|
def translate_text(text, translator, tokenizer, target_language): |
|
""" |
|
    Translates the given text from English into the target language using CTranslate2 and the WMT21 model,
|
with special handling for newlines and segmenting text longer than 500 characters. |
|
Ensures sequences of newlines (\n\n, \n\n\n, etc.) are accurately reproduced. |
|
""" |
|
try: |
|
segments = [] |
|
newline_sequences = [] |
|
segment = "" |
|
|
|
i = 0 |
|
while i < len(text): |
|
|
|
if text[i] == '\n': |
|
newline_sequence = '\n' |
|
while i + 1 < len(text) and text[i + 1] == '\n': |
|
newline_sequence += '\n' |
|
i += 1 |
|
if segment: |
|
segments.append(segment) |
|
segment = "" |
|
newline_sequences.append(newline_sequence) |
|
else: |
|
segment += text[i] |
|
|
|
if len(segment) >= 500 or i == len(text) - 1: |
|
end_index = max(segment.rfind('.', 0, 500), segment.rfind('?', 0, 500), segment.rfind('!', 0, 500)) |
|
if end_index != -1 and len(segment) > 500: |
|
|
|
segments.append(segment[:end_index+1]) |
|
segment = segment[end_index+1:].lstrip() |
|
else: |
|
|
|
segments.append(segment) |
|
segment = "" |
|
i += 1 |
|
|
|
|
|
translated_segments = [] |
|
for segment in segments: |
|
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment)) |
|
target_prefix = [tokenizer.lang_code_to_token[target_language]] |
|
results = translator.translate_batch([source], target_prefix=[target_prefix]) |
|
target = results[0].hypotheses[0][1:] |
|
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target)) |
|
translated_segments.append(translated_segment) |
|
|
|
|
|
translated_text = "" |
|
for i, segment in enumerate(translated_segments): |
|
translated_text += segment |
|
if i < len(newline_sequences): |
|
translated_text += newline_sequences[i] |
|
|
|
return translated_text.strip() |
|
|
|
except Exception as e: |
|
logging.error(f"An error occurred during translation: {e}") |
|
return None |
|
|
|
def translate_item_ufb(item, raw_file_path, translator, tokenizer, target_language): |
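    """
    Translates the 'prompt' string and the 'chosen'/'rejected' message lists of an
    ultrafeedback-style item, appending the raw translations to a backup file.
    """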
|
try: |
|
|
|
        translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
|
|
|
|
|
translated_chosen = [] |
|
for choice in item['chosen']: |
|
translated_content = translate_text(choice['content'], translator, tokenizer, target_language) |
|
translated_chosen.append({'content': translated_content, 'role': choice['role']}) |
|
|
|
translated_rejected = [] |
|
for choice in item['rejected']: |
|
translated_content = translate_text(choice['content'], translator, tokenizer, target_language) |
|
translated_rejected.append({'content': translated_content, 'role': choice['role']}) |
|
|
|
|
|
with open(raw_file_path, 'a', encoding='utf-8') as raw_file: |
|
raw_file.write(f"Prompt: {translated_prompt}\n") |
|
raw_file.write(f"Chosen: {json.dumps(translated_chosen, ensure_ascii=False)}\n") |
|
raw_file.write(f"Rejected: {json.dumps(translated_rejected, ensure_ascii=False)}\n\n") |
|
|
|
logging.info("Translation request successful.") |
|
|
|
item['prompt'] = translated_prompt |
|
item['chosen'] = translated_chosen |
|
item['rejected'] = translated_rejected |
|
return item |
|
|
|
except Exception as e: |
|
logging.error(f"An error occurred during translation: {e}") |
|
return None |
|
|
|
def validate_item_ufb(item): |
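    """Checks that an ultrafeedback-style item has the required fields with the expected types."""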
|
|
|
required_fields = ['source', 'prompt', 'chosen', 'rejected'] |
|
for field in required_fields: |
|
if field not in item: |
|
logging.warning(f"Missing required field: {field}") |
|
return False |
|
if field == 'prompt' and not isinstance(item['prompt'], str): |
|
logging.warning("Prompt must be a string.") |
|
return False |
|
|
|
|
|
for field in ['chosen', 'rejected']: |
|
if not isinstance(item[field], list) or not item[field]: |
|
logging.warning(f"No entries or incorrect type for section: {field}") |
|
return False |
|
for idx, message in enumerate(item[field]): |
|
if 'content' not in message or 'role' not in message: |
|
logging.warning(f"Missing 'content' or 'role' field in {field} at index {idx}") |
|
return False |
|
if not isinstance(message['content'], str) or not isinstance(message['role'], str): |
|
logging.warning(f"Invalid type for 'content' or 'role' field in {field} at index {idx}") |
|
return False |
|
|
|
return True |
|
|
|
|
|
|
|
def translate_item_mix(item, raw_file_path, translator, tokenizer, target_language): |
|
""" |
|
    Translates the relevant fields in the given item from English into the target language using CTranslate2 and the WMT21 model,
|
and saves the raw response to a backup file. |
|
""" |
|
|
|
try: |
|
|
|
translated_prompts = [] |
|
for message in item['prompt']: |
|
translated_content = translate_text(message['content'], translator, tokenizer, target_language) |
|
translated_prompts.append({'content': translated_content, 'role': message['role']}) |
|
|
|
|
|
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer, target_language) |
|
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer, target_language) |
|
|
|
|
|
with open(raw_file_path, 'a', encoding='utf-8') as raw_file: |
|
raw_file.write("Prompt content:\n") |
|
for translated_prompt in translated_prompts: |
|
raw_file.write(f"{translated_prompt['role']}: {translated_prompt['content']}\n") |
|
raw_file.write(f"Chosen content: {translated_chosen_content}\n") |
|
raw_file.write(f"Rejected content: {translated_rejected_content}\n\n") |
|
|
|
logging.info("Translation request successful.") |
|
except Exception as e: |
|
logging.error(f"An error occurred during translation: {e}") |
|
return None |
|
|
|
|
|
item['prompt'] = translated_prompts |
|
item['chosen'][0]['content'] = translated_chosen_content |
|
item['rejected'][0]['content'] = translated_rejected_content |
|
|
|
logging.info("Translation processing successful.") |
|
return item |
|
|
|
def validate_item_mix(item): |
|
""" |
|
Validates the structure, presence, and content of required fields in the given item, |
|
allowing for multiple elements in the 'prompt' field for multi-turn conversations. |
|
""" |
|
required_fields = ['dataset', 'prompt', 'chosen', 'rejected'] |
|
for field in required_fields: |
|
if field not in item: |
|
logging.warning(f"Missing required field: {field}") |
|
return False |
|
|
|
|
|
if len(item['prompt']) < 1 or len(item['chosen']) != 1 or len(item['rejected']) != 1: |
|
logging.warning("Invalid number of elements in 'prompt', 'chosen', or 'rejected' field.") |
|
return False |
|
|
|
|
|
for choice in item['prompt'] + item['chosen'] + item['rejected']: |
|
if 'content' not in choice or 'role' not in choice: |
|
logging.warning("Missing 'content' or 'role' field in choice.") |
|
return False |
|
if not isinstance(choice['content'], str) or not isinstance(choice['role'], str): |
|
logging.warning("Invalid type for 'content' or 'role' field in choice.") |
|
return False |
|
|
|
return True |
|
|
|
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer, target_language): |
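    """
    Like translate_item_ufb, but keeps a per-item cache of translated strings so that
    duplicate message contents within the same item are only translated once.
    """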
|
try: |
|
translated_texts = {} |
|
|
|
|
|
if item['prompt'] not in translated_texts: |
|
translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language) |
|
translated_texts[item['prompt']] = translated_prompt |
|
else: |
|
translated_prompt = translated_texts[item['prompt']] |
|
|
|
|
|
def get_translated_content(content): |
|
if content not in translated_texts: |
|
translated_texts[content] = translate_text(content, translator, tokenizer, target_language) |
|
return translated_texts[content] |
|
|
|
|
|
def translate_interactions(interactions): |
|
translated_interactions = [] |
|
for interaction in interactions: |
|
translated_content = get_translated_content(interaction['content']) |
|
translated_interactions.append({'content': translated_content, 'role': interaction['role']}) |
|
return translated_interactions |
|
|
|
translated_chosen = translate_interactions(item['chosen']) |
|
translated_rejected = translate_interactions(item['rejected']) |
|
|
|
|
|
with open(raw_file_path, 'a', encoding='utf-8') as raw_file: |
|
raw_file.write(f"Prompt: {translated_prompt}\n") |
|
raw_file.write(f"Chosen: {json.dumps(translated_chosen, ensure_ascii=False)}\n") |
|
raw_file.write(f"Rejected: {json.dumps(translated_rejected, ensure_ascii=False)}\n\n") |
|
|
|
logging.info("Translation request successful.") |
|
|
|
item['prompt'] = translated_prompt |
|
item['chosen'] = translated_chosen |
|
item['rejected'] = translated_rejected |
|
return item |
|
|
|
except Exception as e: |
|
logging.error(f"An error occurred during translation: {e}") |
|
return None |
|
|
|
def validate_item_ufb_cached(item): |
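    """Checks that an ufb_cached-style item has the required fields with the expected types."""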
|
|
|
required_fields = ['source', 'prompt', 'chosen', 'rejected'] |
|
for field in required_fields: |
|
if field not in item: |
|
logging.warning(f"Missing required field: {field}") |
|
return False |
|
|
|
|
|
if not isinstance(item['prompt'], str): |
|
logging.warning("Prompt must be a string.") |
|
return False |
|
|
|
|
|
for field in ['chosen', 'rejected']: |
|
if not isinstance(item[field], list) or not item[field]: |
|
logging.warning(f"No entries or incorrect type for section: {field}") |
|
return False |
|
for idx, message in enumerate(item[field]): |
|
if 'content' not in message or 'role' not in message: |
|
logging.warning(f"Missing 'content' or 'role' field in {field} at index {idx}") |
|
return False |
|
if not isinstance(message['content'], str) or not isinstance(message['role'], str): |
|
logging.warning(f"Invalid type for 'content' or 'role' field in {field} at index {idx}") |
|
return False |
|
|
|
return True |
|
|
|
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language): |
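    """
    Validates and translates the items selected by `line_indices` from the input JSONL,
    appends successful translations to the output JSONL, and records failed items in
    failed_items.jsonl together with a retry range specification in failed_items_index.txt.
    """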
|
try: |
|
|
|
if model_type == "mix": |
|
print ("translating a mix-style model...") |
|
validate_item = validate_item_mix |
|
translate_item = translate_item_mix |
|
elif model_type == "ufb_cached": |
|
print ("translating an ufb_cached-style model...") |
|
validate_item = validate_item_ufb_cached |
|
translate_item = translate_item_ufb_cached |
|
elif model_type == "ufb": |
|
print ("translating an ultrafeedback-style model...") |
|
validate_item = validate_item_ufb |
|
translate_item = translate_item_ufb |
|
else: |
|
raise ValueError(f"Unsupported model_type: {model_type}") |
|
|
|
with open(input_file_path, 'r', encoding='utf-8') as file: |
|
data_points = [json.loads(line) for line in file] |
|
|
|
failed_items = [] |
|
failed_items_indices = [] |
|
|
|
for index in tqdm(line_indices, desc="Processing lines", unit="item"): |
|
item = data_points[index] |
|
|
|
|
|
if not validate_item(item): |
|
logging.warning("Skipping item due to invalid structure.") |
|
failed_items.append(item) |
|
continue |
|
|
|
|
|
translated_item = None |
|
retry_count = 0 |
|
while translated_item is None and retry_count < 3: |
|
print ("going to translate the item...") |
|
translated_item = translate_item(item, raw_file_path, translator, tokenizer, target_language) |
|
retry_count += 1 |
|
if translated_item is None: |
|
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}") |
|
time.sleep(1) |
|
|
|
            if translated_item is None:
                failed_items_indices.append(index)
                failed_items.append(item)
                logging.error("Translation failed after multiple attempts. Skipping item.")
                continue

            # Validate the translated item before writing it to the output file.
            if not validate_item(translated_item):
                logging.warning("Skipping translated item due to invalid structure.")
                failed_items_indices.append(index)
                failed_items.append(item)
                continue

            translated_item['index'] = index
            with open(output_file_path, 'a', encoding='utf-8') as file:
                file.write(json.dumps(translated_item, ensure_ascii=False) + "\n")
|
|
|
with open('failed_items.jsonl', 'w', encoding='utf-8') as file: |
|
for item in failed_items: |
|
file.write(json.dumps(item, ensure_ascii=False) + "\n") |
|
|
|
failed_items_str = generate_failed_items_str(failed_items_indices) |
|
with open('failed_items_index.txt', 'w', encoding='utf-8') as f: |
|
f.write(failed_items_str) |
|
|
|
logging.info("Translation completed successfully.") |
|
|
|
except Exception as e: |
|
logging.error(f"An error occurred: {e}") |
|
|
|
def generate_failed_items_str(indices): |
|
""" |
|
    Converts a list of 0-based failed item indices into a compact range string (e.g. "3-5,12").
|
""" |
|
if not indices: |
|
return "" |
|
|
|
|
|
indices.sort() |
|
range_start = indices[0] |
|
current = range_start |
|
ranges = [] |
|
|
|
for i in indices[1:]: |
|
if i == current + 1: |
|
current = i |
|
else: |
|
if range_start == current: |
|
ranges.append(f"{range_start}") |
|
else: |
|
ranges.append(f"{range_start}-{current}") |
|
range_start = current = i |
|
|
|
|
|
if range_start == current: |
|
ranges.append(f"{range_start}") |
|
else: |
|
ranges.append(f"{range_start}-{current}") |
|
|
|
return ",".join(ranges) |
|
|
|
|
|
def upload_output_to_huggingface(output_file_path, repo_name, token): |
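    """Uploads the output file to a Hugging Face dataset repository, creating the repository first if it does not exist."""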
|
api = HfApi() |
|
|
|
|
|
try: |
|
print ("checking repo:", repo_name) |
|
api.repo_info(repo_id=repo_name, repo_type="dataset", token=token) |
|
except Exception as e: |
|
if "404" in str(e): |
|
|
|
print ("creating it...") |
|
create_repo(repo_id=repo_name, repo_type="dataset", token=token) |
|
print(f"Created repository: {repo_name}") |
|
else: |
|
print(f"Failed to check repository existence: {e}") |
|
return |
|
|
|
|
|
try: |
|
print ("starting dataset upload from:", output_file_path) |
|
upload_file( |
|
path_or_fileobj=output_file_path, |
|
path_in_repo=output_file_path, |
|
repo_id=repo_name, |
|
repo_type="dataset", |
|
token=token |
|
) |
|
print(f"Uploaded {output_file_path} to Hugging Face repository: {repo_name}") |
|
except Exception as e: |
|
print(f"Failed to upload {output_file_path} to Hugging Face: {e}") |
|
raise |
|
|
|
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer, target_language): |
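    """
    End-to-end pipeline: downloads the Parquet split, converts it to JSONL, translates the
    selected line range, and uploads the resulting JSONL to the Hugging Face Hub.
    """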
|
try: |
|
|
|
download_parquet(train_url, local_parquet_path) |
|
except Exception as e: |
|
logging.error(f"Failed to download the Parquet file from {train_url}: {e}") |
|
return |
|
|
|
try: |
|
|
|
convert_parquet_to_jsonl(local_parquet_path, output_dir) |
|
except Exception as e: |
|
logging.error(f"Failed to convert Parquet to JSONL: {e}") |
|
return |
|
|
|
try: |
|
|
|
subprocess.run(["mv", f"{output_dir}/train.jsonl", input_file_path], check=True) |
|
except subprocess.CalledProcessError as e: |
|
logging.error(f"Failed to rename the file from 'train.jsonl' to {input_file_path}: {e}") |
|
return |
|
|
|
try: |
|
|
|
line_count = count_lines_in_jsonl(input_file_path) |
|
logging.info(f"Number of lines in the file: {line_count}") |
|
except Exception as e: |
|
logging.error(f"Failed to count lines in {input_file_path}: {e}") |
|
return |
|
|
|
try: |
|
|
|
line_indices = parse_range_specification(range_specification, file_length=line_count) |
|
if not line_indices: |
|
logging.error("No valid line indices to process. Please check the range specifications.") |
|
return |
|
except Exception as e: |
|
logging.error(f"Error parsing range specification '{range_specification}': {e}") |
|
return |
|
|
|
try: |
|
|
|
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language) |
|
except Exception as e: |
|
logging.error(f"Failed to process the file {input_file_path}: {e}") |
|
return |
|
|
|
try: |
|
|
|
upload_output_to_huggingface(output_file_path, output_repo_name, token) |
|
except Exception as e: |
|
logging.error(f"Failed to upload {output_file_path} to Hugging Face: {e}") |
|
|
|
|
|
log_stream = io.StringIO() |
|
logging.basicConfig(level=logging.INFO, |
|
format='%(asctime)s - %(levelname)s - %(message)s', |
|
handlers=[ |
|
logging.FileHandler("translation.log", mode='a'), |
|
logging.StreamHandler(log_stream) |
|
]) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
def main(dataset_url, model_type, output_dataset_name, range_specification, target_language, token: gr.OAuthToken | None, profile: gr.OAuthProfile | None): |
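    """Gradio callback: loads the CTranslate2 model and tokenizer, then runs the dataset translation pipeline."""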
|
try: |
|
|
|
if token is None or profile is None or token.token is None or profile.username is None: |
|
return "### You must be logged in to use this service." |
|
|
|
if token: |
|
logger.info("Logged in to Hugging Face") |
|
|
|
|
|
tokenizer_name = "facebook/wmt21-dense-24-wide-en-x" |
|
model_repo_name = "cstr/wmt21ct2_int8" |
|
|
|
|
|
model_path = snapshot_download(repo_id=model_repo_name, token=token.token) |
|
logger.info(f"Model downloaded to: {model_path}") |
|
|
|
|
|
translator = ctranslate2.Translator(model_path, device="auto") |
|
logger.info("CTranslate2 model loaded successfully.") |
|
|
|
|
|
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name) |
|
tokenizer.src_lang = "en" |
|
tokenizer.tgt_lang = target_language |
|
logger.info("Tokenizer loaded successfully.") |
|
|
|
|
|
task = { |
|
"url": dataset_url, |
|
"local_path": "train.parquet", |
|
"input_file": f"{model_type}_en.jsonl", |
|
"output_file": f"{model_type}_{target_language}.jsonl", |
|
"raw_file": f"{model_type}_{target_language}_raw.jsonl", |
|
"range_spec": range_specification, |
|
"model_type": model_type, |
|
"target_language": target_language |
|
} |
|
|
|
|
|
translate_dataset( |
|
train_url=task["url"], |
|
local_parquet_path=task["local_path"], |
|
input_file_path=task["input_file"], |
|
output_file_path=task["output_file"], |
|
output_dir=".", |
|
output_repo_name=output_dataset_name, |
|
raw_file_path=task["raw_file"], |
|
token=token.token, |
|
range_specification=task["range_spec"], |
|
model_type=task["model_type"], |
|
translator=translator, |
|
tokenizer=tokenizer, |
|
target_language=task["target_language"] |
|
) |
|
logger.info("Dataset translation completed!") |
|
return "Dataset translation completed!\n\n### Logs:\n" + log_stream.getvalue() |
|
else: |
|
return "Login failed. Please try again." |
|
except Exception as e: |
|
logger.error(f"An error occurred in the main function: {e}") |
|
return f"An error occurred: {e}\n\n### Logs:\n{log_stream.getvalue()}" |
|
|
|
|
|
|
|
gradio_title = "WMT21 Dataset Translation"
|
gradio_desc = """This tool translates English datasets using the WMT21 translation model.
## What Does This Tool Do:
- Translates datasets (as Parquet files) with structures based on the selected dataset type (see below).
- The translation model (facebook/wmt21-dense-24-wide-en-x) supports the following target languages: Hausa (ha), Icelandic (is), Japanese (ja), Czech (cs), Russian (ru), Chinese (zh), German (de).
- Uploads the translated dataset as JSONL to Hugging Face.
- At the moment, this runs on CPU only and is therefore very slow."""
|
datasets_desc = """## Dataset Types:
Note: additional fields are kept (untranslated), and an index field is added, which makes it easier to verify results.
- **mix**:
  - `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
  - `chosen`: List containing a single dictionary with 'content' and 'role' fields.
  - `rejected`: List containing a single dictionary with 'content' and 'role' fields.
- **ufb_cached**:
  - `prompt`: String (user input).
  - `chosen`: List of dictionaries with 'content' and 'role' fields.
  - `rejected`: List of dictionaries with 'content' and 'role' fields.
- **ufb**:
  - like ufb_cached, but without checking for already-translated strings
## Backend:
The translation model is int8-quantized from facebook/wmt21-dense-24-wide-en-x and runs via ctranslate2 on the Hugging Face Hub."""
|
|
|
|
|
theme = gr.themes.Soft(text_size="lg", spacing_size="lg") |
|
|
|
with gr.Blocks(theme=theme) as demo: |
|
gr.HTML(f"""<h1 align="center" id="space-title">{gradio_title}</h1>""") |
|
gr.Markdown(gradio_desc) |
|
|
|
with gr.Row(variant="panel"): |
|
        gr.Markdown(value="## Login to Hugging Face")
|
gr.LoginButton(min_width=380) |
|
|
|
    gr.Markdown(value="**This is needed to upload the resulting dataset.**")
|
|
|
with gr.Row(equal_height=False): |
|
with gr.Column(): |
|
dataset_url = gr.Textbox(label="Input Dataset URL", lines=2, placeholder = "https://huggingface.co/datasets/alvarobartt/dpo-mix-7k-simplified/resolve/main/data/train-00000-of-00001.parquet?download=true") |
|
model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type") |
|
output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "cstr/translated_datasets") |
|
range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100") |
|
target_language = gr.Dropdown(choices=["ha", "is", "ja", "cs", "ru", "zh", "de"], label="Target Language") |
|
|
|
with gr.Column(): |
|
output = gr.Markdown(label="Output") |
|
|
|
submit_btn = gr.Button("Translate Dataset", variant="primary") |
|
submit_btn.click(main, inputs=[dataset_url, model_type, output_dataset_name, range_specification, target_language], outputs=output) |
|
|
|
|
|
gr.Markdown(datasets_desc) |
|
|
|
demo.queue(max_size=10).launch(share=True, show_api=True) |