"""
Build a Kazakh Latin-script sentence-pair dataset: Cyrillic text from a
Kazakh Wikipedia dump, CC100-Kazakh, and KazParC is transliterated to the
Latin script, lightly perturbed, and written out as JSONL text pairs.
"""

import os
import json
import random
from itertools import islice
from typing import List

from tqdm import tqdm
from datasets import load_dataset
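
# For reproducible augmentation sampling in create_augmented_pairs, a fixed
# seed could be set here, e.g. random.seed(42).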
# Kazakh Cyrillic -> Latin mapping, based on the 2021 revision of the Kazakh
# Latin alphabet.
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i",
    "І": "I", "і": "ı",
    "Й": "I", "й": "i",
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",
    "Ұ": "Ū", "ұ": "ū",
    "Ү": "Ü", "ү": "ü",
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "З": "Z", "з": "z",

    # Cyrillic letters used mainly in Russian loanwords get ad-hoc fallbacks.
    "Ц": "Ts", "ц": "ts",
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",
    "Я": "Ia", "я": "ia",
    "Ё": "Io", "ё": "io",

    # Hard and soft signs carry no sound of their own and are dropped.
    "Ъ": "", "ъ": "",
    "Ь": "", "ь": "",
}
def convert_to_latin(text: str) -> str:
    """
    Apply the Cyrillic -> Latin mapping for Kazakh characters; anything not
    in the mapping (digits, punctuation, Latin text) passes through unchanged.
    """
    return ''.join(cyrillic_to_latin.get(char, char) for char in text)
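
# Quick sanity check of the mapping:
assert convert_to_latin("Қазақстан") == "Qazaqstan"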
def create_augmented_pairs(sentences: List[str]) -> List[dict]:
    """
    Pair original Latin-script sentences with lightly perturbed copies:
    punctuation stripped, a couple of digraph/diacritic substitutions, and
    re-capitalization. Pairs where the perturbation is a no-op are skipped.
    """
    pairs = []

    # Sample with replacement, so some sentences may repeat or be skipped.
    for _ in range(len(sentences)):
        s = random.choice(sentences)

        s_aug = s.replace(".", "").replace(",", "")
        s_aug = s_aug.replace("ğa", "ga").replace("ñ", "n")
        s_aug = s_aug.capitalize()

        if s != s_aug:
            pairs.append({"texts": [s, s_aug]})

    return pairs
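
# Example of a generated pair (illustrative):
#   {"texts": ["Qazaqstan Respublikasy.", "Qazaqstan respublikasy"]}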
output_path = "src/data/kazakh_latin_pairs.jsonl"

latin_sentences = []
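
# Make sure the output directory exists before the final write.
os.makedirs(os.path.dirname(output_path), exist_ok=True)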
print("Processing the Wikipedia dump of Kazakh articles...")

# The dump is assumed to be pre-extracted into JSON-lines files
# (one article per line with a "text" field), e.g. by WikiExtractor.
for root, _, files in os.walk("src/data/extracted"):
    for fname in tqdm(files, desc="Files in Wikipedia dump"):
        with open(os.path.join(root, fname), 'r', encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                    cyr_text = data["text"].strip()
                    lat_text = convert_to_latin(cyr_text).strip()

                    if lat_text:
                        latin_sentences.append(lat_text)

                except Exception as e:
                    tqdm.write(f"Skipping due to: {e}")
                    continue

print("Done")
print("Loading 'CC100-Kazakh' dataset...")

# Only the first 50k lines of the plain-text dump are used.
with open("src/data/kk.txt", 'r', encoding="utf-8") as f:
    for line in tqdm(islice(f, 50_000), total=50_000, desc="Lines in CC100-Kazakh"):
        try:
            cyr_text = line.strip()
            lat_text = convert_to_latin(cyr_text).strip()

            if lat_text:
                latin_sentences.append(lat_text)

        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")
            continue
print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split="train[:15%]")

for entry in tqdm(kazparc, desc="Entries in KazParC"):
    try:
        # Only the Kazakh ("kk") side of the parallel corpus is needed.
        if "kk" in entry and isinstance(entry["kk"], str):
            cyr_text = entry["kk"].strip()
            lat_text = convert_to_latin(cyr_text).strip()

            if lat_text:
                latin_sentences.append(lat_text)

    except Exception as e:
        tqdm.write(f"Skipping due to: {e}")
        continue
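
# Note: load_dataset fetches issai/kazparc from the Hugging Face Hub on the
# first run and serves it from the local cache afterwards.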
print("Creating Latin pairs...")

augmented_pairs = create_augmented_pairs(latin_sentences)

with open(output_path, 'w', encoding="utf-8") as f:
    for pair in tqdm(augmented_pairs, desc="Dataset entries"):
        try:
            # ensure_ascii=False keeps the non-ASCII Latin letters readable.
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")
            continue
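
# The pairs file can then feed a contrastive training setup, e.g. with
# sentence-transformers (a minimal sketch, assuming that library is used):
#
#   from sentence_transformers import InputExample
#   with open(output_path, encoding="utf-8") as f:
#       examples = [InputExample(texts=json.loads(line)["texts"]) for line in f]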