# minidalalm/src/data/generate_lat_pairs.py
import os
import json
import random
from tqdm import tqdm
from itertools import islice
from datasets import load_dataset
from typing import List


# Mapping from Kazakh Cyrillic characters to the Kazakh Latin alphabet adopted in 2021
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",     # merged with Һ, see below
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i",     # covers both [и] and [й]
    "І": "I", "і": "ı",     # lowercase maps to dotless 'ı', unlike и -> 'i'
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",     # plain 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū",     # back rounded vowel, used frequently
    "Ү": "Ü", "ү": "ü",     # front rounded vowel
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i",     # same treatment as И
    "Ц": "Ts", "ц": "ts",   # Russian borrowings
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",     # typically simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",   # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",       # hard/soft signs are simply dropped
    "Ь": "", "ь": "",
    "З": "Z", "з": "z",
    # Additional (not in the official table, but needed for borrowings)
    "Ё": "Io", "ё": "io",
}


def convert_to_latin(text: str) -> str:
    """
    Apply the Cyrillic -> Latin mapping character by character. Characters
    without a mapping (Latin letters, digits, punctuation) pass through unchanged.
    """
    return ''.join(cyrillic_to_latin.get(char, char) for char in text)
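
# Quick sanity checks of the mapping (values follow directly from the table above):
#   convert_to_latin("Қазақстан")    -> "Qazaqstan"
#   convert_to_latin("Сәлем, әлем!") -> "Sälem, älem!"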


def create_augmented_pairs(sentences: List[str]) -> List[dict]:
    """
    Pair randomly sampled Kazakh Latin sentences with lightly perturbed copies
    of themselves. Sampling is with replacement, so a given sentence may
    appear in several pairs.
    """
    pairs = []

    for _ in range(len(sentences)):
        s = random.choice(sentences)

        # Create a minor variation: strip punctuation, apply light spelling
        # variants, then normalise case (capitalize() also lowercases the rest)
        s_aug = s.replace(".", "").replace(",", "")
        s_aug = s_aug.replace("ğa", "ga").replace("ñ", "n")
        s_aug = s_aug.capitalize()

        if s != s_aug:
            pairs.append({"texts": [s, s_aug]})

    return pairs
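
# Example pair (hypothetical input, traced through the rules above):
#   "Qazaqstan Respublikasy." pairs with "Qazaqstan respublikasy"
#   (trailing '.' stripped, then capitalize() lowercases everything
#   after the first character)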

# Output file path
output_path = "src/data/kazakh_latin_pairs.jsonl"

# List to hold all converted Latin sentences
latin_sentences = []

# First step: process all files in the extracted Wikipedia dump
print("Processing the Wikipedia dump of Kazakh articles...")

# Iterate over all folders under the "extracted" dir
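# Each file is assumed to be in WikiExtractor's JSON-lines format:
# one JSON object per article, with the body text under a "text" key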
for root, _, files in os.walk("src/data/extracted"):
    for fname in tqdm(files, desc = "Files in Wikipedia dump"):
        with open(os.path.join(root, fname), 'r', encoding = "utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                    cyr_text = data["text"].strip()
                    lat_text = convert_to_latin(cyr_text).strip()

                    if lat_text:
                        latin_sentences.append(lat_text)
                except Exception as e:
                    tqdm.write(f"Skipping due to: {e}")
                    continue
print("Done")
# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")
with open("src/data/kk.txt", 'r', encoding = "utf-8") as f:
for line in tqdm(islice(f, 50_000), total = 50_000, desc = "Lines in CC100-Kazakh"):
try:
cyr_text = line.strip()
lat_text = convert_to_latin(cyr_text).strip()
if lat_text:
latin_sentences.append(lat_text)
except Exception as e:
tqdm.write(f"Skipping due to: {e}")
continue

# Third step: process the first 15% of the raw "KazParC" corpus (Kazakh side only)
print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split = "train[:15%]")
for entry in tqdm(kazparc, desc = "Entries in KazParC"):
    try:
        # Only the Kazakh ("kk") column is used; other language columns are ignored
        if "kk" in entry and isinstance(entry["kk"], str):
            cyr_text = entry["kk"].strip()
            lat_text = convert_to_latin(cyr_text).strip()

            if lat_text:
                latin_sentences.append(lat_text)
    except Exception as e:
        tqdm.write(f"Skipping due to: {e}")
        continue

# Fourth and last step: create sentence pairs with light variations
print("Creating Latin pairs...")

augmented_pairs = create_augmented_pairs(latin_sentences)

with open(output_path, 'w', encoding = "utf-8") as f:
    for pair in tqdm(augmented_pairs, desc = "Dataset entries"):
        try:
            # One JSON object per line; keep the Latin diacritics unescaped
            f.write(json.dumps(pair, ensure_ascii = False) + "\n")
        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")
            continue
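
# A minimal sketch of reading the output back (assumes the file above was
# written successfully; the {"texts": [...]} layout is a common shape for
# sentence-pair trainers such as sentence-transformers):
#
#     with open(output_path, encoding = "utf-8") as f:
#         pairs = [json.loads(line) for line in f]
#     print(pairs[0]["texts"])  # [original sentence, augmented sentence]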