File size: 5,081 Bytes

83daab2

import os
import json
import random
from tqdm import tqdm
from itertools import islice
from datasets import load_dataset

from typing import List


# Mapping from Kazakh Cyrillic characters to Kazakh Latin characters
# (described by the original author as the alphabet "from 2021 onwards").
# Keys are single Cyrillic letters in both cases; values are the Latin
# replacement, which may be one letter, two letters (e.g. "Ts"), or the empty
# string when the letter is dropped altogether.
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",      # Х and Һ (next line) collapse onto the same 'H/h'
    "Һ": "H", "һ": "h",

    "И": "I", "и": "i",      # used for [и], [й]; Й below gets the same output
    # NOTE(review): the 2021 alphabet is understood to use dotted 'İ' for
    # uppercase И/Й; here it shares 'I' with uppercase І — confirm intended.
    "І": "I", "і": "ı",      # uppercase shares 'I' with И; lowercase is dotless 'ı'
    "Ж": "J", "ж": "j",

    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",

    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",

    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",

    "У": "U", "у": "u",      # basic 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū",      # back rounded, used frequently
    "Ү": "Ü", "ү": "ü",      # front rounded

    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i",      # same treatment as И
    "Ц": "Ts", "ц": "ts",    # for Russian borrowings; uppercase yields mixed-case "Ts"
    "Ч": "Ch", "ч": "ch",    # two-letter output, mixed-case "Ch" for uppercase
    "Щ": "Ş", "щ": "ş",      # simplified to the same 'Ş/ş' as Ш

    "Э": "E", "э": "e",      # same output as Е
    "Ю": "Iu", "ю": "iu",    # borrowed words only
    "Я": "Ia", "я": "ia",

    "Ъ": "", "ъ": "",        # hard sign is dropped (empty replacement)
    "Ь": "", "ь": "",        # soft sign is dropped (empty replacement)

    "З": "Z", "з": "z",

    # Additional (not in table but used in borrowings)
    "Ё": "Io", "ё": "io",
}


def convert_to_latin(text: str) -> str:
    """
    Transliterate `text` character by character using `cyrillic_to_latin`.

    Every character that has an entry in the mapping is replaced by its Latin
    counterpart (possibly several letters, or nothing at all); any other
    character is copied through unchanged.
    """
    latin_chars = [cyrillic_to_latin.get(symbol, symbol) for symbol in text]

    return "".join(latin_chars)


def _make_variant(sentence: str) -> str:
    """
    Build the slightly altered counterpart of a Latin-script sentence.
    """
    # Drop full stops and commas first, so the spelling rules below also see
    # letters that were only separated by punctuation
    without_punct = sentence.replace(".", "").replace(",", "")

    # Light spelling variants
    respelled = without_punct.replace("ğa", "ga").replace("ñ", "n")

    # Uppercases the first character and lowercases everything after it
    return respelled.capitalize()


def create_augmented_pairs(sentences: List) -> List:
    """
    Build pairs of Kazakh Latin sentences: an original and a minor variation.

    As many sentences are drawn at random (with replacement) as there are
    inputs, so a sentence may appear several times or not at all. A draw whose
    variation equals the original is discarded. Each result is a dictionary of
    the form {"texts": [original, variation]}.
    """
    # One random draw per input sentence
    drawn = [random.choice(sentences) for _ in range(len(sentences))]

    candidates = ((original, _make_variant(original)) for original in drawn)

    return [
        {"texts": [original, variant]}
        for original, variant in candidates
        if original != variant
    ]


# Where the generated JSON Lines dataset is written in the last step
output_path = "src/data/kazakh_latin_pairs.jsonl"

# Collects every non-empty Latin-script text gathered from all sources
latin_sentences = []

# First step: process the Wikipedia dump
print("Processing the Wikipedia dump of Kazakh articles...")

# Walk the whole "extracted" tree; every file is read line by line, each line
# being parsed as a JSON object whose "text" field holds the Cyrillic content
for dirpath, _, filenames in os.walk("src/data/extracted"):
    for filename in tqdm(filenames, desc="Files in Wikipedia dump"):
        dump_path = os.path.join(dirpath, filename)

        with open(dump_path, "r", encoding="utf-8") as dump_file:
            for raw_line in dump_file:
                try:
                    article = json.loads(raw_line)
                    latin = convert_to_latin(article["text"].strip()).strip()

                    if not latin:
                        continue

                    latin_sentences.append(latin)

                except Exception as err:
                    # Best effort: report the malformed line and move on
                    tqdm.write(f"Skipping due to: {err}")

print("Done")

# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")

with open("src/data/kk.txt", "r", encoding="utf-8") as cc100_file:
    # Only the first 50,000 lines of the corpus are consumed
    first_lines = islice(cc100_file, 50_000)

    for raw_line in tqdm(first_lines, total=50_000, desc="Lines in CC100-Kazakh"):
        try:
            latin = convert_to_latin(raw_line.strip()).strip()

            # Blank lines yield nothing worth keeping
            if latin:
                latin_sentences.append(latin)

        except Exception as err:
            tqdm.write(f"Skipping due to: {err}")

# Third step: process the first 15% of the train split of the raw
# ("kazparc_raw") configuration of the "KazParC" dataset
print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split="train[:15%]")

for record in tqdm(kazparc, desc="Entries in KazParC"):
    try:
        # Only entries carrying a string under the "kk" key are usable
        if "kk" not in record or not isinstance(record["kk"], str):
            continue

        latin = convert_to_latin(record["kk"].strip()).strip()

        if latin:
            latin_sentences.append(latin)

    except Exception as err:
        tqdm.write(f"Skipping due to: {err}")



# Fourth and last step: derive the sentence pairs and store them as JSON Lines
print("Creating Latin pairs...")

augmented_pairs = create_augmented_pairs(latin_sentences)

with open(output_path, "w", encoding="utf-8") as out_file:
    for pair in tqdm(augmented_pairs, desc="Dataset entries"):
        try:
            # ensure_ascii=False keeps the non-ASCII Latin letters readable
            serialized = json.dumps(pair, ensure_ascii=False) + "\n"
            out_file.write(serialized)

        except Exception as err:
            tqdm.write(f"Skipping due to: {err}")