File size: 5,081 Bytes
83daab2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import os
import json
import random
from tqdm import tqdm
from itertools import islice
from datasets import load_dataset
from typing import List
# Kazakh Cyrillic -> Latin character mapping, following the alphabet adopted
# by the 2021 reform (Ä, Ğ, Ñ, Ö, Ş, Ü, Ū; dotless I/ı for І/і and dotted
# İ/i for И/Й). Letters that occur only in Russian borrowings (Ц, Ч, Э, Ю,
# Я, Ё) get pragmatic digraph fallbacks; hard/soft signs are dropped.
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",  # Х and Һ merge into H/h in the Latin alphabet
    "Һ": "H", "һ": "h",
    "И": "İ", "и": "i",  # dotted İ/i (fixed: was plain "I", colliding with І)
    "І": "I", "і": "ı",  # dotless I/ı, distinct from И/Й
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",  # plain 'u'; distinct from Ұ and Ү below
    "Ұ": "Ū", "ұ": "ū",  # back rounded vowel
    "Ү": "Ü", "ү": "ü",  # front rounded vowel
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "İ", "й": "i",  # same treatment as И (dotted İ/i)
    "Ц": "Ts", "ц": "ts",  # Russian borrowings only
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",  # conventionally simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",  # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",  # hard sign carries no sound: dropped
    "Ь": "", "ь": "",  # soft sign likewise
    "З": "Z", "з": "z",
    # Additional (not in the official table but used in borrowings)
    "Ё": "Io", "ё": "io",
}
def convert_to_latin(text: str) -> str:
    """
    Transliterate a Kazakh Cyrillic string into the Latin alphabet.

    Each character is looked up in ``cyrillic_to_latin``; anything
    without a mapping (digits, punctuation, Latin letters, whitespace)
    passes through unchanged.
    """
    converted = []
    for ch in text:
        converted.append(cyrillic_to_latin.get(ch, ch))
    return ''.join(converted)
def create_augmented_pairs(sentences: List[str]) -> List[dict]:
    """
    Pair each sentence with a lightly perturbed variant.

    The variant removes '.' and ',' punctuation, applies two light
    spelling simplifications ("ğa" -> "ga", "ñ" -> "n"), then
    capitalizes the result. Sentences whose variant ends up identical
    to the original are skipped.

    Fix: the original drew ``len(sentences)`` samples *with replacement*
    via ``random.choice``, which duplicated some sentences and never
    touched roughly 1/e of them; we now make exactly one deterministic
    pass over the input, so every sentence is considered once.

    :param sentences: Latin-script sentences to augment.
    :return: list of ``{"texts": [original, variant]}`` dicts.
    """
    pairs = []
    for s in sentences:
        # Create a minor variation of the sentence.
        s_aug = s.replace(".", "").replace(",", "")          # remove punctuation
        s_aug = s_aug.replace("ğa", "ga").replace("ñ", "n")  # light spelling variants
        s_aug = s_aug.capitalize()
        if s != s_aug:
            pairs.append({"texts": [s, s_aug]})
    return pairs
# ---------------------------------------------------------------------------
# Build "src/data/kazakh_latin_pairs.jsonl": Kazakh sentences from three
# corpora, transliterated to Latin script and paired with light variants.
# ---------------------------------------------------------------------------
output_path = "src/data/kazakh_latin_pairs.jsonl"
# Make sure the output directory exists before hours of corpus processing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Accumulates every transliterated sentence from all three sources.
latin_sentences = []

# Step 1: WikiExtractor output — one JSON object per line with a "text" key.
print("Processing the Wikipedia dump of Kazakh articles...")
for root, _, files in os.walk("src/data/extracted"):
    for fname in tqdm(files, desc="Files in Wikipedia dump"):
        with open(os.path.join(root, fname), 'r', encoding="utf-8") as f:
            for line in f:
                # Only the decode/schema lookups can realistically fail;
                # keep the try body minimal and the exceptions narrow so
                # genuine bugs are not silently swallowed.
                try:
                    cyr_text = json.loads(line)["text"].strip()
                except (json.JSONDecodeError, KeyError, TypeError) as e:
                    tqdm.write(f"Skipping due to: {e}")
                    continue
                lat_text = convert_to_latin(cyr_text).strip()
                if lat_text:
                    latin_sentences.append(lat_text)
print("Done")

# Step 2: first 50k lines of the CC100-Kazakh plain-text corpus.
# Plain string operations cannot raise here, so no try/except is needed.
print("Loading 'CC100-Kazakh' dataset...")
with open("src/data/kk.txt", 'r', encoding="utf-8") as f:
    for line in tqdm(islice(f, 50_000), total=50_000, desc="Lines in CC100-Kazakh"):
        lat_text = convert_to_latin(line.strip()).strip()
        if lat_text:
            latin_sentences.append(lat_text)

# Step 3: 15% of the raw, Kazakh-centred part of the "KazParC" dataset.
# The isinstance guard already filters malformed entries.
print("Loading 'KazParC' dataset...")
kazparc = load_dataset("issai/kazparc", "kazparc_raw", split="train[:15%]")
for entry in tqdm(kazparc, desc="Entries in KazParC"):
    if "kk" in entry and isinstance(entry["kk"], str):
        lat_text = convert_to_latin(entry["kk"].strip()).strip()
        if lat_text:
            latin_sentences.append(lat_text)

# Step 4: create augmented pairs and write them as JSONL (one object per
# line, UTF-8, non-ASCII preserved). str/dict inputs cannot fail to
# serialize, so the per-line try/except was dropped.
print("Creating Latin pairs...")
augmented_pairs = create_augmented_pairs(latin_sentences)
with open(output_path, 'w', encoding="utf-8") as f:
    for pair in tqdm(augmented_pairs, desc="Dataset entries"):
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")
|