# minidalalm/src/data/generate_lat_pairs.py
import os
import json
import random
from tqdm import tqdm
from itertools import islice
from datasets import load_dataset
from typing import List


# Mapping from Kazakh Cyrillic characters to the Kazakh Latin alphabet adopted in 2021
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",     # merged with Һ, see below
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i",     # covers both [и] and [й]
    "І": "I", "і": "ı",     # lowercase maps to dotless 'ı', unlike и -> 'i'
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",     # plain 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū",     # back rounded vowel, used frequently
    "Ү": "Ü", "ү": "ü",     # front rounded vowel
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i",     # same treatment as И
    "Ц": "Ts", "ц": "ts",   # Russian borrowings
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",     # typically simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",   # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",       # hard/soft signs are simply dropped
    "Ь": "", "ь": "",
    "З": "Z", "з": "z",
    # Additional (not in the official table, but needed for borrowings)
    "Ё": "Io", "ё": "io",
}


def convert_to_latin(text: str) -> str:
    """
    Apply the Cyrillic -> Latin mapping character by character. Characters
    without a mapping (Latin letters, digits, punctuation) pass through unchanged.
    """
    return ''.join(cyrillic_to_latin.get(char, char) for char in text)
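
# Quick sanity checks of the mapping (values follow directly from the table above):
#   convert_to_latin("Қазақстан")    -> "Qazaqstan"
#   convert_to_latin("Сәлем, әлем!") -> "Sälem, älem!"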


def create_augmented_pairs(sentences: List[str]) -> List[dict]:
    """
    Pair randomly sampled Kazakh Latin sentences with lightly perturbed copies
    of themselves. Sampling is with replacement, so a given sentence may
    appear in several pairs.
    """
    pairs = []

    for _ in range(len(sentences)):
        s = random.choice(sentences)

        # Create a minor variation: strip punctuation, apply light spelling
        # variants, then normalise case (capitalize() also lowercases the rest)
        s_aug = s.replace(".", "").replace(",", "")
        s_aug = s_aug.replace("ğa", "ga").replace("ñ", "n")
        s_aug = s_aug.capitalize()

        if s != s_aug:
            pairs.append({"texts": [s, s_aug]})

    return pairs
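
# Example pair (hypothetical input, traced through the rules above):
#   "Qazaqstan Respublikasy." pairs with "Qazaqstan respublikasy"
#   (trailing '.' stripped, then capitalize() lowercases everything
#   after the first character)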

# Output file path
output_path = "src/data/kazakh_latin_pairs.jsonl"

# List to hold all converted Latin sentences
latin_sentences = []

# First step: process all files in the extracted Wikipedia dump
print("Processing the Wikipedia dump of Kazakh articles...")

# Iterate over all folders under the "extracted" dir
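# Each file is assumed to be in WikiExtractor's JSON-lines format:
# one JSON object per article, with the body text under a "text" key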
for root, _, files in os.walk("src/data/extracted"):
    for fname in tqdm(files, desc = "Files in Wikipedia dump"):
        with open(os.path.join(root, fname), 'r', encoding = "utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                    cyr_text = data["text"].strip()
                    lat_text = convert_to_latin(cyr_text).strip()

                    if lat_text:
                        latin_sentences.append(lat_text)
                except Exception as e:
                    tqdm.write(f"Skipping due to: {e}")
                    continue
print("Done")
# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")
with open("src/data/kk.txt", 'r', encoding = "utf-8") as f:
for line in tqdm(islice(f, 50_000), total = 50_000, desc = "Lines in CC100-Kazakh"):
try:
cyr_text = line.strip()
lat_text = convert_to_latin(cyr_text).strip()
if lat_text:
latin_sentences.append(lat_text)
except Exception as e:
tqdm.write(f"Skipping due to: {e}")
continue

# Third step: process the first 15% of the raw "KazParC" corpus (Kazakh side only)
print("Loading 'KazParC' dataset...")

kazparc = load_dataset("issai/kazparc", "kazparc_raw", split = "train[:15%]")
for entry in tqdm(kazparc, desc = "Entries in KazParC"):
    try:
        # Only the Kazakh ("kk") column is used; other language columns are ignored
        if "kk" in entry and isinstance(entry["kk"], str):
            cyr_text = entry["kk"].strip()
            lat_text = convert_to_latin(cyr_text).strip()

            if lat_text:
                latin_sentences.append(lat_text)
    except Exception as e:
        tqdm.write(f"Skipping due to: {e}")
        continue

# Fourth and last step: create sentence pairs with light variations
print("Creating Latin pairs...")

augmented_pairs = create_augmented_pairs(latin_sentences)

with open(output_path, 'w', encoding = "utf-8") as f:
    for pair in tqdm(augmented_pairs, desc = "Dataset entries"):
        try:
            # One JSON object per line; keep the Latin diacritics unescaped
            f.write(json.dumps(pair, ensure_ascii = False) + "\n")
        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")
            continue
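
# A minimal sketch of reading the output back (assumes the file above was
# written successfully; the {"texts": [...]} layout is a common shape for
# sentence-pair trainers such as sentence-transformers):
#
#     with open(output_path, encoding = "utf-8") as f:
#         pairs = [json.loads(line) for line in f]
#     print(pairs[0]["texts"])  # [original sentence, augmented sentence]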