explainability-tool-for-aa / utils /generate_augmented_mapping.py
Milad Alshomary
updates
3d73c8d
import csv
import re
def load_original_map_and_extract_morph(path="human_readable.txt"):
human_to_code = {}
morph_entries = []
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or ":" not in line or line.startswith("#"):
continue
key, val = [p.strip() for p in line.split(":", 1)]
# If key looks like Aspect=Perf, it's a morphological tag
if "=" in key:
morph_entries.append((val, key)) # human:code
else:
human_to_code[val] = key # human:code for POS/etc.
return human_to_code, morph_entries
def extract_bigrams_from_csv(csv_path="../datasets/gram2vec_feats.csv"):
bigrams = set()
with open(csv_path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
feat = row["gram2vec_feats"]
if feat.startswith("Part-of-Speech Bigram:"):
human_bigram = feat.split(":", 1)[1].strip()
if "followed by" in human_bigram:
bigrams.add(human_bigram)
return bigrams
def generate_bigram_code_map(human_to_code, bigrams):
pattern = re.compile(r"(.+?) followed by (.+)")
code_map = {}
for bigram in bigrams:
match = pattern.match(bigram)
if match:
x = match.group(1).strip()
y = match.group(2).strip()
code_x = human_to_code.get(x)
code_y = human_to_code.get(y)
if code_x and code_y:
code_map[bigram] = f"{code_x} {code_y}"
else:
print(f"Could not map: {bigram}{code_x}, {code_y}")
else:
print(f"Not matched: {bigram}")
return code_map
def write_augmented_map(pos_bigram_map, morph_entries, original_path="human_readable.txt", output_path="augmented_human_readable.txt"):
with open(output_path, "w", encoding="utf-8") as f:
# Flip original lines: write human-readable:code instead of code:human
with open(original_path, "r", encoding="utf-8") as orig:
for line in orig:
line = line.strip()
if not line or line.startswith("#"):
f.write(line + "\n")
continue
if ":" not in line:
continue
key, val = [p.strip() for p in line.split(":", 1)]
flipped_line = f"{val}:{key}\n"
f.write(flipped_line)
# Add new section for POS bigrams
f.write("\n")
for human, code in sorted(pos_bigram_map.items()):
f.write(f"{human}:{code}\n")
# Re-add morph tag mappings
f.write("\n")
for human, code in sorted(morph_entries):
f.write(f"{human}:{code}\n")
print(f"Augmented map written to {output_path}")
# Run all
human_to_code, morph_entries = load_original_map_and_extract_morph()
bigrams = extract_bigrams_from_csv()
pos_bigram_map = generate_bigram_code_map(human_to_code, bigrams)
write_augmented_map(pos_bigram_map, morph_entries)