File size: 3,147 Bytes
3d73c8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import csv
import re

def load_original_map_and_extract_morph(path="human_readable.txt"):
    """Parse a ``code: description`` mapping file into two lookups.

    Every line is stripped; lines that start with ``#`` or contain no
    ``:`` (which includes blank lines) are ignored.  The remaining lines
    are cut at the first colon into a code (left) and a human-readable
    description (right), both stripped of surrounding whitespace.

    Args:
        path: Location of the UTF-8 mapping file.

    Returns:
        A ``(human_to_code, morph_entries)`` tuple.  ``human_to_code`` is a
        dict from description to code for codes without ``=``;
        ``morph_entries`` is a list of ``(description, code)`` tuples, in
        file order, for codes containing ``=`` (e.g. ``Aspect=Perf``).
    """
    description_to_code = {}
    morph_pairs = []

    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            # A blank line has no colon either, so this one guard covers
            # blanks, comments and malformed lines.
            if entry.startswith("#") or ":" not in entry:
                continue

            left, _, right = entry.partition(":")
            code = left.strip()
            description = right.strip()

            if "=" in code:
                # Feature=Value codes are morphological tags; keep them apart.
                morph_pairs.append((description, code))
            else:
                description_to_code[description] = code

    return description_to_code, morph_pairs

def extract_bigrams_from_csv(csv_path="../datasets/gram2vec_feats.csv"):
    """Collect the human-readable POS bigram descriptions found in a CSV.

    Only the ``gram2vec_feats`` column is inspected.  A cell is used when it
    starts with ``Part-of-Speech Bigram:`` and the text after that prefix
    contains ``followed by``; the stripped text after the prefix is what
    gets collected.

    Args:
        csv_path: Path to a UTF-8 CSV file with a header row.

    Returns:
        A set of bigram descriptions such as ``"noun followed by verb"``.

    Raises:
        KeyError: If a data row exists but the file has no
            ``gram2vec_feats`` column.
    """
    prefix = "Part-of-Speech Bigram:"
    bigrams = set()
    # newline="" hands newline handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            feat = row["gram2vec_feats"]
            # DictReader fills cells missing from a short row with None;
            # skip those instead of failing on None.startswith.
            if feat is None or not feat.startswith(prefix):
                continue
            human_bigram = feat[len(prefix):].strip()
            if "followed by" in human_bigram:
                bigrams.add(human_bigram)
    return bigrams

def generate_bigram_code_map(human_to_code, bigrams):
    """Translate ``"X followed by Y"`` descriptions into ``"CODE_X CODE_Y"``.

    Each bigram is split at its first `` followed by ``; both halves are
    stripped and looked up in ``human_to_code``.  Bigrams that do not match
    the pattern, or whose halves have no (non-empty) code, are reported on
    stdout and left out of the result.

    Args:
        human_to_code: Mapping from human-readable tag description to code.
        bigrams: Iterable of bigram description strings.

    Returns:
        A dict mapping each translatable bigram description to the two
        codes joined by a single space.
    """
    pattern = re.compile(r"(.+?) followed by (.+)")
    code_map = {}

    for bigram in bigrams:
        match = pattern.match(bigram)
        if match is None:
            print(f"Not matched: {bigram}")
            continue

        x = match.group(1).strip()
        y = match.group(2).strip()
        code_x = human_to_code.get(x)
        code_y = human_to_code.get(y)
        if code_x and code_y:
            code_map[bigram] = f"{code_x} {code_y}"
        else:
            # Separate the bigram from the looked-up codes so the message
            # shows which half failed (a missing code prints as None).
            print(f"Could not map: {bigram} -> {code_x}, {code_y}")
    return code_map

def write_augmented_map(
    pos_bigram_map,
    morph_entries,
    original_path="human_readable.txt",
    output_path="augmented_human_readable.txt",
):
    """Write the flipped original map plus bigram and morph sections.

    The output file consists of three parts separated by one blank line:

    1. The original file with every ``code:description`` line rewritten as
       ``description:code``.  Blank lines and ``#`` comment lines are copied
       (stripped); other lines without a ``:`` are dropped.
    2. The entries of ``pos_bigram_map`` as ``description:code``, sorted.
    3. The entries of ``morph_entries`` as ``description:code``, sorted.

    Note that part 1 flips *every* ``code:description`` line of the
    original, so any ``morph_entries`` that were taken from that same file
    also appear in part 1.

    Args:
        pos_bigram_map: Mapping from bigram description to code string.
        morph_entries: Iterable of ``(description, code)`` pairs.
        original_path: UTF-8 file holding the ``code:description`` lines.
        output_path: Destination file; overwritten if it exists.  May be the
            same file as ``original_path``.
    """
    # Read and flip the original before the output is opened: opening with
    # "w" truncates immediately, which would empty the source when both
    # paths name the same file, and would leave a stray empty output behind
    # if the original could not be read.
    flipped_lines = []
    with open(original_path, "r", encoding="utf-8") as orig:
        for line in orig:
            line = line.strip()
            if not line or line.startswith("#"):
                flipped_lines.append(line + "\n")
            elif ":" in line:
                key, _, val = line.partition(":")
                flipped_lines.append(f"{val.strip()}:{key.strip()}\n")

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(flipped_lines)

        # New section for POS bigrams
        f.write("\n")
        f.writelines(
            f"{human}:{code}\n" for human, code in sorted(pos_bigram_map.items())
        )

        # Morph tag mappings
        f.write("\n")
        f.writelines(f"{human}:{code}\n" for human, code in sorted(morph_entries))

    print(f"Augmented map written to {output_path}")

# Run the whole pipeline, using each step's default file paths.
# Step 1: read the code:description file into a description->code lookup
# and a separate list of morphological (Feature=Value) entries.
human_to_code, morph_entries = load_original_map_and_extract_morph()
# Step 2: gather the "X followed by Y" bigram descriptions from the CSV.
bigrams = extract_bigrams_from_csv()
# Step 3: translate each bigram description into its pair of codes.
pos_bigram_map = generate_bigram_code_map(
    human_to_code=human_to_code,
    bigrams=bigrams,
)
# Step 4: write the flipped original map plus the bigram and morph sections.
write_augmented_map(
    pos_bigram_map=pos_bigram_map,
    morph_entries=morph_entries,
)