File size: 5,011 Bytes
1098783
9b2f18e
 
1098783
 
9b2f18e
 
1098783
 
9b2f18e
 
 
4da4b61
9b2f18e
 
 
 
 
 
 
 
 
1098783
 
 
 
 
 
 
 
 
 
 
9b2f18e
 
1098783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b2f18e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4da4b61
 
 
 
 
 
 
 
 
 
 
 
 
 
9b2f18e
 
 
1098783
 
 
9b2f18e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import json
import os
from concurrent.futures import ThreadPoolExecutor

import fire
from gtts import gTTS
from tqdm import tqdm


def update_vocab(vocab_path="data/vocab.json"):
    """Run the full vocab pipeline on *vocab_path*.

    Steps, in order: re-chunk/compact the JSON file in place, generate TTS
    audio for every entry, then export a plain-text version.
    """
    for step in (compact_json, generate_tts, conv_to_text):
        step(vocab_path)

def compact_json(
    src_path="data/vocab.json",
    dst_path=None,
    group_size=10,
    ensure_ascii=False,
    indent=2,
):
    """Flatten the grouped vocab JSON and re-chunk it into groups of *group_size*.

    Reads *src_path* (a list of lists), merges all entries, slices them back
    into fixed-size groups, and writes the result with CompactEncoder so each
    group stays readable. Writes to *dst_path*, defaulting to *src_path*.
    """
    if dst_path is None:
        dst_path = src_path

    with open(src_path, "rt", encoding="UTF-8") as fp:
        grouped = json.load(fp)

    # Merge every group into one flat list, then cut it into even chunks.
    flat = []
    for group in grouped:
        flat.extend(group)
    rechunked = [flat[start : start + group_size] for start in range(0, len(flat), group_size)]

    with open(dst_path, "wt", encoding="UTF-8") as fp:
        json.dump(rechunked, fp, cls=CompactEncoder, ensure_ascii=ensure_ascii, indent=indent)

    print(f"output: {dst_path}")

class CompactEncoder(json.JSONEncoder):
    CONTAINER_TYPES = (list, tuple, dict)
    MAX_WIDTH = 100
    MAX_ITEMS = 10

    def __init__(self, *args, **kwargs):
        if kwargs.get("indent") is None:
            kwargs["indent"] = 4
        super().__init__(*args, **kwargs)
        self.indentation_level = 0

    def encode(self, o):
        if isinstance(o, (list, tuple)):
            return self._encode_list(o)
        if isinstance(o, dict):
            return self._encode_object(o)
        if isinstance(o, float):
            return format(o, "g")
        return json.dumps(
            o,
            skipkeys=self.skipkeys,
            ensure_ascii=self.ensure_ascii,
            check_circular=self.check_circular,
            allow_nan=self.allow_nan,
            sort_keys=self.sort_keys,
            indent=self.indent,
            separators=(self.item_separator, self.key_separator),
            default=self.default if hasattr(self, "default") else None,
        )

    def _encode_list(self, o):
        if self._single_line(o):
            return "[" + ", ".join(self.encode(el) for el in o) + "]"
        self.indentation_level += 1
        output = [self.indent_str + self.encode(el) for el in o]
        self.indentation_level -= 1
        return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"

    def _encode_object(self, o):
        if not o:
            return "{}"

        o = {str(k) if k is not None else "null": v for k, v in o.items()}

        if self.sort_keys:
            o = dict(sorted(o.items(), key=lambda x: x[0]))

        if self._single_line(o):
            return "{" + ", ".join(f"{self._create_kv(k,v)}" for k, v in o.items()) + "}"

        self.indentation_level += 1
        output = [f"{self.indent_str}{self._create_kv(k,v)}" for k, v in o.items()]
        self.indentation_level -= 1

        return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"

    def _create_kv(self, k, v):
        return f"{json.dumps(k)}: {self.encode(v)}"

    def iterencode(self, o, **_):
        return self.encode(o)

    def _single_line(self, o):
        return (
            self._primitives_only(o)
            and len(o) <= self.MAX_ITEMS
            and len(str(o)) - 2 <= self.MAX_WIDTH
        )

    def _primitives_only(self, o: list | tuple | dict):
        if isinstance(o, (list, tuple)):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
        elif isinstance(o, dict):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())

    @property
    def indent_str(self) -> str:
        if isinstance(self.indent, int):
            return " " * (self.indentation_level * self.indent)
        elif isinstance(self.indent, str):
            return self.indentation_level * self.indent
        else:
            raise ValueError(f"indent must either be of type int or str (is: {type(self.indent)})")


def generate_tts(src_path="data/vocab.json", output_dir="data/tts"):
    """Synthesize a Japanese TTS mp3 for every "kana" entry in the vocab file.

    Files are written to *output_dir* as ``<kana>.mp3``; entries whose mp3
    already exists are skipped. Requests run concurrently in a thread pool,
    with a progress bar over the whole list.
    """
    os.makedirs(output_dir, exist_ok=True)
    groups = load_json(src_path)
    kana_list = [entry["kana"] for group in groups for entry in group]

    def synthesize(kana):
        # Skip audio already produced by a previous run.
        target = os.path.join(output_dir, f"{kana}.mp3")
        if not os.path.exists(target):
            gTTS(text=kana, lang="ja").save(target)

    with tqdm(total=len(kana_list), desc="generating tts") as pbar:
        with ThreadPoolExecutor() as executor:
            for _ in executor.map(synthesize, kana_list):
                pbar.update(1)


def conv_to_text(vocab_path, dst_path="data/vocab.txt"):
    """Export the vocab JSON as plain text.

    Each entry becomes one line of its non-empty "kana kanji meaning"
    fields; groups are separated by a blank line.
    """
    groups = load_json(vocab_path)

    lines = []
    for group in groups:
        for entry in group:
            fields = (entry["kana"], entry["kanji"], entry["meaning"])
            lines.append(" ".join(f for f in fields if f))
        lines.append("")  # blank separator after each group

    with open(dst_path, "wt", encoding="UTF-8") as fp:
        fp.write("\n".join(lines))


def load_json(path):
    """Read *path* as UTF-8 text and return the parsed JSON value."""
    with open(path, mode="rt", encoding="UTF-8") as handle:
        return json.load(handle)


if __name__ == "__main__":
    # Expose the three maintenance tasks as CLI subcommands.
    commands = {
        "update": update_vocab,
        "compact": compact_json,
        "tts": generate_tts,
    }
    fire.Fire(commands)