|
|
|
|
|
require 'json' |
|
require 'nokogiri' |
|
require 'parallel' |
|
require 'ruby-progressbar' |
|
|
|
# Dictionary file names; download_dict and read_dict resolve them relative to
# this script's directory. Frozen so the shared constants cannot be mutated.
JMDICT_XML = 'JMdict_e'.freeze
JMNEDICT_XML = 'JMnedict.xml'.freeze
|
PUNC = 'γγγγγγο½ο½ ο½ο½οΌ»οΌ½γγοΌοΌγγγγγοΌοΌγ»οΌγοΌγοΌβοΈβ₯β¦γβ»οΌγ½βͺβ«β¬β©γγγΆγ γββββ'.chars |
|
|
|
# Downloads and unpacks a dictionary into this script's directory, unless the
# unpacked file is already there.
#
# @param xml [String] name of the unpacked file; the remote archive is
#   requested as "<xml>.gz"
# @return [nil]
# @raise [RuntimeError] if wget or gunzip cannot be started or exits with failure
def download_dict(xml)
  return if File.exist?(File.expand_path(xml, __dir__))

  archive = "#{xml}.gz"
  url = "http://ftp.monash.edu/pub/nihongo/#{archive}"
  commands = [
    # -O pins the output name so a stale archive from an earlier run is
    # overwritten instead of wget saving under a different name.
    ['wget', '-O', archive, url],
    ['gunzip', archive]
  ]
  commands.each do |command|
    # Argument-list form bypasses the shell, so a script directory containing
    # spaces or metacharacters is safe; chdir: replaces the former `cd ... &&`.
    next if system(*command, chdir: __dir__)

    raise "failed to run `#{command.join(' ')}` while fetching #{xml}"
  end
  nil
end
|
|
|
# Gathers the text of an entry's `k_ele keb` matches followed by the text of
# its `r_ele reb` matches.
#
# @param word [#css] node that answers CSS queries with a collection of
#   objects responding to `text`
# @return [Array<String>]
def read_word(word)
  ['k_ele keb', 'r_ele reb'].flat_map do |selector|
    word.css(selector).map(&:text)
  end
end
|
|
|
# Parses a dictionary XML file and applies read_word to each of its entries.
#
# @param filename [String] dictionary file, resolved relative to this
#   script's directory
# @param root [String] root element name used in the `root > entry` selector;
#   also passed to Parallel as the progress title
# @return [Array<String>] flattened results of read_word over all entries
def read_dict(filename, root)
  path = File.expand_path(filename, __dir__)
  # Block form closes the handle as soon as parsing finishes; without it the
  # descriptor stayed open until garbage collection.
  xml = File.open(path) { |file| Nokogiri::XML(file) }
  words = xml.css("#{root} > entry")
  Parallel.flat_map(words, in_threads: 16, progress: root) do |word|
    read_word(word)
  end
end
|
|
|
# Writes the word list, the character list derived from it, and the
# punctuation list under ../easyocr (relative to this script), and prints how
# the derived character set differs from the existing ja_char.txt.
#
# @param words [Array<String>] collected words; entries equal to an element
#   of PUNC are dropped before anything is written
# @return [Integer] result of the final File.write call
def write_files(words)
  src_dir = File.expand_path('../easyocr', __dir__)
  char_dir = File.join(src_dir, 'character')
  ja_dict = File.join(src_dir, 'dict', 'ja.txt')
  ja_char = File.join(char_dir, 'ja_char2.txt')
  ja_char_old = File.join(char_dir, 'ja_char.txt')
  ja_punc = File.join(char_dir, 'ja_punc.txt')

  words -= PUNC
  chars = words.join.chars.uniq
  # Read as UTF-8 explicitly: under a non-UTF-8 locale the lines would carry a
  # different encoding tag and never compare equal to the UTF-8 characters
  # above. File.read (unlike IO.read) also never treats a leading "|" in the
  # path as a command to run.
  chars_old = File.read(ja_char_old, encoding: 'UTF-8').split("\n")
  missing = chars_old - chars

  puts "new characters: #{(chars - chars_old).size}"
  puts "missing characters: #{missing.size}"
  puts missing

  File.write(ja_dict, words.join("\n"))
  File.write(ja_char, chars.join("\n"))
  File.write(ja_punc, PUNC.join("\n"))
end
|
|
|
# Script entry: fetch both dictionaries first, then read them in the same
# order and write the combined word list.
sources = { JMDICT_XML => 'JMdict', JMNEDICT_XML => 'JMnedict' }
sources.each_key { |xml| download_dict(xml) }
words = sources.flat_map { |xml, root| read_dict(xml, root) }
write_files(words)
|
|