File size: 1,708 Bytes
b4959be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# frozen_string_literal: true

require 'json'
require 'nokogiri'
require 'parallel'
require 'ruby-progressbar'

# Dictionary file names, resolved relative to this script's directory.
# download_dict appends ".gz" to a name to form the archive it fetches.
JMDICT_XML = 'JMdict_e'
JMNEDICT_XML = 'JMnedict.xml'
PUNC = 'γ€γ€‘γ€Šγ€‹γ€ˆγ€‰ο½Ÿο½ ο½›ο½οΌ»οΌ½γ€”γ€•οΌˆοΌ‰γ€Žγ€γ€Œγ€γ€οΌ›οΌšγƒ»οΌŸγ€œοΌγ€‚οΌβ‰οΈŽβ€₯β€¦γ€œβ€»οΌŠγ€½β™ͺβ™«β™¬β™©γ€‡γ€’γ€Άγ€ γ€„β“β“β“Žβ†’'.chars

# Fetches and unpacks a dictionary file next to this script, unless a file
# with that name is already present there.
#
# @param xml [String] dictionary file name; "<xml>.gz" is downloaded and
#   then decompressed in this script's directory
# @return [nil]
# @raise [RuntimeError] if wget or gunzip is unavailable or exits non-zero
def download_dict(xml)
  return if File.exist?(File.expand_path(xml, __dir__))

  archive = "#{xml}.gz"
  url = "http://ftp.monash.edu/pub/nihongo/#{archive}"

  # The argument-list form of system bypasses the shell, so the directory,
  # URL and archive name are passed verbatim (no quoting problems when the
  # script lives in a path containing spaces or shell metacharacters).
  [['wget', url], ['gunzip', archive]].each do |command|
    next if system(*command, chdir: __dir__)

    # Fail here with a clear message instead of letting a later File.open
    # blow up on a dictionary that was never downloaded.
    raise "command failed: #{command.join(' ')}"
  end
  nil
end

# Collects every written form of a dictionary entry.
#
# @param word [#css] entry node; queried with the CSS selectors
#   'k_ele keb' and 'r_ele reb'
# @return [Array<String>] text of all keb matches followed by the text of
#   all reb matches
def read_word(word)
  ['k_ele keb', 'r_ele reb'].flat_map do |selector|
    word.css(selector).map { |node| node.text }
  end
end

# Parses a dictionary XML file and returns the written forms of all entries.
#
# @param filename [String] dictionary file, resolved relative to this
#   script's directory
# @param root [String] name of the XML root element; entries are selected
#   with "<root> > entry" and the name is also used as the progress label
# @return [Array<String>] the concatenated results of read_word for each entry
def read_dict(filename, root)
  path = File.expand_path(filename, __dir__)
  # Block form closes the file handle as soon as parsing has finished
  # instead of leaving it open until garbage collection.
  xml = File.open(path) { |file| Nokogiri::XML(file) }
  words = xml.css("#{root} > entry")
  Parallel.flat_map(words, in_threads: 16, progress: root) do |word|
    read_word(word)
  end
end

# Writes the word, character and punctuation lists under ../easyocr
# (relative to this script) and prints how the new character set differs
# from the existing one.
#
# Files written:
#   dict/ja.txt            - the words, one per line
#   character/ja_char2.txt - each distinct character of the words, one per line
#   character/ja_punc.txt  - the PUNC symbols, one per line
# character/ja_char.txt is only read, as the baseline for the report.
#
# @param words [Array<String>] dictionary words; the caller's array is not
#   modified
# @return [Integer] result of the final File.write (bytes written to
#   ja_punc.txt)
def write_files(words)
  src_dir = File.expand_path('../easyocr', __dir__)
  ja_dict = File.join(src_dir, 'dict', 'ja.txt')
  ja_char = File.join(src_dir, 'character', 'ja_char2.txt')
  ja_char_old = File.join(src_dir, 'character', 'ja_char.txt')
  ja_punc = File.join(src_dir, 'character', 'ja_punc.txt')

  # Drop words that are exactly one of the punctuation symbols.
  words -= PUNC
  chars = words.join.chars.uniq
  # File.read/File.write rather than IO.read/IO.write: the IO variants treat
  # a path beginning with "|" as a command to execute.
  chars_old = File.read(ja_char_old).split("\n")

  added = chars - chars_old
  missing = chars_old - chars
  puts "new characters: #{added.size}"
  puts "missing characters: #{missing.size}"
  puts missing

  File.write(ja_dict, words.join("\n"))
  File.write(ja_char, chars.join("\n"))
  File.write(ja_punc, PUNC.join("\n"))
end

# Script body: make sure both dictionaries are present, read them, then
# write the combined word list and the derived character files.
[JMDICT_XML, JMNEDICT_XML].each { |dict| download_dict(dict) }
jmdict_words = read_dict(JMDICT_XML, 'JMdict')
jmnedict_words = read_dict(JMNEDICT_XML, 'JMnedict')
words = jmdict_words + jmnedict_words
write_files(words)