Files
akaza/comb/comb.py
Tokuhiro Matsuno d538396f18 import
2020-09-08 10:15:03 +09:00

208 lines
6.8 KiB
Python

from typing import List, Any
import comb
import os
import time
import logging
import marisa_trie
import jaconv
from gi.repository import GLib
from comb import combromkan
from comb.skkdict import parse_skkdict, write_skkdict, merge_skkdict
import pathlib
# Japanese vowels ("boin") in romaji; used to expand okuri-ari SKK entries.
BOIN = {'a', 'i', 'u', 'e', 'o'}
class SystemDict:
    """System-wide SKK dictionary backed by a marisa-trie, with an on-disk cache.

    Loads several SKK jisyo files, merges them into a single trie keyed by
    hiragana, and caches the trie so later startups only need one file load.
    """

    def __init__(self, logger=logging.getLogger(__name__)):
        self.logger = logger
        try:
            self._load()
        except Exception:
            # Was a bare `except:`; narrow it so SystemExit/KeyboardInterrupt
            # still propagate. Keep the IME usable even when loading fails.
            self.logger.error("cannot LOAD JISYO", exc_info=True)

    def cache_file(self):
        """Return the path of the cached trie, creating the cache dir if needed."""
        cachedir = os.path.join(GLib.get_user_cache_dir(), 'ibus-comb')
        pathlib.Path(cachedir).mkdir(parents=True, exist_ok=True)
        return os.path.join(cachedir, 'system-dict.marisa')

    def _load(self):
        """Build (or reload from cache) the merged dictionary trie.

        Raises whatever the underlying file/trie operations raise; the caller
        (__init__) logs and continues.
        """
        # TODO: load configuration file instead of hard-coding dictionary paths.
        dicts = [
            ('/home/tokuhirom/dotfiles/skk/SKK-JISYO.tokuhirom', 'utf-8'),
            ('/usr/share/skk/SKK-JISYO.L', 'euc-jp'),
            ('/usr/share/skk/SKK-JISYO.jinmei', 'euc-jp'),
            ('/home/tokuhirom/dotfiles/skk/SKK-JISYO.jawiki', 'utf-8'),
        ]

        def get_mtime(fname):
            try:
                return os.path.getmtime(fname)
            except FileNotFoundError:
                # Missing files sort before any real mtime, so they never
                # invalidate the cache on their own.
                return -1

        cache_file = self.cache_file()
        # Reuse the cached trie iff it is at least as new as every source dict.
        if get_mtime(cache_file) >= max(get_mtime(x[0]) for x in dicts):
            self.logger.info("loading cache dictionary")
            trie = marisa_trie.BytesTrie()
            trie.load(cache_file)
            self.trie = trie
            return

        self.logger.info("loading dictionaries")
        t0 = time.time()
        dictionaries = [parse_skkdict(fname, encoding) for fname, encoding in dicts]

        def expand_okuri(kana, kanjis):
            # Expand an okuri-ari entry (reading ending in a romaji consonant
            # or vowel, e.g. "かえs") into plain-kana entries.
            if kana[-1].isalpha():
                if kana[-1] in BOIN:
                    okuri = combromkan.to_hiragana(kana[-1])
                    yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis]
                else:
                    # Consonant tail: try every vowel to form the okurigana.
                    for b in BOIN:
                        okuri = combromkan.to_hiragana(kana[-1] + b)
                        yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis]
            else:
                yield kana, kanjis

        def ari2nasi(src):
            # Convert an okuri-ari dict into okuri-nasi form.
            retval = {}
            for kana, kanjis in src.items():
                for expanded_kana, expanded_kanjis in expand_okuri(kana, kanjis):
                    retval[expanded_kana] = expanded_kanjis
            return retval

        ari_dictionary = merge_skkdict([d[0] for d in dictionaries])
        nasi_dictionary = merge_skkdict(
            [d[1] for d in dictionaries] +
            [ari2nasi(ari_dictionary)]
        )
        # BytesTrie wants (unicode key, bytes value) pairs; candidates are
        # joined with '/' like the SKK on-disk format.
        entries = [(k, '/'.join(v).encode('utf-8')) for k, v in nasi_dictionary.items()]
        self.trie = marisa_trie.BytesTrie(entries)
        self.logger.info(f"LOADed JISYO: in {time.time() - t0:f} sec")
        self.trie.save(cache_file)
        self.logger.info(f"Saved cache file: {cache_file} in {time.time() - t0:f} sec")

    # `src` is passed through so English words like /better/ can be looked up
    # by their raw romaji as well as by the hiragana reading.
    def get_candidates(self, src, hiragana):
        """Yield conversion candidates for the romaji `src` / its `hiragana` reading.

        Exact-romaji hits come first, then prefix matches on the hiragana with
        the unconverted tail appended (longest prefix first).
        """
        if src in self.trie:
            yield from self.trie[src][0].decode('utf-8').split('/')
        for prefix in reversed(self.trie.prefixes(hiragana)):
            kanjis = self.trie[prefix][0].decode('utf-8').split('/')
            for kanji in kanjis:
                yield kanji + hiragana[len(prefix):]
class UserDict:
    """Per-user SKK dictionary that learns chosen conversions and persists them."""

    def __init__(self, path, logger=logging.getLogger(__name__)):
        self.path = path
        self.logger = logger
        if os.path.isfile(path):
            self.dict_ari, self.dict_nasi = parse_skkdict(path, encoding='utf-8')
        else:
            # First run: start with empty okuri-ari / okuri-nasi tables.
            self.dict_ari, self.dict_nasi = {}, {}

    def get_candidates(self, src, hiragana):
        """Return [display, display] pairs matching the raw romaji or the hiragana.

        Both keys are probed so English-style entries (keyed by romaji) and
        normal kana entries are found. May yield duplicates if src == hiragana;
        the caller deduplicates.
        """
        candidates = []
        for keyword in (src, hiragana):
            if keyword in self.dict_nasi:
                got = self.dict_nasi[keyword]
                # Lazy %-args: no formatting cost unless DEBUG is enabled.
                self.logger.debug("GOT: %s", got)
                for entry in got:
                    candidates.append([entry, entry])
        return candidates

    def add_entry(self, roma, kanji):
        """Record that `roma` was converted to `kanji`, promoting it to the front.

        The original duplicated the insert across three branches; setdefault +
        remove-then-insert is behaviorally identical.
        """
        self.logger.info(f"add user_dict entry: roma='{roma}' kanji='{kanji}'")
        kana = combromkan.to_hiragana(roma)
        entries = self.dict_nasi.setdefault(kana, [])
        if kanji in entries:
            entries.remove(kanji)
        # Most recently chosen candidate goes first.
        entries.insert(0, kanji)
        # TODO: writing asynchronously would be better.
        self.save()
        self.logger.info("SAVED!")

    def save(self):
        """Write both dictionaries back to disk in SKK format."""
        write_skkdict(self.path, self.dict_ari, self.dict_nasi)
class Comb:
    """Converts a romaji string into an ordered list of conversion candidates."""

    dictionaries: List[Any]

    def __init__(self, logger, user_dict: UserDict, system_dict: SystemDict):
        self.logger = logger
        self.dictionaries = []
        self.user_dict = user_dict
        self.system_dict = system_dict

    def convert(self, src):
        """Return deduplicated [display, display] candidate pairs for `src`.

        Order: hiragana reading, user-dict hits, katakana, system-dict hits,
        then the raw input (first when it starts with an uppercase letter,
        last otherwise).
        """
        hiragana = combromkan.to_hiragana(src)
        katakana = jaconv.hira2kata(hiragana)
        candidates = [[hiragana, hiragana]]
        for e in self.user_dict.get_candidates(src, hiragana):
            if e not in candidates:
                candidates.append(e)
        if [katakana, katakana] not in candidates:
            candidates.append([katakana, katakana])
        for e in ([x, x] for x in self.system_dict.get_candidates(src, hiragana)):
            if e not in candidates:
                candidates.append(e)
        # Guard against empty input: the original `src[0]` raised IndexError.
        if src and src[0].isupper():
            # Leading capital suggests a proper noun / English word: rank first.
            candidates.insert(0, [src, src])
        else:
            # Otherwise offer the raw input as a last resort.
            candidates.append([src, src])
        return candidates
if __name__ == '__main__':
    # Manual smoke test: build the dictionaries and convert a sample word.
    from gi.repository import GLib
    import pathlib
    import logging

    logging.basicConfig(level=logging.DEBUG)

    configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-comb')
    pathlib.Path(configdir).mkdir(parents=True, exist_ok=True)

    system_dict = SystemDict()
    user_dict = UserDict(os.path.join(configdir, 'user-dict.txt'))
    comb = Comb(logging.getLogger(__name__), user_dict, system_dict)

    # print(comb.convert('henkandekiru'))
    print(comb.convert('watasi'))
    # print(comb.convert('hituyoudayo'))
    # print(list(system_dict.get_candidates('henkandekiru', 'へんかんできる')))
    # print(list(system_dict.get_candidates('warudakumi', 'わるだくみ')))
    # print(list(system_dict.get_candidates('subarasii', 'すばらしい')))
    # print(list(system_dict.get_candidates('watasi', 'わたし')))
    # print(list(system_dict.get_candidates('hiragana', 'ひらがな')))
    # print(list(system_dict.get_candidates('buffer', 'ぶっふぇr')))