mirror of
https://github.com/mii443/akaza.git
synced 2025-12-03 11:08:29 +00:00
208 lines
6.8 KiB
Python
208 lines
6.8 KiB
Python
from typing import List, Any
|
|
|
|
import comb
|
|
import os
|
|
import time
|
|
import logging
|
|
|
|
import marisa_trie
|
|
import jaconv
|
|
|
|
from gi.repository import GLib
|
|
|
|
from comb import combromkan
|
|
from comb.skkdict import parse_skkdict, write_skkdict, merge_skkdict
|
|
import pathlib
|
|
|
|
# Romaji vowels ("boin" is Japanese for vowel).  SystemDict checks the trailing
# letter of a dictionary key against this set when expanding okuri keys.
BOIN = set('aiueo')
|
|
|
|
|
|
class SystemDict:
    """Read-only system dictionary backed by a ``marisa_trie.BytesTrie``.

    Each trie key is a reading (kana, or a plain ASCII word) and its value is
    one UTF-8 encoded byte string holding the candidates joined with ``/``.
    The trie is built from the SKK dictionaries in ``DEFAULT_DICTS`` and is
    cached on disk so that later start-ups can skip parsing.
    """

    # (path, encoding) of every SKK dictionary merged into the trie.
    # TODO: load configuration file.
    DEFAULT_DICTS = (
        ('/home/tokuhirom/dotfiles/skk/SKK-JISYO.tokuhirom', 'utf-8'),
        ('/usr/share/skk/SKK-JISYO.L', 'euc-jp'),
        ('/usr/share/skk/SKK-JISYO.jinmei', 'euc-jp'),
        ('/home/tokuhirom/dotfiles/skk/SKK-JISYO.jawiki', 'utf-8'),
    )

    def __init__(self, logger=logging.getLogger(__name__)):
        """Load the dictionary, logging (not raising) when loading fails.

        :param logger: logger used for progress and error messages.
        """
        self.logger = logger
        # Start from an empty trie so that get_candidates() keeps working
        # (and simply finds nothing) when the dictionaries cannot be loaded.
        self.trie = marisa_trie.BytesTrie()
        try:
            self._load()
        except Exception:
            # Best effort: a broken dictionary must not take the caller down,
            # but KeyboardInterrupt / SystemExit are allowed to propagate.
            self.logger.error("cannot LOAD JISYO", exc_info=True)

    def cache_file(self):
        """Return the path of the on-disk trie cache.

        The containing directory (``ibus-comb`` under the GLib user cache
        directory) is created if it does not exist yet.
        """
        cachedir = os.path.join(GLib.get_user_cache_dir(), 'ibus-comb')
        pathlib.Path(cachedir).mkdir(parents=True, exist_ok=True)
        return os.path.join(cachedir, 'system-dict.marisa')

    @staticmethod
    def _mtime(fname):
        """Return the modification time of *fname*, or ``None`` if missing."""
        try:
            return os.path.getmtime(fname)
        except FileNotFoundError:
            return None

    @staticmethod
    def _expand_okuri(kana, kanjis):
        """Yield ``(reading, candidates)`` pairs for one dictionary entry.

        A key ending in an ASCII letter is expanded: a trailing vowel is
        converted to hiragana directly, any other letter is combined with each
        vowel in ``BOIN`` first.  The resulting hiragana is appended both to
        the reading and to every candidate.  Any other key is yielded as is.
        An empty key yields nothing.
        """
        if not kana:
            return
        last = kana[-1]
        # str.isalpha() is also True for kana, so restrict the test to ASCII;
        # only a romaji letter can be turned into okurigana below.
        if not (ord(last) < 128 and last.isalpha()):
            yield kana, kanjis
            return

        stem = kana[:-1]
        if last in BOIN:
            romaji_suffixes = [last]
        else:
            # sorted() only makes the expansion order deterministic.
            romaji_suffixes = [last + vowel for vowel in sorted(BOIN)]
        for romaji in romaji_suffixes:
            okuri = combromkan.to_hiragana(romaji)
            yield stem + okuri, [kanji + okuri for kanji in kanjis]

    @classmethod
    def _ari2nasi(cls, src):
        """Expand every entry of *src* with ``_expand_okuri`` into a new dict.

        When two entries expand to the same reading the later one wins.
        """
        expanded = {}
        for kana, kanjis in src.items():
            for reading, candidates in cls._expand_okuri(kana, kanjis):
                expanded[reading] = candidates
        return expanded

    def _load(self):
        """Populate ``self.trie`` from the cache or from the SKK dictionaries.

        The cache is used when it exists and is at least as new as every
        source dictionary that exists.  Otherwise all dictionaries are parsed,
        merged, stored in a new trie, and the trie is written to the cache.
        """
        dicts = list(self.DEFAULT_DICTS)

        cache_file = self.cache_file()
        cache_mtime = self._mtime(cache_file)
        source_mtimes = [
            mtime
            for mtime in (self._mtime(fname) for fname, _ in dicts)
            if mtime is not None
        ]
        # A cache that does not exist can never be fresh.
        if cache_mtime is not None and all(cache_mtime >= m for m in source_mtimes):
            self.logger.info("loading cache dictionary")
            trie = marisa_trie.BytesTrie()
            trie.load(cache_file)
            self.trie = trie
            return

        self.logger.info("loading dictionaries")
        t0 = time.time()
        # Each parsed dictionary is indexed below as [0] and [1]; presumably
        # these are the okuri-ari and okuri-nasi tables -- confirm against
        # comb.skkdict.parse_skkdict.
        dictionaries = [parse_skkdict(fname, encoding) for fname, encoding in dicts]

        ari_dictionary = merge_skkdict([d[0] for d in dictionaries])
        nasi_dictionary = merge_skkdict(
            [d[1] for d in dictionaries] +
            [self._ari2nasi(ari_dictionary)]
        )

        entries = [
            (kana, '/'.join(kanjis).encode('utf-8'))
            for kana, kanjis in nasi_dictionary.items()
        ]
        self.trie = marisa_trie.BytesTrie(entries)
        self.logger.info(f"LOADed JISYO: in {time.time() - t0:f} sec")
        self.trie.save(cache_file)
        self.logger.info(f"Saved cache file: {cache_file} in {time.time() - t0:f} sec")

    @staticmethod
    def _decode(values):
        """Split the first stored value of a trie key into its candidates."""
        return values[0].decode('utf-8').split('/')

    def get_candidates(self, src, hiragana):
        """Yield conversion candidates for the given input.

        :param src: the raw input; passed so that entries keyed by an English
            word such as ``better`` can be looked up.
        :param hiragana: the hiragana reading of the input.

        Candidates stored under *src* come first.  Then, for every trie key
        that is a prefix of *hiragana* (in the reverse of the order returned
        by ``trie.prefixes``, presumably longest prefix first), each candidate
        is yielded followed by the part of *hiragana* the key did not cover.
        """
        if src in self.trie:
            yield from self._decode(self.trie[src])

        for prefix in reversed(self.trie.prefixes(hiragana)):
            rest = hiragana[len(prefix):]
            for kanji in self._decode(self.trie[prefix]):
                yield kanji + rest
|
|
|
|
|
|
class UserDict:
    """User-editable SKK dictionary that is stored in a single file.

    The file is parsed into ``dict_ari`` and ``dict_nasi``; lookups and
    additions made through this class only touch ``dict_nasi``.
    """

    def __init__(self, path, logger=logging.getLogger(__name__)):
        """Read the dictionary at *path*, or start empty if it is not a file.

        :param path: location of the user dictionary file.
        :param logger: logger used for debug and info messages.
        """
        self.path = path
        self.logger = logger
        if not os.path.isfile(path):
            self.dict_ari, self.dict_nasi = {}, {}
        else:
            self.dict_ari, self.dict_nasi = parse_skkdict(path, encoding='utf-8')

    def get_candidates(self, src, hiragana):
        """Return the entries registered under *src* and under *hiragana*.

        Each entry is returned as a two-element list holding the same word
        twice.  Entries found under *src* come before those found under
        *hiragana*.
        """
        found = []
        for key in (src, hiragana):
            if key not in self.dict_nasi:
                continue
            hits = self.dict_nasi[key]
            self.logger.debug("GOT: %s" % str(hits))
            found.extend([word, word] for word in hits)
        return found

    def add_entry(self, roma, kanji):
        """Register *kanji* under the hiragana reading of *roma* and save.

        The word always ends up first in the list for that reading, whether
        it was already registered or not.
        """
        self.logger.info(f"add user_dict entry: roma='{roma}' kanji='{kanji}'")
        kana = combromkan.to_hiragana(roma)

        words = self.dict_nasi.setdefault(kana, [])
        if kanji in words:
            # Already known: take it out so it can be moved to the front.
            words.remove(kanji)
        words.insert(0, kanji)

        # Writing asynchronously would be better.
        self.save()
        self.logger.info("SAVED!")

    def save(self):
        """Write both tables back to ``self.path``."""
        write_skkdict(self.path, self.dict_ari, self.dict_nasi)
|
|
|
|
|
|
class Comb:
    """Converter that merges candidates from the user and system dictionaries."""

    # Initialised to an empty list in __init__; not used by convert().
    dictionaries: List[Any]

    def __init__(self, logger, user_dict: UserDict, system_dict: SystemDict):
        """Store the collaborators.

        :param logger: logger kept on the instance.
        :param user_dict: dictionary consulted first by convert().
        :param system_dict: dictionary consulted after the user dictionary.
        """
        self.logger = logger
        self.dictionaries = []
        self.user_dict = user_dict
        self.system_dict = system_dict

    def convert(self, src):
        """Return the ordered, duplicate-free candidate list for *src*.

        :param src: the raw (romaji) input; may be empty.
        :returns: a list of two-element lists.  The order is: the hiragana
            reading, the user dictionary entries, the katakana reading, the
            system dictionary entries.  The raw input itself is placed first
            when it starts with an uppercase letter, and last otherwise.
        """
        hiragana = combromkan.to_hiragana(src)
        katakana = jaconv.hira2kata(hiragana)

        candidates = [[hiragana, hiragana]]

        def add(entry):
            # Keep the first occurrence only.
            if entry not in candidates:
                candidates.append(entry)

        for entry in self.user_dict.get_candidates(src, hiragana):
            add(entry)

        add([katakana, katakana])

        for word in self.system_dict.get_candidates(src, hiragana):
            add([word, word])

        literal = [src, src]
        # src[:1] instead of src[0]: an empty input must not raise IndexError.
        if src[:1].isupper():
            # Starts with an uppercase letter: the raw input goes first.
            # Remove any earlier copy so that it is not listed twice.
            if literal in candidates:
                candidates.remove(literal)
            candidates.insert(0, literal)
        else:
            # Otherwise it goes last, unless it is already listed.
            add(literal)

        return candidates
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: load both dictionaries and print the candidates for
    # one sample input.  GLib, pathlib and logging are already imported at the
    # top of the file, so they are not imported again here.
    logging.basicConfig(level=logging.DEBUG)

    configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-comb')
    pathlib.Path(configdir).mkdir(parents=True, exist_ok=True)
    system_dict = SystemDict()
    user_dict = UserDict(os.path.join(configdir, 'user-dict.txt'))
    # Deliberately not named ``comb``: that would rebind the imported
    # ``comb`` package at module level.
    engine = Comb(logging.getLogger(__name__), user_dict, system_dict)
    # print(engine.convert('henkandekiru'))
    print(engine.convert('watasi'))
    # print(engine.convert('hituyoudayo'))
    # print(list(system_dict.get_candidates('henkandekiru', 'へんかんできる')))
    # print(list(system_dict.get_candidates('warudakumi', 'わるだくみ')))
    # print(list(system_dict.get_candidates('subarasii', 'すばらしい')))
    # print(list(system_dict.get_candidates('watasi', 'わたし')))
    # print(list(system_dict.get_candidates('hiragana', 'ひらがな')))
    # print(list(system_dict.get_candidates('buffer', 'ぶっふぇr')))
|