mirror of
https://github.com/mii443/akaza.git
synced 2025-08-29 18:19:33 +00:00
連文節変換
This commit is contained in:
16
Makefile
16
Makefile
@ -22,13 +22,21 @@ comb.xml: comb.xml.in
|
|||||||
-e "s:@DATADIR@:$(DATADIR):g" $< > $@
|
-e "s:@DATADIR@:$(DATADIR):g" $< > $@
|
||||||
|
|
||||||
comb/config.py: comb/config.py.in
|
comb/config.py: comb/config.py.in
|
||||||
sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" $< > $@
|
sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \
|
||||||
|
-e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/model:g" \
|
||||||
|
$< > $@
|
||||||
|
|
||||||
install: all check comb/config.py
|
model/jawiki.1gram:
|
||||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component
|
make -C model jawiki.1gram
|
||||||
|
|
||||||
|
install: all comb/config.py model/jawiki.1gram check
|
||||||
|
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model
|
||||||
|
install -m 0644 model/jawiki.1gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||||
|
install -m 0644 model/jawiki.2gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||||
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
|
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
|
||||||
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||||
install -m 0644 comb/graph.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
install -m 0644 comb/graph.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||||
|
install -m 0644 comb/config.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||||
install -m 0644 comb/skkdict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
install -m 0644 comb/skkdict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||||
install -m 0644 comb/combromkan.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
install -m 0644 comb/combromkan.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||||
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-comb
|
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-comb
|
||||||
@ -49,6 +57,8 @@ uninstall:
|
|||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
|
||||||
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.1gram
|
||||||
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.2gram
|
||||||
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
|
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
|
||||||
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
|
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
|
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
|
||||||
|
@ -1 +1,2 @@
|
|||||||
SYS_CONF_DIR = '@SYSCONFDIR@'
|
SYS_CONF_DIR = '@SYSCONFDIR@'
|
||||||
|
MODEL_DIR = '@MODELDIR@'
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
from logging import Logger
|
||||||
from typing import List, Any
|
from typing import List, Any
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@ -8,27 +9,55 @@ from comb import combromkan
|
|||||||
|
|
||||||
from comb.system_dict import SystemDict
|
from comb.system_dict import SystemDict
|
||||||
from comb.user_dict import UserDict
|
from comb.user_dict import UserDict
|
||||||
|
from comb.graph import graph_construct, viterbi, lookup
|
||||||
|
from comb.config import MODEL_DIR
|
||||||
|
import logging
|
||||||
|
import marisa_trie
|
||||||
|
|
||||||
|
|
||||||
class Comb:
|
class Comb:
|
||||||
|
logger: Logger
|
||||||
dictionaries: List[Any]
|
dictionaries: List[Any]
|
||||||
|
|
||||||
def __init__(self, logger, user_dict: UserDict, system_dict: SystemDict):
|
def __init__(self, logger: Logger, user_dict: UserDict, system_dict: SystemDict):
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.dictionaries = []
|
self.dictionaries = []
|
||||||
self.user_dict = user_dict
|
self.user_dict = user_dict
|
||||||
self.system_dict = system_dict
|
self.system_dict = system_dict
|
||||||
|
|
||||||
|
self.unigram_score = marisa_trie.RecordTrie('@f')
|
||||||
|
self.unigram_score.load(f"{MODEL_DIR}/jawiki.1gram")
|
||||||
|
|
||||||
|
self.bigram_score = marisa_trie.RecordTrie('@f')
|
||||||
|
self.bigram_score.load(f"{MODEL_DIR}/jawiki.2gram")
|
||||||
|
|
||||||
def convert(self, src):
|
def convert(self, src):
|
||||||
hiragana: str = combromkan.to_hiragana(src)
|
hiragana: str = combromkan.to_hiragana(src)
|
||||||
katakana: str = jaconv.hira2kata(hiragana)
|
katakana: str = jaconv.hira2kata(hiragana)
|
||||||
|
|
||||||
|
self.logger.info(f"convert: src={src} hiragana={hiragana} katakana={katakana}")
|
||||||
|
|
||||||
candidates = [[hiragana, hiragana]]
|
candidates = [[hiragana, hiragana]]
|
||||||
|
|
||||||
for e in self.user_dict.get_candidates(src, hiragana):
|
for e in self.user_dict.get_candidates(src, hiragana):
|
||||||
if e not in candidates:
|
if e not in candidates:
|
||||||
candidates.append(e)
|
candidates.append(e)
|
||||||
|
|
||||||
|
try:
|
||||||
|
ht = dict(lookup(hiragana, self.system_dict))
|
||||||
|
graph = graph_construct(hiragana, ht, self.unigram_score, self.bigram_score)
|
||||||
|
got = viterbi(graph, self.unigram_score)
|
||||||
|
|
||||||
|
phrase = ''.join([x.word for x in got if not x.is_eos()])
|
||||||
|
|
||||||
|
self.logger.info(f"Got phrase: {phrase}")
|
||||||
|
|
||||||
|
if [phrase, phrase] not in candidates:
|
||||||
|
candidates.append([phrase, phrase])
|
||||||
|
except:
|
||||||
|
self.logger.error(f"Cannot convert: {hiragana} {katakana}",
|
||||||
|
exc_info=True)
|
||||||
|
|
||||||
if [katakana, katakana] not in candidates:
|
if [katakana, katakana] not in candidates:
|
||||||
candidates.append([katakana, katakana])
|
candidates.append([katakana, katakana])
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ class Node:
|
|||||||
class Graph:
|
class Graph:
|
||||||
d: Dict[int, List[Node]]
|
d: Dict[int, List[Node]]
|
||||||
|
|
||||||
def __init__(self, size: int, unigram_score, bigram_score):
|
def __init__(self, size: int, unigram_score, bigram_score, logger=logging.getLogger(__name__)):
|
||||||
self.d = {
|
self.d = {
|
||||||
0: [Node(start_pos=-9999, word='<S>', yomi='<S>', unigram_score=unigram_score,
|
0: [Node(start_pos=-9999, word='<S>', yomi='<S>', unigram_score=unigram_score,
|
||||||
bigram_score=bigram_score)],
|
bigram_score=bigram_score)],
|
||||||
@ -84,12 +84,8 @@ class Graph:
|
|||||||
|
|
||||||
def __getitem__(self, item):
|
def __getitem__(self, item):
|
||||||
ary = [None for _ in range(len(self.d))]
|
ary = [None for _ in range(len(self.d))]
|
||||||
try:
|
for k in sorted(self.d.keys()):
|
||||||
for k in sorted(self.d.keys()):
|
ary[k] = self.d[k]
|
||||||
ary[k] = self.d[k]
|
|
||||||
except IndexError:
|
|
||||||
logging.error(f"Cannot get entry {self.d[15]} {k}", exc_info=True)
|
|
||||||
sys.exit(1)
|
|
||||||
return ary[item]
|
return ary[item]
|
||||||
|
|
||||||
def dump(self, path: str):
|
def dump(self, path: str):
|
||||||
@ -202,6 +198,7 @@ def main():
|
|||||||
|
|
||||||
bigram_score = marisa_trie.RecordTrie('@f')
|
bigram_score = marisa_trie.RecordTrie('@f')
|
||||||
bigram_score.load('model/jawiki.2gram')
|
bigram_score.load('model/jawiki.2gram')
|
||||||
|
|
||||||
system_dict = SystemDict()
|
system_dict = SystemDict()
|
||||||
|
|
||||||
# print(ht)
|
# print(ht)
|
||||||
|
@ -46,6 +46,7 @@ class SystemDict:
|
|||||||
trie = marisa_trie.BytesTrie()
|
trie = marisa_trie.BytesTrie()
|
||||||
trie.load(cache_file)
|
trie.load(cache_file)
|
||||||
self.trie = trie
|
self.trie = trie
|
||||||
|
self.logger.info("loaded cache dictionary")
|
||||||
return
|
return
|
||||||
|
|
||||||
self.logger.info("loading dictionaries")
|
self.logger.info("loading dictionaries")
|
||||||
|
@ -35,8 +35,14 @@ user_dict = UserDict(os.path.join(configdir, 'user-dict.txt'), logging.getLogger
|
|||||||
logging.info("Loaded user dictionary")
|
logging.info("Loaded user dictionary")
|
||||||
|
|
||||||
system_dict = SystemDict()
|
system_dict = SystemDict()
|
||||||
|
logging.info("Loaded system dictionary")
|
||||||
|
|
||||||
comb = Comb(logging.getLogger('Comb'), user_dict, system_dict)
|
try:
|
||||||
|
comb = Comb(logging.getLogger('Comb'), user_dict, system_dict)
|
||||||
|
logging.info("Finished Comb.")
|
||||||
|
except:
|
||||||
|
logging.error("Cannot initialize.", exc_info=True)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
|
Reference in New Issue
Block a user