merge 2 system_language_model files into 1 file

This commit is contained in:
Tokuhiro Matsuno
2020-09-14 09:49:48 +09:00
parent aacb9e5b8d
commit 1c061b367d
6 changed files with 39 additions and 36 deletions

View File

@ -27,8 +27,8 @@ comb/config.py: comb/config.py.in
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
$< > $@
model/jawiki.1gram: model/bin/create-ngram-from-json.py
make -C model jawiki.1gram
model/system_language_model.trie: model/bin/create-system_language_model-from-json.py
make -C model system_language_model.trie
model/system_dict.trie:
make -C model system_dict.trie
@ -39,8 +39,7 @@ install-dict: model/system_dict.trie
install: all comb/config.py model/jawiki.1gram install-dict
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
install -m 0644 model/jawiki.1gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
install -m 0644 model/jawiki.2gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
@ -74,8 +73,7 @@ uninstall:
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.1gram
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.2gram
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml

View File

@ -39,6 +39,16 @@ wikipedia の全データをダウンロードして言語モデルと辞書の
* 改造しやすい IME をめざす。
* 品詞を扱わなくてもよいようにした
## ファイル形式
* system_dict.trie
  * `(u'読み', u'漢字1/漢字2/漢字3'.encode('utf-8'))` の形式で入れている。
  * common prefix search で検索している。
* system_language_model.trie
  * unigram: `("漢字/かな", score)`
  * bigram: `("漢字/かな\t漢字/かな", score)`
  * どちらも key をそのまま使って引く(完全一致)。
## See also
* http://www.phontron.com/slides/nlp-programming-ja-bonus-01-kkc.pdf

View File

@ -9,25 +9,21 @@ DEFAULT_SCORE = [(math.log10(0.00000000001),)]
class SystemLanguageModel:
def __init__(self, unigram_score: marisa_trie.RecordTrie, bigram_score: marisa_trie.RecordTrie):
self.unigram_score = unigram_score
self.bigram_score = bigram_score
def __init__(self, score: marisa_trie.RecordTrie):
self.score = score
@staticmethod
def create():
unigram_score = marisa_trie.RecordTrie('@f')
unigram_score.mmap(f"{MODEL_DIR}/jawiki.1gram")
score = marisa_trie.RecordTrie('@f')
score.mmap(f"{MODEL_DIR}/system_language_model.trie")
bigram_score = marisa_trie.RecordTrie('@f')
bigram_score.mmap(f"{MODEL_DIR}/jawiki.2gram")
return SystemLanguageModel(unigram_score, bigram_score)
return SystemLanguageModel(score)
def get_unigram_cost(self, key: str) -> float:
return self.unigram_score.get(key, DEFAULT_SCORE)[0][0]
return self.score.get(key, DEFAULT_SCORE)[0][0]
def get_bigram_cost(self, node1: Node, node2: Node) -> float:
key1 = node1.get_key()
key2 = node2.get_key()
key = key1 + "\t" + key2
return self.bigram_score.get(key, DEFAULT_SCORE)[0][0]
return self.score.get(key, DEFAULT_SCORE)[0][0]

1
model/.gitignore vendored
View File

@ -12,3 +12,4 @@
/jawiki.1gram.json
/jawiki.2gram.json
/system_dict.trie
/system_language_model.trie

View File

@ -25,8 +25,8 @@ jawiki.vocab: jawiki.wfreq
jawiki.1gram.json: jawiki.vocab bin/dumpngram.py
python bin/dumpngram.py jawiki.vocab
jawiki.1gram: jawiki.1gram.json jawiki.vocab bin/create-ngram-from-json.py
python bin/create-ngram-from-json.py
system_language_model.trie: jawiki.1gram.json jawiki.2gram.json jawiki.vocab bin/create-system_language_model-from-json.py
python bin/create-system_language_model-from-json.py
system_dict.trie: jawiki.vocab
python bin/make-system-dict.py

View File

@ -5,54 +5,52 @@ import time
import marisa_trie
# とりあえずでつくった、1gram のデータをダスやつ
# jawiki.1gram.json/jawiki.2gram.json から言語モデルを出力する
SPACES = re.compile(r'\s+')
BIGRAM_CUTOFF = 1
BIGRAM_CUTOFF = 3
def write_1gram():
# unigram かいていく
def write_model():
# bigram かいていく
retval = []
print('# 1gram')
with open('jawiki.1gram.json') as fp:
data = json.load(fp)
total = sum(data.values())
for word, count in data.items():
for word in sorted(data.keys()):
count = data[word]
score = math.log10(count / total)
retval.append((word, (float(score),),))
trie = marisa_trie.RecordTrie('<f', retval)
fname = 'jawiki.1gram'
print(f"writing {fname}. size={len(retval)}")
trie.save(fname)
def write_2gram():
# bigram かいていく
retval = []
print('# 2gram')
with open('jawiki.2gram.json', 'r') as fp:
data = json.load(fp)
for word1, word2data in data.items():
total = sum(word2data.values())
for word2, count in word2data.items():
if count <= BIGRAM_CUTOFF:
continue
score = math.log10(count / total)
retval.append((f"{word1}\t{word2}", (float(score),),))
trie = marisa_trie.RecordTrie('<f', retval)
fname = 'jawiki.2gram'
fname = 'system_language_model.trie'
print(f"writing {fname}. size={len(retval)}")
trie.save(fname)
def main():
t0 = time.time()
write_1gram()
write_2gram()
write_model()
print(f"Elapsed: {time.time() - t0} seconds")