mirror of
https://github.com/mii443/akaza.git
synced 2025-08-22 14:55:31 +00:00
Merge the two system language model files (jawiki.1gram and jawiki.2gram) into a single file (system_language_model.trie)
This commit is contained in:
10
Makefile
10
Makefile
@ -27,8 +27,8 @@ comb/config.py: comb/config.py.in
|
||||
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
|
||||
$< > $@
|
||||
|
||||
model/jawiki.1gram: model/bin/create-ngram-from-json.py
|
||||
make -C model jawiki.1gram
|
||||
model/system_language_model.trie: model/bin/create-system_language_model-from-json.py
|
||||
make -C model system_language_model.trie
|
||||
|
||||
model/system_dict.trie:
|
||||
make -C model system_dict.trie
|
||||
@ -39,8 +39,7 @@ install-dict: model/system_dict.trie
|
||||
|
||||
install: all comb/config.py model/jawiki.1gram install-dict
|
||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
|
||||
install -m 0644 model/jawiki.1gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||
install -m 0644 model/jawiki.2gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||
|
||||
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
|
||||
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
@ -74,8 +73,7 @@ uninstall:
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.1gram
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.2gram
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie
|
||||
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
|
||||
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
|
||||
|
10
README.md
10
README.md
@ -39,6 +39,16 @@ wikipedia の全データをダウンロードして言語モデルと辞書の
|
||||
* 改造しやすい IME をめざす。
|
||||
* 品詞を扱わなくてもよいようにした
|
||||
|
||||
## ファイル形式
|
||||
|
||||
* system_dict.trie
|
||||
* `(u'読み', u'漢字1/漢字2/漢字3'.encode('utf-8'))` で入れている。
|
||||
* common prefix search している。
|
||||
* system_language_model.trie
|
||||
* unigram: `("漢字/かな", score)`
|
||||
* bigram: `("漢字/かな\t漢字/かな", score)` (2つの単語キーをタブ文字で連結したもの)
|
||||
* key でそのままひく
|
||||
|
||||
## See also
|
||||
|
||||
* http://www.phontron.com/slides/nlp-programming-ja-bonus-01-kkc.pdf
|
||||
|
@ -9,25 +9,21 @@ DEFAULT_SCORE = [(math.log10(0.00000000001),)]
|
||||
|
||||
|
||||
class SystemLanguageModel:
|
||||
def __init__(self, unigram_score: marisa_trie.RecordTrie, bigram_score: marisa_trie.RecordTrie):
|
||||
self.unigram_score = unigram_score
|
||||
self.bigram_score = bigram_score
|
||||
def __init__(self, score: marisa_trie.RecordTrie):
|
||||
self.score = score
|
||||
|
||||
@staticmethod
|
||||
def create():
|
||||
unigram_score = marisa_trie.RecordTrie('@f')
|
||||
unigram_score.mmap(f"{MODEL_DIR}/jawiki.1gram")
|
||||
score = marisa_trie.RecordTrie('@f')
|
||||
score.mmap(f"{MODEL_DIR}/system_language_model.trie")
|
||||
|
||||
bigram_score = marisa_trie.RecordTrie('@f')
|
||||
bigram_score.mmap(f"{MODEL_DIR}/jawiki.2gram")
|
||||
|
||||
return SystemLanguageModel(unigram_score, bigram_score)
|
||||
return SystemLanguageModel(score)
|
||||
|
||||
def get_unigram_cost(self, key: str) -> float:
|
||||
return self.unigram_score.get(key, DEFAULT_SCORE)[0][0]
|
||||
return self.score.get(key, DEFAULT_SCORE)[0][0]
|
||||
|
||||
def get_bigram_cost(self, node1: Node, node2: Node) -> float:
|
||||
key1 = node1.get_key()
|
||||
key2 = node2.get_key()
|
||||
key = key1 + "\t" + key2
|
||||
return self.bigram_score.get(key, DEFAULT_SCORE)[0][0]
|
||||
return self.score.get(key, DEFAULT_SCORE)[0][0]
|
||||
|
1
model/.gitignore
vendored
1
model/.gitignore
vendored
@ -12,3 +12,4 @@
|
||||
/jawiki.1gram.json
|
||||
/jawiki.2gram.json
|
||||
/system_dict.trie
|
||||
/system_language_model.trie
|
@ -25,8 +25,8 @@ jawiki.vocab: jawiki.wfreq
|
||||
jawiki.1gram.json: jawiki.vocab bin/dumpngram.py
|
||||
python bin/dumpngram.py jawiki.vocab
|
||||
|
||||
jawiki.1gram: jawiki.1gram.json jawiki.vocab bin/create-ngram-from-json.py
|
||||
python bin/create-ngram-from-json.py
|
||||
system_language_model.trie: jawiki.1gram.json jawiki.2gram.json jawiki.vocab bin/create-system_language_model-from-json.py
|
||||
python bin/create-system_language_model-from-json.py
|
||||
|
||||
system_dict.trie: jawiki.vocab
|
||||
python bin/make-system-dict.py
|
||||
|
@ -5,54 +5,52 @@ import time
|
||||
|
||||
import marisa_trie
|
||||
|
||||
# とりあえずでつくった、1gram のデータをダスやつ。
|
||||
# jawiki.1gram.json/jawiki.2gram.json から言語モデルを出力する。
|
||||
|
||||
SPACES = re.compile(r'\s+')
|
||||
|
||||
BIGRAM_CUTOFF = 1
|
||||
BIGRAM_CUTOFF = 3
|
||||
|
||||
|
||||
def write_1gram():
|
||||
# unigram かいていく
|
||||
def write_model():
|
||||
# bigram かいていく
|
||||
retval = []
|
||||
|
||||
print('# 1gram')
|
||||
with open('jawiki.1gram.json') as fp:
|
||||
data = json.load(fp)
|
||||
|
||||
total = sum(data.values())
|
||||
|
||||
for word, count in data.items():
|
||||
for word in sorted(data.keys()):
|
||||
count = data[word]
|
||||
score = math.log10(count / total)
|
||||
|
||||
retval.append((word, (float(score),),))
|
||||
|
||||
trie = marisa_trie.RecordTrie('<f', retval)
|
||||
fname = 'jawiki.1gram'
|
||||
print(f"writing {fname}. size={len(retval)}")
|
||||
trie.save(fname)
|
||||
|
||||
|
||||
def write_2gram():
|
||||
# bigram かいていく
|
||||
retval = []
|
||||
print('# 2gram')
|
||||
with open('jawiki.2gram.json', 'r') as fp:
|
||||
data = json.load(fp)
|
||||
|
||||
for word1, word2data in data.items():
|
||||
total = sum(word2data.values())
|
||||
|
||||
for word2, count in word2data.items():
|
||||
if count <= BIGRAM_CUTOFF:
|
||||
continue
|
||||
|
||||
score = math.log10(count / total)
|
||||
retval.append((f"{word1}\t{word2}", (float(score),),))
|
||||
|
||||
trie = marisa_trie.RecordTrie('<f', retval)
|
||||
fname = 'jawiki.2gram'
|
||||
fname = 'system_language_model.trie'
|
||||
print(f"writing {fname}. size={len(retval)}")
|
||||
trie.save(fname)
|
||||
|
||||
|
||||
def main():
|
||||
t0 = time.time()
|
||||
write_1gram()
|
||||
write_2gram()
|
||||
write_model()
|
||||
print(f"Elapsed: {time.time() - t0} seconds")
|
||||
|
||||
|
Reference in New Issue
Block a user