mirror of
https://github.com/mii443/akaza.git
synced 2025-08-22 23:05:26 +00:00
merge 2 system_language_model files into 1 file
This commit is contained in:
10
Makefile
10
Makefile
@ -27,8 +27,8 @@ comb/config.py: comb/config.py.in
|
|||||||
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
|
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
|
||||||
$< > $@
|
$< > $@
|
||||||
|
|
||||||
model/jawiki.1gram: model/bin/create-ngram-from-json.py
|
model/system_language_model.trie: model/bin/create-system_language_model-from-json.py
|
||||||
make -C model jawiki.1gram
|
make -C model system_language_model.trie
|
||||||
|
|
||||||
model/system_dict.trie:
|
model/system_dict.trie:
|
||||||
make -C model system_dict.trie
|
make -C model system_dict.trie
|
||||||
@ -39,8 +39,7 @@ install-dict: model/system_dict.trie
|
|||||||
|
|
||||||
install: all comb/config.py model/jawiki.1gram install-dict
|
install: all comb/config.py model/jawiki.1gram install-dict
|
||||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
|
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
|
||||||
install -m 0644 model/jawiki.1gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||||
install -m 0644 model/jawiki.2gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
|
||||||
|
|
||||||
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
|
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
|
||||||
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||||
@ -74,8 +73,7 @@ uninstall:
|
|||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.1gram
|
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.2gram
|
|
||||||
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
|
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
|
||||||
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
|
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
|
||||||
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
|
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
|
||||||
|
10
README.md
10
README.md
@ -39,6 +39,16 @@ wikipedia の全データをダウンロードして言語モデルと辞書の
|
|||||||
* 改造しやすい IME をめざす。
|
* 改造しやすい IME をめざす。
|
||||||
* 品詞を扱わなくてもよいようにした
|
* 品詞を扱わなくてもよいようにした
|
||||||
|
|
||||||
|
## ファイル形式
|
||||||
|
|
||||||
|
* system_dict.trie
|
||||||
|
* `(u'読み', u'漢字1/漢字2/漢字3'.encode('utf-8'))` で入れている。
|
||||||
|
* common prefix search している。
|
||||||
|
* system_language_model.trie
|
||||||
|
* `("漢字/かな", score)`
|
||||||
|
* `("漢字/かな\t漢字/かな", score)`
|
||||||
|
* key でそのままひく
|
||||||
|
|
||||||
## See also
|
## See also
|
||||||
|
|
||||||
* http://www.phontron.com/slides/nlp-programming-ja-bonus-01-kkc.pdf
|
* http://www.phontron.com/slides/nlp-programming-ja-bonus-01-kkc.pdf
|
||||||
|
@ -9,25 +9,21 @@ DEFAULT_SCORE = [(math.log10(0.00000000001),)]
|
|||||||
|
|
||||||
|
|
||||||
class SystemLanguageModel:
|
class SystemLanguageModel:
|
||||||
def __init__(self, unigram_score: marisa_trie.RecordTrie, bigram_score: marisa_trie.RecordTrie):
|
def __init__(self, score: marisa_trie.RecordTrie):
|
||||||
self.unigram_score = unigram_score
|
self.score = score
|
||||||
self.bigram_score = bigram_score
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create():
|
def create():
|
||||||
unigram_score = marisa_trie.RecordTrie('@f')
|
score = marisa_trie.RecordTrie('@f')
|
||||||
unigram_score.mmap(f"{MODEL_DIR}/jawiki.1gram")
|
score.mmap(f"{MODEL_DIR}/system_language_model.trie")
|
||||||
|
|
||||||
bigram_score = marisa_trie.RecordTrie('@f')
|
return SystemLanguageModel(score)
|
||||||
bigram_score.mmap(f"{MODEL_DIR}/jawiki.2gram")
|
|
||||||
|
|
||||||
return SystemLanguageModel(unigram_score, bigram_score)
|
|
||||||
|
|
||||||
def get_unigram_cost(self, key: str) -> float:
|
def get_unigram_cost(self, key: str) -> float:
|
||||||
return self.unigram_score.get(key, DEFAULT_SCORE)[0][0]
|
return self.score.get(key, DEFAULT_SCORE)[0][0]
|
||||||
|
|
||||||
def get_bigram_cost(self, node1: Node, node2: Node) -> float:
|
def get_bigram_cost(self, node1: Node, node2: Node) -> float:
|
||||||
key1 = node1.get_key()
|
key1 = node1.get_key()
|
||||||
key2 = node2.get_key()
|
key2 = node2.get_key()
|
||||||
key = key1 + "\t" + key2
|
key = key1 + "\t" + key2
|
||||||
return self.bigram_score.get(key, DEFAULT_SCORE)[0][0]
|
return self.score.get(key, DEFAULT_SCORE)[0][0]
|
||||||
|
1
model/.gitignore
vendored
1
model/.gitignore
vendored
@ -12,3 +12,4 @@
|
|||||||
/jawiki.1gram.json
|
/jawiki.1gram.json
|
||||||
/jawiki.2gram.json
|
/jawiki.2gram.json
|
||||||
/system_dict.trie
|
/system_dict.trie
|
||||||
|
/system_language_model.trie
|
@ -25,8 +25,8 @@ jawiki.vocab: jawiki.wfreq
|
|||||||
jawiki.1gram.json: jawiki.vocab bin/dumpngram.py
|
jawiki.1gram.json: jawiki.vocab bin/dumpngram.py
|
||||||
python bin/dumpngram.py jawiki.vocab
|
python bin/dumpngram.py jawiki.vocab
|
||||||
|
|
||||||
jawiki.1gram: jawiki.1gram.json jawiki.vocab bin/create-ngram-from-json.py
|
system_language_model.trie: jawiki.1gram.json jawiki.2gram.json jawiki.vocab bin/create-system_language_model-from-json.py
|
||||||
python bin/create-ngram-from-json.py
|
python bin/create-system_language_model-from-json.py
|
||||||
|
|
||||||
system_dict.trie: jawiki.vocab
|
system_dict.trie: jawiki.vocab
|
||||||
python bin/make-system-dict.py
|
python bin/make-system-dict.py
|
||||||
|
@ -5,54 +5,52 @@ import time
|
|||||||
|
|
||||||
import marisa_trie
|
import marisa_trie
|
||||||
|
|
||||||
# とりあえずでつくった、1gram のデータをダスやつ。
|
# jawiki.1gram.json/jawiki.2gram.json から言語モデルを出力する。
|
||||||
|
|
||||||
SPACES = re.compile(r'\s+')
|
SPACES = re.compile(r'\s+')
|
||||||
|
|
||||||
BIGRAM_CUTOFF = 1
|
BIGRAM_CUTOFF = 3
|
||||||
|
|
||||||
|
|
||||||
def write_1gram():
|
def write_model():
|
||||||
# unigram かいていく
|
# bigram かいていく
|
||||||
retval = []
|
retval = []
|
||||||
|
|
||||||
|
print('# 1gram')
|
||||||
with open('jawiki.1gram.json') as fp:
|
with open('jawiki.1gram.json') as fp:
|
||||||
data = json.load(fp)
|
data = json.load(fp)
|
||||||
|
|
||||||
total = sum(data.values())
|
total = sum(data.values())
|
||||||
|
|
||||||
for word, count in data.items():
|
for word in sorted(data.keys()):
|
||||||
|
count = data[word]
|
||||||
score = math.log10(count / total)
|
score = math.log10(count / total)
|
||||||
|
|
||||||
retval.append((word, (float(score),),))
|
retval.append((word, (float(score),),))
|
||||||
|
|
||||||
trie = marisa_trie.RecordTrie('<f', retval)
|
print('# 2gram')
|
||||||
fname = 'jawiki.1gram'
|
|
||||||
print(f"writing {fname}. size={len(retval)}")
|
|
||||||
trie.save(fname)
|
|
||||||
|
|
||||||
|
|
||||||
def write_2gram():
|
|
||||||
# bigram かいていく
|
|
||||||
retval = []
|
|
||||||
with open('jawiki.2gram.json', 'r') as fp:
|
with open('jawiki.2gram.json', 'r') as fp:
|
||||||
data = json.load(fp)
|
data = json.load(fp)
|
||||||
|
|
||||||
for word1, word2data in data.items():
|
for word1, word2data in data.items():
|
||||||
total = sum(word2data.values())
|
total = sum(word2data.values())
|
||||||
|
|
||||||
for word2, count in word2data.items():
|
for word2, count in word2data.items():
|
||||||
if count <= BIGRAM_CUTOFF:
|
if count <= BIGRAM_CUTOFF:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
score = math.log10(count / total)
|
score = math.log10(count / total)
|
||||||
retval.append((f"{word1}\t{word2}", (float(score),),))
|
retval.append((f"{word1}\t{word2}", (float(score),),))
|
||||||
|
|
||||||
trie = marisa_trie.RecordTrie('<f', retval)
|
trie = marisa_trie.RecordTrie('<f', retval)
|
||||||
fname = 'jawiki.2gram'
|
fname = 'system_language_model.trie'
|
||||||
print(f"writing {fname}. size={len(retval)}")
|
print(f"writing {fname}. size={len(retval)}")
|
||||||
trie.save(fname)
|
trie.save(fname)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
write_1gram()
|
write_model()
|
||||||
write_2gram()
|
|
||||||
print(f"Elapsed: {time.time() - t0} seconds")
|
print(f"Elapsed: {time.time() - t0} seconds")
|
||||||
|
|
||||||
|
|
Reference in New Issue
Block a user