merge 2 system_language_model files into 1 file

2025-08-22 14:55:31 +00:00 · 2020-09-14 09:49:48 +09:00
parent aacb9e5b8d
commit 1c061b367d
6 changed files with 39 additions and 36 deletions
--- a/10
+++ b/10
@ -27,8 +27,8 @@ comb/config.py: comb/config.py.in
 	    -e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
 		$< > $@

-model/jawiki.1gram: model/bin/create-ngram-from-json.py
-	make -C model jawiki.1gram
+model/system_language_model.trie: model/bin/create-system_language_model-from-json.py
+	make -C model system_language_model.trie

 model/system_dict.trie:
 	make -C model system_dict.trie
@ -39,8 +39,7 @@ install-dict: model/system_dict.trie

 install: all comb/config.py model/jawiki.1gram install-dict
 	install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
-	install -m 0644 model/jawiki.1gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
-	install -m 0644 model/jawiki.2gram $(DESTDIR)$(DATADIR)/ibus-comb/model/
+	install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/

 	install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
 	install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
@ -74,8 +73,7 @@ uninstall:
 	rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
 	rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
 	rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
-	rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.1gram
-	rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/jawiki.2gram
+	rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie
 	rmdir $(DESTDIR)$(DATADIR)/ibus-comb
 	rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
 	rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
--- a/README.md
+++ b/README.md
@ -39,6 +39,16 @@ wikipedia の全データをダウンロードして言語モデルと辞書の
   * 改造しやすい IME をめざす。
 * 品詞を扱わなくてもよいようにした

+## ファイル形式
+
+ * system_dict.trie
+   * `(u'読み', u'漢字1/漢字2/漢字3'.encode('utf-8'))` で入れている。
+   * common prefix search している。
+ * system_language_model.trie
+   * `("漢字/かな", score)`
+   * `("漢字/かな\t漢字/かな", score)`
+   * key でそのままひく
+
 ## See also

 * http://www.phontron.com/slides/nlp-programming-ja-bonus-01-kkc.pdf
--- a/comb/system_language_model.py
+++ b/comb/system_language_model.py
@ -9,25 +9,21 @@ DEFAULT_SCORE = [(math.log10(0.00000000001),)]


 class SystemLanguageModel:
-    def __init__(self, unigram_score: marisa_trie.RecordTrie, bigram_score: marisa_trie.RecordTrie):
-        self.unigram_score = unigram_score
-        self.bigram_score = bigram_score
+    def __init__(self, score: marisa_trie.RecordTrie):
+        self.score = score

    @staticmethod
    def create():
-        unigram_score = marisa_trie.RecordTrie('@f')
-        unigram_score.mmap(f"{MODEL_DIR}/jawiki.1gram")
+        score = marisa_trie.RecordTrie('@f')
+        score.mmap(f"{MODEL_DIR}/system_language_model.trie")

-        bigram_score = marisa_trie.RecordTrie('@f')
-        bigram_score.mmap(f"{MODEL_DIR}/jawiki.2gram")
-
-        return SystemLanguageModel(unigram_score, bigram_score)
+        return SystemLanguageModel(score)

    def get_unigram_cost(self, key: str) -> float:
-        return self.unigram_score.get(key, DEFAULT_SCORE)[0][0]
+        return self.score.get(key, DEFAULT_SCORE)[0][0]

    def get_bigram_cost(self, node1: Node, node2: Node) -> float:
        key1 = node1.get_key()
        key2 = node2.get_key()
        key = key1 + "\t" + key2
-        return self.bigram_score.get(key, DEFAULT_SCORE)[0][0]
+        return self.score.get(key, DEFAULT_SCORE)[0][0]
--- a/model/.gitignore
+++ b/model/.gitignore
@ -12,3 +12,4 @@
 /jawiki.1gram.json
 /jawiki.2gram.json
 /system_dict.trie
+/system_language_model.trie
--- a/model/Makefile
+++ b/model/Makefile
@ -25,8 +25,8 @@ jawiki.vocab: jawiki.wfreq
 jawiki.1gram.json: jawiki.vocab  bin/dumpngram.py
 	python bin/dumpngram.py jawiki.vocab

-jawiki.1gram: jawiki.1gram.json jawiki.vocab bin/create-ngram-from-json.py
-	python bin/create-ngram-from-json.py
+system_language_model.trie: jawiki.1gram.json jawiki.2gram.json jawiki.vocab bin/create-system_language_model-from-json.py
+	python bin/create-system_language_model-from-json.py

 system_dict.trie: jawiki.vocab
 	python bin/make-system-dict.py
--- a/model/bin/create-system_language_model-from-json.py
+++ b/model/bin/create-system_language_model-from-json.py
@ -5,54 +5,52 @@ import time

 import marisa_trie

-# とりあえずでつくった、1gram のデータをダスやつ。
+# jawiki.1gram.json/jawiki.2gram.json から言語モデルを出力する。

 SPACES = re.compile(r'\s+')

-BIGRAM_CUTOFF = 1
+BIGRAM_CUTOFF = 3


-def write_1gram():
-    # unigram かいていく
+def write_model():
+    # Collect the unigram scores first, then the bigram scores, into one list.
    retval = []
+
+    print('# 1gram')
    with open('jawiki.1gram.json') as fp:
        data = json.load(fp)

        total = sum(data.values())

-        for word, count in data.items():
+        for word in sorted(data.keys()):
+            count = data[word]
            score = math.log10(count / total)
+
            retval.append((word, (float(score),),))

-    trie = marisa_trie.RecordTrie('<f', retval)
-    fname = 'jawiki.1gram'
-    print(f"writing {fname}. size={len(retval)}")
-    trie.save(fname)
-
-
-def write_2gram():
-    # bigram かいていく
-    retval = []
+    print('# 2gram')
    with open('jawiki.2gram.json', 'r') as fp:
        data = json.load(fp)
+
        for word1, word2data in data.items():
            total = sum(word2data.values())
+
            for word2, count in word2data.items():
                if count <= BIGRAM_CUTOFF:
                    continue
+
                score = math.log10(count / total)
                retval.append((f"{word1}\t{word2}", (float(score),),))

    trie = marisa_trie.RecordTrie('<f', retval)
-    fname = 'jawiki.2gram'
+    fname = 'system_language_model.trie'
    print(f"writing {fname}. size={len(retval)}")
    trie.save(fname)


 def main():
    t0 = time.time()
-    write_1gram()
-    write_2gram()
+    write_model()
    print(f"Elapsed: {time.time() - t0} seconds")