merge 2 system_language_model files into 1 file

2025-08-23 15:22:21 +00:00 · 2020-09-14 09:49:48 +09:00
parent aacb9e5b8d
commit 1c061b367d
6 changed files with 39 additions and 36 deletions
--- a/model/bin/create-system_language_model-from-json.py
+++ b/model/bin/create-system_language_model-from-json.py
@ -0,0 +1,58 @@
+import json
+import math
+import re
+import time
+
+import marisa_trie
+
+# Build the language model from jawiki.1gram.json and jawiki.2gram.json.
+
+SPACES = re.compile(r'\s+')  # One or more whitespace characters; not referenced in the visible part of this script.
+
+BIGRAM_CUTOFF = 3  # Bigrams whose count is <= this value are left out of the model.
+
+
+def write_model():
+    """Build ``system_language_model.trie`` from the jawiki n-gram counts.
+
+    Reads ``jawiki.1gram.json`` (``{word: count}``) and ``jawiki.2gram.json``
+    (``{word1: {word2: count}}``) from the current working directory and
+    stores one score per key in a single ``marisa_trie.RecordTrie`` using the
+    record format ``'<f'``:
+
+    * unigram key ``word``: ``log10(count / sum of all unigram counts)``
+    * bigram key ``word1<TAB>word2``:
+      ``log10(count / sum of all bigram counts that start with word1)``
+
+    Bigrams whose count is ``BIGRAM_CUTOFF`` or less are not written, but
+    they still contribute to the denominator of the remaining bigrams.
+
+    Raises:
+        OSError: if an input file cannot be opened.
+        ValueError: if a unigram count is not positive (``math.log10``).
+    """
+    records = []
+
+    print('# 1gram')
+    # The counts come from Japanese text; decode explicitly as UTF-8 instead
+    # of relying on the platform's default encoding.
+    with open('jawiki.1gram.json', encoding='utf-8') as fp:
+        unigrams = json.load(fp)
+
+    total = sum(unigrams.values())
+    for word in sorted(unigrams):
+        score = math.log10(unigrams[word] / total)
+        records.append((word, (score,)))
+
+    print('# 2gram')
+    with open('jawiki.2gram.json', 'r', encoding='utf-8') as fp:
+        bigrams = json.load(fp)
+
+    for word1, followers in bigrams.items():
+        # Normalise over every bigram starting with word1, including the
+        # rare ones that are dropped just below.
+        total = sum(followers.values())
+
+        for word2, count in followers.items():
+            if count <= BIGRAM_CUTOFF:
+                continue
+
+            score = math.log10(count / total)
+            records.append((f"{word1}\t{word2}", (score,)))
+
+    trie = marisa_trie.RecordTrie('<f', records)
+    fname = 'system_language_model.trie'
+    print(f"writing {fname}. size={len(records)}")
+    trie.save(fname)
+
+
+def main():
+    """Run write_model() and print the wall-clock time it took, in seconds."""
+    started_at = time.time()
+    write_model()
+    elapsed = time.time() - started_at
+    print(f"Elapsed: {elapsed} seconds")
+
+
+if __name__ == '__main__':  # Build the model only when run as a script.
+    main()