Files
akaza/akaza-data/bin/create-system_language_model-from-json.py
Tokuhiro Matsuno 3ee8b9574f snapshot
2020-09-14 18:36:22 +09:00

59 lines
1.3 KiB
Python

import json
import math
import re
import time
import marisa_trie
# Build the language model from jawiki.1gram.json / jawiki.2gram.json.
# Matches one or more consecutive whitespace characters.
SPACES = re.compile(r'\s+')
# Bigrams whose count is at or below this value are left out of the model.
BIGRAM_CUTOFF = 3
def _load_json(path):
    """Load the JSON document stored at *path*.

    The file is decoded explicitly as UTF-8 so that reading the Japanese
    keys does not depend on the platform's locale encoding.
    """
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)


def _unigram_records(counts):
    """Turn a ``{word: count}`` mapping into trie records.

    Each record is ``(word, (score,))`` where ``score`` is
    ``log10(count / total)`` and ``total`` is the sum of all counts.
    Records are produced in sorted word order.
    """
    total = sum(counts.values())
    return [
        (word, (math.log10(counts[word] / total),))
        for word in sorted(counts)
    ]


def _bigram_records(counts, cutoff):
    """Turn a ``{word1: {word2: count}}`` mapping into trie records.

    Each record is ``(key, (score,))`` where ``key`` is ``word1`` and
    ``word2`` joined by a tab character and ``score`` is
    ``log10(count / total)``; ``total`` is the sum of the counts of every
    word that follows ``word1``. Pairs whose count is at or below *cutoff*
    are skipped.
    """
    records = []
    for word1, followers in counts.items():
        # The total is taken before the cutoff is applied, so it still
        # includes the counts of the pairs that are skipped below.
        total = sum(followers.values())
        for word2, count in followers.items():
            if count <= cutoff:
                continue
            records.append(
                (f"{word1}\t{word2}", (math.log10(count / total),))
            )
    return records


def write_model():
    """Build the language-model trie from the n-gram counts and save it.

    Reads ``jawiki.1gram.json`` and ``jawiki.2gram.json`` from the current
    working directory, scores every unigram and every bigram above
    ``BIGRAM_CUTOFF``, and writes the result to
    ``system_language_model.trie``. Progress is reported on stdout.
    """
    print('# 1gram')
    records = _unigram_records(_load_json('jawiki.1gram.json'))

    print('# 2gram')
    records.extend(
        _bigram_records(_load_json('jawiki.2gram.json'), BIGRAM_CUTOFF)
    )

    # '<f' is a struct format: one little-endian 32-bit float per record.
    trie = marisa_trie.RecordTrie('<f', records)
    fname = 'system_language_model.trie'
    print(f"writing {fname}. size={len(records)}")
    trie.save(fname)
def main():
    """Run ``write_model`` and print how long it took in seconds."""
    started_at = time.time()
    write_model()
    elapsed = time.time() - started_at
    print(f"Elapsed: {elapsed} seconds")
# Run the model build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()