import json import math import re import time import marisa_trie # jawiki.1gram.json/jawiki.2gram.json から言語モデルを出力する。 SPACES = re.compile(r'\s+') BIGRAM_CUTOFF = 3 def write_model(): # bigram かいていく retval = [] print('# 1gram') with open('jawiki.1gram.json') as fp: data = json.load(fp) total = sum(data.values()) for word in sorted(data.keys()): count = data[word] score = math.log10(count / total) retval.append((word, (float(score),),)) print('# 2gram') with open('jawiki.2gram.json', 'r') as fp: data = json.load(fp) for word1, word2data in data.items(): total = sum(word2data.values()) for word2, count in word2data.items(): if count <= BIGRAM_CUTOFF: continue score = math.log10(count / total) retval.append((f"{word1}\t{word2}", (float(score),),)) trie = marisa_trie.RecordTrie('