mirror of
https://github.com/mii443/akaza.git
synced 2025-08-22 14:55:31 +00:00
59 lines
1.3 KiB
Python
59 lines
1.3 KiB
Python
import json
|
|
import math
|
|
import re
|
|
import time
|
|
|
|
import marisa_trie
|
|
|
|
# Builds the language model from jawiki.1gram.json / jawiki.2gram.json.

# Matches one or more consecutive whitespace characters.
# NOTE(review): not referenced in the visible part of this file.
SPACES = re.compile(r'\s+')

# Bigrams whose count is less than or equal to this value are left out of
# the model by write_model().
BIGRAM_CUTOFF = 3
|
|
|
|
|
|
def write_model():
    """Build the language model trie from the 1-gram and 2-gram count files.

    Reads ``jawiki.1gram.json`` (mapping word -> count) and
    ``jawiki.2gram.json`` (mapping word1 -> {word2 -> count}) from the
    current working directory, turns every count into a log10 relative
    frequency and stores the result in a ``marisa_trie.RecordTrie`` with the
    record format ``'<f'`` (one little-endian 32-bit float per key), saved as
    ``system_language_model.trie``.

    Keys are the bare word for unigrams and ``word1<TAB>word2`` for bigrams.
    Unigram scores are relative to the sum of all unigram counts; bigram
    scores are relative to the sum of all counts recorded for ``word1``.
    Bigrams whose count is ``BIGRAM_CUTOFF`` or less are skipped.
    """
    records = []

    print('# 1gram')
    # Decode explicitly as UTF-8: the word keys may be non-ASCII and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open('jawiki.1gram.json', encoding='utf-8') as fp:
        unigrams = json.load(fp)

    total = sum(unigrams.values())
    for word in sorted(unigrams):
        score = math.log10(unigrams[word] / total)
        records.append((word, (score,)))

    print('# 2gram')
    with open('jawiki.2gram.json', 'r', encoding='utf-8') as fp:
        bigrams = json.load(fp)

    for word1, followers in bigrams.items():
        # The denominator covers every follower of word1, including the
        # ones dropped by the cutoff below.
        total = sum(followers.values())
        for word2, count in followers.items():
            if count <= BIGRAM_CUTOFF:
                continue
            score = math.log10(count / total)
            records.append((f"{word1}\t{word2}", (score,)))

    trie = marisa_trie.RecordTrie('<f', records)
    fname = 'system_language_model.trie'
    print(f"writing {fname}. size={len(records)}")
    trie.save(fname)
|
|
|
|
|
|
def main():
    """Run write_model() and print how long it took in wall-clock seconds."""
    started_at = time.time()
    write_model()
    elapsed = time.time() - started_at
    print(f"Elapsed: {elapsed} seconds")
|
|
|
|
|
|
# Build the model only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|