mirror of
https://github.com/mii443/akaza.git
synced 2025-08-23 15:22:21 +00:00
merge 2 system_language_model files into 1 file
This commit is contained in:
58
model/bin/create-system_language_model-from-json.py
Normal file
58
model/bin/create-system_language_model-from-json.py
Normal file
@ -0,0 +1,58 @@
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
|
||||
import marisa_trie
|
||||
|
||||
# Build the language model from jawiki.1gram.json and jawiki.2gram.json.
|
||||
|
||||
# Pattern matching a run of one or more whitespace characters.
SPACES = re.compile(r'\s+')
|
||||
|
||||
# Bigrams whose count is less than or equal to this value are left out of
# the generated model.
BIGRAM_CUTOFF = 3
|
||||
|
||||
|
||||
def write_model():
    """Build ``system_language_model.trie`` from the jawiki n-gram counts.

    Reads ``jawiki.1gram.json`` (a mapping of word -> count) and
    ``jawiki.2gram.json`` (a mapping of word1 -> {word2 -> count}) from the
    current working directory, converts every count into the base-10
    logarithm of its relative frequency, and saves all entries into one
    ``marisa_trie.RecordTrie`` using the record format ``'<f'``.

    Unigram keys are the word itself; bigram keys are the two words joined
    by a single tab character.

    Raises:
        ValueError: from ``math.log10`` if a unigram count is zero.
    """
    entries = []

    print('# 1gram')
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open('jawiki.1gram.json', encoding='utf-8') as fp:
        unigrams = json.load(fp)

    unigram_total = sum(unigrams.values())
    # Unigrams are appended in sorted key order.
    for word in sorted(unigrams):
        score = math.log10(unigrams[word] / unigram_total)
        entries.append((word, (score,)))

    print('# 2gram')
    with open('jawiki.2gram.json', encoding='utf-8') as fp:
        bigrams = json.load(fp)

    for word1, followers in bigrams.items():
        # The denominator covers every follower of word1, including the
        # ones dropped by the cutoff below.
        follower_total = sum(followers.values())

        for word2, count in followers.items():
            if count <= BIGRAM_CUTOFF:
                continue

            score = math.log10(count / follower_total)
            entries.append((f"{word1}\t{word2}", (score,)))

    trie = marisa_trie.RecordTrie('<f', entries)
    fname = 'system_language_model.trie'
    print(f"writing {fname}. size={len(entries)}")
    trie.save(fname)
|
||||
|
||||
|
||||
def main():
    """Run ``write_model()`` and print the elapsed time in seconds."""
    # perf_counter() is monotonic, so the measured duration is not affected
    # by system clock adjustments made while the model is being built.
    started = time.perf_counter()
    write_model()
    print(f"Elapsed: {time.perf_counter() - started} seconds")
|
||||
|
||||
|
||||
# Script entry point: build the model only when executed directly, not when
# this module is imported.
if __name__ == '__main__':
    main()
|
Reference in New Issue
Block a user