Files
akaza/akaza-data/bin/dumpngram.py
Tokuhiro Matsuno 3ee8b9574f snapshot
2020-09-14 18:36:22 +09:00

105 lines
2.5 KiB
Python

import sys
import glob
import re
import os
import psutil
import json
import time
# jawiki.vocab と dat/*/* を元に、jawiki.1gram.json と jawiki.2gram.json を構築する。
# Usage: $0 wfreq
from typing import Set
vocabfname = sys.argv[1]
SPACES = re.compile(r'\s+')
def read_vocab():
with open(vocabfname, 'r') as fp:
return [line.rstrip() for line in fp.readlines()]
class UniGram:
def __init__(self, vocab: Set[str]):
self.d = {}
self.vocab = vocab
def register(self, word):
if word not in self.vocab:
return
if word not in self.d:
self.d[word] = 0
self.d[word] += 1
def __len__(self):
return len(self.d)
def dump(self, fname):
with open(fname, 'w') as fp:
json.dump(self.d, fp, ensure_ascii=False)
class BiGram:
wfreq: Set[str]
def __init__(self, vocab: Set[str]):
self.d = {}
self.vocab = vocab
def register(self, word1, word2):
if word1 not in self.vocab or word2 not in self.vocab:
return
if word1 not in self.d:
self.d[word1] = {}
if word2 not in self.d[word1]:
self.d[word1][word2] = 0
self.d[word1][word2] += 1
def __len__(self):
return len(self.d)
def dump(self, fname):
with open(fname, 'w') as fp:
json.dump(self.d, fp, ensure_ascii=False)
def main():
t0 = time.time()
vocab = set(read_vocab())
unigram = UniGram(vocab)
bigram = BiGram(vocab)
all_files = len(glob.glob('dat/*/*'))
file_count = 0
for fname in glob.glob('dat/*/*'):
with open(fname) as rfp:
process = psutil.Process(os.getpid())
print(f"{fname} {file_count}/{all_files} "
f"({process.memory_info().rss / 1024 / 1024} MB): {time.time() - t0}."
f" unigram: {len(unigram)}. bigram: {len(bigram)}")
for line in rfp:
words = SPACES.split(line.rstrip())
for i in range(0, len(words) - 1):
unigram.register(words[i])
bigram.register(words[i], words[i + 1])
unigram.register(words[-1])
file_count += 1
print(f"Proceeded all files: {time.time() - t0}")
unigram.dump('jawiki.1gram.json')
bigram.dump('jawiki.2gram.json')
print(f"Finished: {time.time() - t0}")
# 2gram なら、全部オンメモリで5分ぐらいで終る。
main()