diff --git a/.run/bigram.run.xml b/.run/bigram.run.xml
new file mode 100644
index 0000000..5119053
--- /dev/null
+++ b/.run/bigram.run.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/akaza-data/Makefile b/akaza-data/Makefile
index 4669105..cea4ad5 100644
--- a/akaza-data/Makefile
+++ b/akaza-data/Makefile
@@ -80,7 +80,7 @@ work/stats-vibrato-bigram.raw.trie: work/stats-vibrato-unigram.raw.trie work/sta
cargo run --release -- make-stats-system-bigram-lm --threshold=3 \
--corpus-dirs work/jawiki/vibrato-ipadic/ \
--corpus-dirs work/aozora_bunko/vibrato-ipadic/ \
- data/stats-vibrato-unigram.trie work/stats-vibrato-bigram.raw.trie
+ work/stats-vibrato-unigram.raw.trie work/stats-vibrato-bigram.raw.trie
data/stats-vibrato-bigram.trie: work/stats-vibrato-bigram.raw.trie work/stats-vibrato-unigram.raw.trie src/subcmd/learn_corpus.rs corpus/must.txt corpus/should.txt corpus/may.txt
cargo run --release -- learn-corpus \
diff --git a/akaza-data/src/subcmd/make_stats_system_bigram_lm.rs b/akaza-data/src/subcmd/make_stats_system_bigram_lm.rs
index d8bbf13..6d82f3a 100644
--- a/akaza-data/src/subcmd/make_stats_system_bigram_lm.rs
+++ b/akaza-data/src/subcmd/make_stats_system_bigram_lm.rs
@@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
use anyhow::Result;
use anyhow::{anyhow, Context};
+use chrono::Local;
use log::info;
use rayon::prelude::*;
@@ -34,6 +35,10 @@ pub fn make_stats_system_bigram_lm(
.iter()
.map(|(key, (word_id, _cost))| (key.clone(), *word_id))
.collect::>();
+ let reverse_unigram_map = unigram_map
+ .iter()
+ .map(|(key, word_id)| (*word_id, key.to_string()))
+ .collect::>();
// 次に、コーパスをスキャンして bigram を読み取る。
let mut file_list: Vec = Vec::new();
@@ -49,6 +54,7 @@ pub fn make_stats_system_bigram_lm(
.collect::>();
// 集計した結果をマージする
+ info!("Merging");
let mut merged: HashMap<(i32, i32), u32> = HashMap::new();
for result in results {
let result = result?;
@@ -60,7 +66,27 @@ pub fn make_stats_system_bigram_lm(
// スコアを計算する
let scoremap = make_score_map(threshold, &merged);
+ // dump bigram text file.
+ let dumpfname = format!(
+ "work/dump/bigram-{}.txt",
+ Local::now().format("%Y%m%d-%H%M%S")
+ );
+ println!("Dump to text file: {}", dumpfname);
+ let mut file = File::create(dumpfname)?;
+ for ((word_id1, word_id2), cnt) in &merged {
+ let Some(word1) = reverse_unigram_map.get(word_id1) else {
+ continue
+ };
+ let Some(word2) = reverse_unigram_map.get(word_id2) else {
+ continue
+ };
+ if *cnt > 16 {
+ file.write_fmt(format_args!("{}\t{}\t{}\n", cnt, word1, word2))?;
+ }
+ }
+
// 結果を書き込む
+ info!("Generating trie file");
let mut builder = MarisaSystemBigramLMBuilder::default();
for ((word_id1, word_id2), score) in scoremap {
builder.add(word_id1, word_id2, score);
@@ -77,6 +103,7 @@ pub fn make_stats_system_bigram_lm(
builder.set_default_edge_cost(default_edge_cost);
info!("Default score for 0: {}", default_edge_cost);
}
+ info!("Writing {}", bigram_trie_file);
builder.save(bigram_trie_file)?;
validation(unigram_trie_file, bigram_trie_file)?;
@@ -112,6 +139,8 @@ fn count_bigram(
if words.len() < 2 {
continue;
}
+ // スライドしながらよんでいくので、同じ単語を二回ひかなくていいように
+ // 調整する
let word_ids = words
.iter()
.map(|word| unigram_lm.get(&word.to_string()))
@@ -124,6 +153,13 @@ fn count_bigram(
let Some(word_id2) = word_ids[i + 1] else {
continue;
};
+ // info!(
+ // "Register {}={}/{}={}",
+ // words[i],
+ // word_id1,
+ // words[i + 1],
+ // word_id2
+ // );
*map.entry((*word_id1, *word_id2)).or_insert(0) += 1;
}
}