dump bigram data to text file

This commit is contained in:
Tokuhiro Matsuno
2023-01-14 09:58:04 +09:00
parent 79bb714158
commit ac7b732b0a
3 changed files with 56 additions and 1 deletions

19
.run/bigram.run.xml Normal file
View File

@ -0,0 +1,19 @@
<!--
  IDE run configuration named "bigram" (type CargoCommandRunConfiguration).
  It runs the akaza-data binary of the akaza-data package through Cargo, with
  $PROJECT_DIR$/akaza-data as the working directory, and invokes the
  make-stats-system-bigram-lm subcommand with:
    * threshold option set to 30
    * corpus-dirs: work/jawiki/vibrato-ipadic/ and work/aozora_bunko/vibrato-ipadic/
    * positional arguments: data/stats-vibrato-unigram.trie, then
      work/stats-vibrato-bigram.raw.trie
    * -vv
  NOTE(review): the threshold value and the unigram trie path used here may differ
  from what the Makefile rule for work/stats-vibrato-bigram.raw.trie passes; confirm
  whether that is intentional.
-->
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="bigram" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
<option name="command" value="run --package akaza-data --bin akaza-data -- make-stats-system-bigram-lm --threshold=30 --corpus-dirs work/jawiki/vibrato-ipadic/ --corpus-dirs work/aozora_bunko/vibrato-ipadic/ data/stats-vibrato-unigram.trie work/stats-vibrato-bigram.raw.trie -vv" />
<option name="workingDirectory" value="file://$PROJECT_DIR$/akaza-data" />
<option name="channel" value="DEFAULT" />
<option name="requiredFeatures" value="true" />
<option name="allFeatures" value="false" />
<option name="emulateTerminal" value="false" />
<option name="withSudo" value="false" />
<option name="buildTarget" value="REMOTE" />
<option name="backtrace" value="SHORT" />
<envs />
<option name="isRedirectInput" value="false" />
<option name="redirectInputPath" value="" />
<method v="2">
<option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
</method>
</configuration>
</component>

View File

@ -80,7 +80,7 @@ work/stats-vibrato-bigram.raw.trie: work/stats-vibrato-unigram.raw.trie work/sta
cargo run --release -- make-stats-system-bigram-lm --threshold=3 \ cargo run --release -- make-stats-system-bigram-lm --threshold=3 \
--corpus-dirs work/jawiki/vibrato-ipadic/ \ --corpus-dirs work/jawiki/vibrato-ipadic/ \
--corpus-dirs work/aozora_bunko/vibrato-ipadic/ \ --corpus-dirs work/aozora_bunko/vibrato-ipadic/ \
data/stats-vibrato-unigram.trie work/stats-vibrato-bigram.raw.trie work/stats-vibrato-unigram.raw.trie work/stats-vibrato-bigram.raw.trie
data/stats-vibrato-bigram.trie: work/stats-vibrato-bigram.raw.trie work/stats-vibrato-unigram.raw.trie src/subcmd/learn_corpus.rs corpus/must.txt corpus/should.txt corpus/may.txt data/stats-vibrato-bigram.trie: work/stats-vibrato-bigram.raw.trie work/stats-vibrato-unigram.raw.trie src/subcmd/learn_corpus.rs corpus/must.txt corpus/should.txt corpus/may.txt
cargo run --release -- learn-corpus \ cargo run --release -- learn-corpus \

View File

@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
use anyhow::Result; use anyhow::Result;
use anyhow::{anyhow, Context}; use anyhow::{anyhow, Context};
use chrono::Local;
use log::info; use log::info;
use rayon::prelude::*; use rayon::prelude::*;
@ -34,6 +35,10 @@ pub fn make_stats_system_bigram_lm(
.iter() .iter()
.map(|(key, (word_id, _cost))| (key.clone(), *word_id)) .map(|(key, (word_id, _cost))| (key.clone(), *word_id))
.collect::<HashMap<_, _>>(); .collect::<HashMap<_, _>>();
let reverse_unigram_map = unigram_map
.iter()
.map(|(key, word_id)| (*word_id, key.to_string()))
.collect::<HashMap<_, _>>();
// 次に、コーパスをスキャンして bigram を読み取る。 // 次に、コーパスをスキャンして bigram を読み取る。
let mut file_list: Vec<PathBuf> = Vec::new(); let mut file_list: Vec<PathBuf> = Vec::new();
@ -49,6 +54,7 @@ pub fn make_stats_system_bigram_lm(
.collect::<Vec<_>>(); .collect::<Vec<_>>();
// 集計した結果をマージする // 集計した結果をマージする
info!("Merging");
let mut merged: HashMap<(i32, i32), u32> = HashMap::new(); let mut merged: HashMap<(i32, i32), u32> = HashMap::new();
for result in results { for result in results {
let result = result?; let result = result?;
@ -60,7 +66,27 @@ pub fn make_stats_system_bigram_lm(
// スコアを計算する // スコアを計算する
let scoremap = make_score_map(threshold, &merged); let scoremap = make_score_map(threshold, &merged);
// dump bigram text file.
let dumpfname = format!(
"work/dump/bigram-{}.txt",
Local::now().format("%Y%m%d-%H%M%S")
);
println!("Dump to text file: {}", dumpfname);
let mut file = File::create(dumpfname)?;
for ((word_id1, word_id2), cnt) in &merged {
let Some(word1) = reverse_unigram_map.get(word_id1) else {
continue
};
let Some(word2) = reverse_unigram_map.get(word_id2) else {
continue
};
if *cnt > 16 {
file.write_fmt(format_args!("{}\t{}\t{}\n", cnt, word1, word2))?;
}
}
// 結果を書き込む // 結果を書き込む
info!("Generating trie file");
let mut builder = MarisaSystemBigramLMBuilder::default(); let mut builder = MarisaSystemBigramLMBuilder::default();
for ((word_id1, word_id2), score) in scoremap { for ((word_id1, word_id2), score) in scoremap {
builder.add(word_id1, word_id2, score); builder.add(word_id1, word_id2, score);
@ -77,6 +103,7 @@ pub fn make_stats_system_bigram_lm(
builder.set_default_edge_cost(default_edge_cost); builder.set_default_edge_cost(default_edge_cost);
info!("Default score for 0: {}", default_edge_cost); info!("Default score for 0: {}", default_edge_cost);
} }
info!("Writing {}", bigram_trie_file);
builder.save(bigram_trie_file)?; builder.save(bigram_trie_file)?;
validation(unigram_trie_file, bigram_trie_file)?; validation(unigram_trie_file, bigram_trie_file)?;
@ -112,6 +139,8 @@ fn count_bigram(
if words.len() < 2 { if words.len() < 2 {
continue; continue;
} }
// スライドしながらよんでいくので、同じ単語を二回ひかなくていいように
// 調整する
let word_ids = words let word_ids = words
.iter() .iter()
.map(|word| unigram_lm.get(&word.to_string())) .map(|word| unigram_lm.get(&word.to_string()))
@ -124,6 +153,13 @@ fn count_bigram(
let Some(word_id2) = word_ids[i + 1] else { let Some(word_id2) = word_ids[i + 1] else {
continue; continue;
}; };
// info!(
// "Register {}={}/{}={}",
// words[i],
// word_id1,
// words[i + 1],
// word_id2
// );
*map.entry((*word_id1, *word_id2)).or_insert(0) += 1; *map.entry((*word_id1, *word_id2)).or_insert(0) += 1;
} }
} }