mirror of
https://github.com/mii443/akaza.git
synced 2025-08-22 14:55:31 +00:00
dump bigram data to text file
This commit is contained in:
19
.run/bigram.run.xml
Normal file
19
.run/bigram.run.xml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
<component name="ProjectRunConfigurationManager">
|
||||||
|
<configuration default="false" name="bigram" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
|
||||||
|
<option name="command" value="run --package akaza-data --bin akaza-data -- make-stats-system-bigram-lm --threshold=30 --corpus-dirs work/jawiki/vibrato-ipadic/ --corpus-dirs work/aozora_bunko/vibrato-ipadic/ data/stats-vibrato-unigram.trie work/stats-vibrato-bigram.raw.trie -vv" />
|
||||||
|
<option name="workingDirectory" value="file://$PROJECT_DIR$/akaza-data" />
|
||||||
|
<option name="channel" value="DEFAULT" />
|
||||||
|
<option name="requiredFeatures" value="true" />
|
||||||
|
<option name="allFeatures" value="false" />
|
||||||
|
<option name="emulateTerminal" value="false" />
|
||||||
|
<option name="withSudo" value="false" />
|
||||||
|
<option name="buildTarget" value="REMOTE" />
|
||||||
|
<option name="backtrace" value="SHORT" />
|
||||||
|
<envs />
|
||||||
|
<option name="isRedirectInput" value="false" />
|
||||||
|
<option name="redirectInputPath" value="" />
|
||||||
|
<method v="2">
|
||||||
|
<option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
|
||||||
|
</method>
|
||||||
|
</configuration>
|
||||||
|
</component>
|
@ -80,7 +80,7 @@ work/stats-vibrato-bigram.raw.trie: work/stats-vibrato-unigram.raw.trie work/sta
|
|||||||
cargo run --release -- make-stats-system-bigram-lm --threshold=3 \
|
cargo run --release -- make-stats-system-bigram-lm --threshold=3 \
|
||||||
--corpus-dirs work/jawiki/vibrato-ipadic/ \
|
--corpus-dirs work/jawiki/vibrato-ipadic/ \
|
||||||
--corpus-dirs work/aozora_bunko/vibrato-ipadic/ \
|
--corpus-dirs work/aozora_bunko/vibrato-ipadic/ \
|
||||||
data/stats-vibrato-unigram.trie work/stats-vibrato-bigram.raw.trie
|
work/stats-vibrato-unigram.raw.trie work/stats-vibrato-bigram.raw.trie
|
||||||
|
|
||||||
data/stats-vibrato-bigram.trie: work/stats-vibrato-bigram.raw.trie work/stats-vibrato-unigram.raw.trie src/subcmd/learn_corpus.rs corpus/must.txt corpus/should.txt corpus/may.txt
|
data/stats-vibrato-bigram.trie: work/stats-vibrato-bigram.raw.trie work/stats-vibrato-unigram.raw.trie src/subcmd/learn_corpus.rs corpus/must.txt corpus/should.txt corpus/may.txt
|
||||||
cargo run --release -- learn-corpus \
|
cargo run --release -- learn-corpus \
|
||||||
|
@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
|
|||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use anyhow::{anyhow, Context};
|
use anyhow::{anyhow, Context};
|
||||||
|
use chrono::Local;
|
||||||
use log::info;
|
use log::info;
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
@ -34,6 +35,10 @@ pub fn make_stats_system_bigram_lm(
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|(key, (word_id, _cost))| (key.clone(), *word_id))
|
.map(|(key, (word_id, _cost))| (key.clone(), *word_id))
|
||||||
.collect::<HashMap<_, _>>();
|
.collect::<HashMap<_, _>>();
|
||||||
|
let reverse_unigram_map = unigram_map
|
||||||
|
.iter()
|
||||||
|
.map(|(key, word_id)| (*word_id, key.to_string()))
|
||||||
|
.collect::<HashMap<_, _>>();
|
||||||
|
|
||||||
// 次に、コーパスをスキャンして bigram を読み取る。
|
// 次に、コーパスをスキャンして bigram を読み取る。
|
||||||
let mut file_list: Vec<PathBuf> = Vec::new();
|
let mut file_list: Vec<PathBuf> = Vec::new();
|
||||||
@ -49,6 +54,7 @@ pub fn make_stats_system_bigram_lm(
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
// 集計した結果をマージする
|
// 集計した結果をマージする
|
||||||
|
info!("Merging");
|
||||||
let mut merged: HashMap<(i32, i32), u32> = HashMap::new();
|
let mut merged: HashMap<(i32, i32), u32> = HashMap::new();
|
||||||
for result in results {
|
for result in results {
|
||||||
let result = result?;
|
let result = result?;
|
||||||
@ -60,7 +66,27 @@ pub fn make_stats_system_bigram_lm(
|
|||||||
// スコアを計算する
|
// スコアを計算する
|
||||||
let scoremap = make_score_map(threshold, &merged);
|
let scoremap = make_score_map(threshold, &merged);
|
||||||
|
|
||||||
|
// dump bigram text file.
|
||||||
|
let dumpfname = format!(
|
||||||
|
"work/dump/bigram-{}.txt",
|
||||||
|
Local::now().format("%Y%m%d-%H%M%S")
|
||||||
|
);
|
||||||
|
println!("Dump to text file: {}", dumpfname);
|
||||||
|
let mut file = File::create(dumpfname)?;
|
||||||
|
for ((word_id1, word_id2), cnt) in &merged {
|
||||||
|
let Some(word1) = reverse_unigram_map.get(word_id1) else {
|
||||||
|
continue
|
||||||
|
};
|
||||||
|
let Some(word2) = reverse_unigram_map.get(word_id2) else {
|
||||||
|
continue
|
||||||
|
};
|
||||||
|
if *cnt > 16 {
|
||||||
|
file.write_fmt(format_args!("{}\t{}\t{}\n", cnt, word1, word2))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 結果を書き込む
|
// 結果を書き込む
|
||||||
|
info!("Generating trie file");
|
||||||
let mut builder = MarisaSystemBigramLMBuilder::default();
|
let mut builder = MarisaSystemBigramLMBuilder::default();
|
||||||
for ((word_id1, word_id2), score) in scoremap {
|
for ((word_id1, word_id2), score) in scoremap {
|
||||||
builder.add(word_id1, word_id2, score);
|
builder.add(word_id1, word_id2, score);
|
||||||
@ -77,6 +103,7 @@ pub fn make_stats_system_bigram_lm(
|
|||||||
builder.set_default_edge_cost(default_edge_cost);
|
builder.set_default_edge_cost(default_edge_cost);
|
||||||
info!("Default score for 0: {}", default_edge_cost);
|
info!("Default score for 0: {}", default_edge_cost);
|
||||||
}
|
}
|
||||||
|
info!("Writing {}", bigram_trie_file);
|
||||||
builder.save(bigram_trie_file)?;
|
builder.save(bigram_trie_file)?;
|
||||||
|
|
||||||
validation(unigram_trie_file, bigram_trie_file)?;
|
validation(unigram_trie_file, bigram_trie_file)?;
|
||||||
@ -112,6 +139,8 @@ fn count_bigram(
|
|||||||
if words.len() < 2 {
|
if words.len() < 2 {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
// スライドしながらよんでいくので、同じ単語を二回ひかなくていいように
|
||||||
|
// 調整する
|
||||||
let word_ids = words
|
let word_ids = words
|
||||||
.iter()
|
.iter()
|
||||||
.map(|word| unigram_lm.get(&word.to_string()))
|
.map(|word| unigram_lm.get(&word.to_string()))
|
||||||
@ -124,6 +153,13 @@ fn count_bigram(
|
|||||||
let Some(word_id2) = word_ids[i + 1] else {
|
let Some(word_id2) = word_ids[i + 1] else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
// info!(
|
||||||
|
// "Register {}={}/{}={}",
|
||||||
|
// words[i],
|
||||||
|
// word_id1,
|
||||||
|
// words[i + 1],
|
||||||
|
// word_id2
|
||||||
|
// );
|
||||||
*map.entry((*word_id1, *word_id2)).or_insert(0) += 1;
|
*map.entry((*word_id1, *word_id2)).or_insert(0) += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user