diff --git a/akaza-data/src/main.rs b/akaza-data/src/main.rs index f45c5d8..3efc032 100644 --- a/akaza-data/src/main.rs +++ b/akaza-data/src/main.rs @@ -72,6 +72,8 @@ struct TokenizeArgs { user_dict: Option, #[arg(short, long)] system_dict: String, + #[arg(long)] + kana_preferred: bool, src_dir: String, dst_dir: String, } @@ -212,6 +214,7 @@ fn main() -> anyhow::Result<()> { opt.reader, opt.system_dict, opt.user_dict, + opt.kana_preferred, opt.src_dir.as_str(), opt.dst_dir.as_str(), ), diff --git a/akaza-data/src/subcmd/tokenize.rs b/akaza-data/src/subcmd/tokenize.rs index 0c359b2..42b5765 100644 --- a/akaza-data/src/subcmd/tokenize.rs +++ b/akaza-data/src/subcmd/tokenize.rs @@ -16,6 +16,7 @@ pub fn tokenize( reader: String, system_dict: String, user_dict: Option, + kana_preferred: bool, src_dir: &str, dst_dir: &str, ) -> anyhow::Result<()> { @@ -34,7 +35,7 @@ pub fn tokenize( processor.process_file( Path::new(src), Path::new(dst), - &mut (|f| tokenizer.tokenize(f)), + &mut (|f| tokenizer.tokenize(f, kana_preferred)), ) }) .collect::>(); @@ -52,7 +53,7 @@ pub fn tokenize( processor.process_file( Path::new(src), Path::new(dst), - &mut (|f| tokenizer.tokenize(f)), + &mut (|f| tokenizer.tokenize(f, kana_preferred)), ) }) .collect::>(); diff --git a/akaza-data/src/tokenizer/base.rs b/akaza-data/src/tokenizer/base.rs index f1783fd..24d1ce7 100644 --- a/akaza-data/src/tokenizer/base.rs +++ b/akaza-data/src/tokenizer/base.rs @@ -1,5 +1,5 @@ pub trait AkazaTokenizer { - fn tokenize(&self, src: &str) -> anyhow::Result; + fn tokenize(&self, src: &str, kana_preferred: bool) -> anyhow::Result; } /// マージ処理に利用する為の中間表現 diff --git a/akaza-data/src/tokenizer/vibrato.rs b/akaza-data/src/tokenizer/vibrato.rs index dd9a66a..db760ab 100644 --- a/akaza-data/src/tokenizer/vibrato.rs +++ b/akaza-data/src/tokenizer/vibrato.rs @@ -42,7 +42,7 @@ impl VibratoTokenizer { impl AkazaTokenizer for VibratoTokenizer { /// Vibrato を利用してファイルをアノテーションします。 - fn tokenize(&self, src: &str) -> 
/// Decides whether a token's surface form should be replaced by its kana reading
/// (kana-preferred mode).
///
/// # Arguments
///
/// * `kana_preferred` - master switch; when `false` the function always returns `false`.
/// * `hinshi` - part-of-speech string, compared against the first feature field of the
///   sample dictionary entries listed below.
/// * `subhinshi` - part-of-speech subcategory string, compared against the second
///   feature field of the sample dictionary entries listed below.
///
/// # Returns
///
/// `true` only when `kana_preferred` is set and the token is a pronoun
/// (`subhinshi == "代名詞"`) or its `hinshi` is one of adjective, adverb,
/// conjunction, interjection or verb.
fn should_be_kana(kana_preferred: bool, hinshi: &str, subhinshi: &str) -> bool {
    // Sample dictionary entries for each category that is written in kana:
    //   pronoun      貴方   名詞,代名詞,一般,*,*,*,貴方,アナタ,アナタ
    //   adjective    美しい 形容詞,自立,*,*,形容詞・イ段,基本形,美しい,ウツクシイ,ウツクシイ
    //   adverb       到底   副詞,一般,*,*,*,*,到底,トウテイ,トーテイ
    //   conjunction  及び   接続詞,*,*,*,*,*,及び,オヨビ,オヨビ
    //   interjection 嗚呼   感動詞,*,*,*,*,*,嗚呼,アア,アー
    //   verb         仰ぐ   動詞,自立,*,*,五段・ガ行,基本形,仰ぐ,アオグ,アオグ
    kana_preferred
        && (subhinshi == "代名詞"
            || matches!(hinshi, "形容詞" | "副詞" | "接続詞" | "感動詞" | "動詞"))
}
- runner.tokenize("書いていたものである")?, + runner.tokenize("書いていたものである", false)?, "書いて/かいて いた/いた もの/もの である/である" ); Ok(()) @@ -146,7 +189,7 @@ mod tests { .try_init(); let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?; - assert_eq!(runner.tokenize("井伊家")?, "井伊家/いいけ"); + assert_eq!(runner.tokenize("井伊家", false)?, "井伊家/いいけ"); Ok(()) } }