add kana-preferred mode for akaza-data tokenizer. (#280)

https://github.com/akaza-im/akaza-default-model/issues/14
This commit is contained in:
Tokuhiro Matsuno
2023-02-02 10:52:26 +09:00
committed by GitHub
parent d9f38b20a6
commit e8dd59eae7
4 changed files with 55 additions and 8 deletions

View File

@ -72,6 +72,8 @@ struct TokenizeArgs {
user_dict: Option<String>,
#[arg(short, long)]
system_dict: String,
#[arg(long)]
kana_preferred: bool,
src_dir: String,
dst_dir: String,
}
@ -212,6 +214,7 @@ fn main() -> anyhow::Result<()> {
opt.reader,
opt.system_dict,
opt.user_dict,
opt.kana_preferred,
opt.src_dir.as_str(),
opt.dst_dir.as_str(),
),

View File

@ -16,6 +16,7 @@ pub fn tokenize(
reader: String,
system_dict: String,
user_dict: Option<String>,
kana_preferred: bool,
src_dir: &str,
dst_dir: &str,
) -> anyhow::Result<()> {
@ -34,7 +35,7 @@ pub fn tokenize(
processor.process_file(
Path::new(src),
Path::new(dst),
&mut (|f| tokenizer.tokenize(f)),
&mut (|f| tokenizer.tokenize(f, kana_preferred)),
)
})
.collect::<Vec<_>>();
@ -52,7 +53,7 @@ pub fn tokenize(
processor.process_file(
Path::new(src),
Path::new(dst),
&mut (|f| tokenizer.tokenize(f)),
&mut (|f| tokenizer.tokenize(f, kana_preferred)),
)
})
.collect::<Vec<_>>();

View File

@ -1,5 +1,5 @@
pub trait AkazaTokenizer {
fn tokenize(&self, src: &str) -> anyhow::Result<String>;
fn tokenize(&self, src: &str, kana_preferred: bool) -> anyhow::Result<String>;
}
/// マージ処理に利用する為の中間表現

View File

@ -42,7 +42,7 @@ impl VibratoTokenizer {
impl AkazaTokenizer for VibratoTokenizer {
/// Vibrato を利用してファイルをアノテーションします。
fn tokenize(&self, src: &str) -> anyhow::Result<String> {
fn tokenize(&self, src: &str, kana_preferred: bool) -> anyhow::Result<String> {
let mut worker = self.tokenizer.new_worker();
worker.reset_sentence(src);
@ -70,8 +70,13 @@ impl AkazaTokenizer for VibratoTokenizer {
token.surface()
};
let yomi = kata2hira(yomi, ConvOption::default());
let surface = if should_be_kana(kana_preferred, hinshi, subhinshi) {
yomi.to_string()
} else {
token.surface().to_string()
};
let intermediate = IntermediateToken::new(
token.surface().to_string(),
surface,
yomi.to_string(),
hinshi.to_string(),
subhinshi.to_string(),
@ -85,16 +90,54 @@ impl AkazaTokenizer for VibratoTokenizer {
}
}
/// かな優先モードの処理
fn should_be_kana(kana_preferred: bool, hinshi: &str, subhinshi: &str) -> bool {
if !kana_preferred {
return false;
}
// 貴方 名詞,代名詞,一般,*,*,*,貴方,アナタ,アナタ
subhinshi == "代名詞"
// 美しい 形容詞,自立,*,*,形容詞・イ段,基本形,美しい,ウツクシイ,ウツ クシイ
|| hinshi == "形容詞"
// 到底 副詞,一般,*,*,*,*,到底,トウテイ,トーテイ
|| hinshi == "副詞"
// 及び 接続詞,*,*,*,*,*,及び,オヨビ,オヨビ
|| hinshi == "接続詞"
// 嗚呼 感動詞,*,*,*,*,*,嗚呼,アア,アー
|| hinshi == "感動詞"
// 仰ぐ 動詞,自立,*,*,五段・ガ行,基本形,仰ぐ,アオグ,アオグ
|| hinshi == "動詞"
}
#[cfg(test)]
mod tests {
use log::LevelFilter;
use super::*;
#[test]
fn test_should_be_kana() -> anyhow::Result<()> {
assert!(!should_be_kana(false, "形容詞", "自立"));
assert!(should_be_kana(true, "形容詞", "自立"));
Ok(())
}
#[test]
fn test_with_kana() -> anyhow::Result<()> {
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
let got = runner.tokenize("私の名前は中野です。", true)?;
assert_eq!(
got,
"わたし/わたし の/の 名前/なまえ は/は 中野/なかの です/です 。/。"
);
Ok(())
}
#[test]
fn test() -> anyhow::Result<()> {
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
runner.tokenize("私の名前は中野です。")?;
runner.tokenize("私の名前は中野です。", false)?;
Ok(())
}
@ -118,7 +161,7 @@ mod tests {
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
assert_eq!(
runner.tokenize("書いていたものである")?,
runner.tokenize("書いていたものである", false)?,
"書いて/かいて いた/いた もの/もの である/である"
);
Ok(())
@ -146,7 +189,7 @@ mod tests {
.try_init();
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
assert_eq!(runner.tokenize("井伊家")?, "井伊家/いいけ");
assert_eq!(runner.tokenize("井伊家", false)?, "井伊家/いいけ");
Ok(())
}
}