mirror of
https://github.com/mii443/akaza.git
synced 2025-08-22 14:55:31 +00:00
add kana-preferred mode for akaza-data tokenizer. (#280)
https://github.com/akaza-im/akaza-default-model/issues/14
This commit is contained in:
@ -72,6 +72,8 @@ struct TokenizeArgs {
|
|||||||
user_dict: Option<String>,
|
user_dict: Option<String>,
|
||||||
#[arg(short, long)]
|
#[arg(short, long)]
|
||||||
system_dict: String,
|
system_dict: String,
|
||||||
|
#[arg(long)]
|
||||||
|
kana_preferred: bool,
|
||||||
src_dir: String,
|
src_dir: String,
|
||||||
dst_dir: String,
|
dst_dir: String,
|
||||||
}
|
}
|
||||||
@ -212,6 +214,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
opt.reader,
|
opt.reader,
|
||||||
opt.system_dict,
|
opt.system_dict,
|
||||||
opt.user_dict,
|
opt.user_dict,
|
||||||
|
opt.kana_preferred,
|
||||||
opt.src_dir.as_str(),
|
opt.src_dir.as_str(),
|
||||||
opt.dst_dir.as_str(),
|
opt.dst_dir.as_str(),
|
||||||
),
|
),
|
||||||
|
@ -16,6 +16,7 @@ pub fn tokenize(
|
|||||||
reader: String,
|
reader: String,
|
||||||
system_dict: String,
|
system_dict: String,
|
||||||
user_dict: Option<String>,
|
user_dict: Option<String>,
|
||||||
|
kana_preferred: bool,
|
||||||
src_dir: &str,
|
src_dir: &str,
|
||||||
dst_dir: &str,
|
dst_dir: &str,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
@ -34,7 +35,7 @@ pub fn tokenize(
|
|||||||
processor.process_file(
|
processor.process_file(
|
||||||
Path::new(src),
|
Path::new(src),
|
||||||
Path::new(dst),
|
Path::new(dst),
|
||||||
&mut (|f| tokenizer.tokenize(f)),
|
&mut (|f| tokenizer.tokenize(f, kana_preferred)),
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
@ -52,7 +53,7 @@ pub fn tokenize(
|
|||||||
processor.process_file(
|
processor.process_file(
|
||||||
Path::new(src),
|
Path::new(src),
|
||||||
Path::new(dst),
|
Path::new(dst),
|
||||||
&mut (|f| tokenizer.tokenize(f)),
|
&mut (|f| tokenizer.tokenize(f, kana_preferred)),
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
pub trait AkazaTokenizer {
|
pub trait AkazaTokenizer {
|
||||||
fn tokenize(&self, src: &str) -> anyhow::Result<String>;
|
fn tokenize(&self, src: &str, kana_preferred: bool) -> anyhow::Result<String>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// マージ処理に利用する為の中間表現
|
/// マージ処理に利用する為の中間表現
|
||||||
|
@ -42,7 +42,7 @@ impl VibratoTokenizer {
|
|||||||
|
|
||||||
impl AkazaTokenizer for VibratoTokenizer {
|
impl AkazaTokenizer for VibratoTokenizer {
|
||||||
/// Vibrato を利用してファイルをアノテーションします。
|
/// Vibrato を利用してファイルをアノテーションします。
|
||||||
fn tokenize(&self, src: &str) -> anyhow::Result<String> {
|
fn tokenize(&self, src: &str, kana_preferred: bool) -> anyhow::Result<String> {
|
||||||
let mut worker = self.tokenizer.new_worker();
|
let mut worker = self.tokenizer.new_worker();
|
||||||
|
|
||||||
worker.reset_sentence(src);
|
worker.reset_sentence(src);
|
||||||
@ -70,8 +70,13 @@ impl AkazaTokenizer for VibratoTokenizer {
|
|||||||
token.surface()
|
token.surface()
|
||||||
};
|
};
|
||||||
let yomi = kata2hira(yomi, ConvOption::default());
|
let yomi = kata2hira(yomi, ConvOption::default());
|
||||||
|
let surface = if should_be_kana(kana_preferred, hinshi, subhinshi) {
|
||||||
|
yomi.to_string()
|
||||||
|
} else {
|
||||||
|
token.surface().to_string()
|
||||||
|
};
|
||||||
let intermediate = IntermediateToken::new(
|
let intermediate = IntermediateToken::new(
|
||||||
token.surface().to_string(),
|
surface,
|
||||||
yomi.to_string(),
|
yomi.to_string(),
|
||||||
hinshi.to_string(),
|
hinshi.to_string(),
|
||||||
subhinshi.to_string(),
|
subhinshi.to_string(),
|
||||||
@ -85,16 +90,54 @@ impl AkazaTokenizer for VibratoTokenizer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// かな優先モードの処理
|
||||||
|
fn should_be_kana(kana_preferred: bool, hinshi: &str, subhinshi: &str) -> bool {
|
||||||
|
if !kana_preferred {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 貴方 名詞,代名詞,一般,*,*,*,貴方,アナタ,アナタ
|
||||||
|
subhinshi == "代名詞"
|
||||||
|
// 美しい 形容詞,自立,*,*,形容詞・イ段,基本形,美しい,ウツクシイ,ウツ クシイ
|
||||||
|
|| hinshi == "形容詞"
|
||||||
|
// 到底 副詞,一般,*,*,*,*,到底,トウテイ,トーテイ
|
||||||
|
|| hinshi == "副詞"
|
||||||
|
// 及び 接続詞,*,*,*,*,*,及び,オヨビ,オヨビ
|
||||||
|
|| hinshi == "接続詞"
|
||||||
|
// 嗚呼 感動詞,*,*,*,*,*,嗚呼,アア,アー
|
||||||
|
|| hinshi == "感動詞"
|
||||||
|
// 仰ぐ 動詞,自立,*,*,五段・ガ行,基本形,仰ぐ,アオグ,アオグ
|
||||||
|
|| hinshi == "動詞"
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use log::LevelFilter;
|
use log::LevelFilter;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_should_be_kana() -> anyhow::Result<()> {
|
||||||
|
assert!(!should_be_kana(false, "形容詞", "自立"));
|
||||||
|
assert!(should_be_kana(true, "形容詞", "自立"));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_with_kana() -> anyhow::Result<()> {
|
||||||
|
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
||||||
|
let got = runner.tokenize("私の名前は中野です。", true)?;
|
||||||
|
assert_eq!(
|
||||||
|
got,
|
||||||
|
"わたし/わたし の/の 名前/なまえ は/は 中野/なかの です/です 。/。"
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test() -> anyhow::Result<()> {
|
fn test() -> anyhow::Result<()> {
|
||||||
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
||||||
runner.tokenize("私の名前は中野です。")?;
|
runner.tokenize("私の名前は中野です。", false)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,7 +161,7 @@ mod tests {
|
|||||||
|
|
||||||
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
runner.tokenize("書いていたものである")?,
|
runner.tokenize("書いていたものである", false)?,
|
||||||
"書いて/かいて いた/いた もの/もの である/である"
|
"書いて/かいて いた/いた もの/もの である/である"
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -146,7 +189,7 @@ mod tests {
|
|||||||
.try_init();
|
.try_init();
|
||||||
|
|
||||||
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
let runner = VibratoTokenizer::new("work/vibrato/ipadic-mecab-2_7_0/system.dic", None)?;
|
||||||
assert_eq!(runner.tokenize("井伊家")?, "井伊家/いいけ");
|
assert_eq!(runner.tokenize("井伊家", false)?, "井伊家/いいけ");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user