Merge pull request #207 from akaza-im/refactor-kana_kanji
Reorganize the kana-kanji dictionary data structure
@@ -14,7 +14,7 @@ pub fn check(yomi: &str, expected: Option<String>, user_data: bool) -> anyhow::R
             DictConfig {
                 dict_type: "skk".to_string(),
                 encoding: Some("euc-jp".to_string()),
-                path: "skk-dev-dict/SKK-JISYO.L".to_string(),
+                path: "/usr/share/skk/SKK-JISYO.L".to_string(),
             },
             DictConfig {
                 dict_type: "skk".to_string(),
@@ -22,7 +22,7 @@ pub fn check(yomi: &str, expected: Option<String>, user_data: bool) -> anyhow::R
                 path: "data/SKK-JISYO.akaza".to_string(),
             },
         ],
-        single_term: None,
+        single_term: Default::default(),
        romkan: None,
        keymap: None,
        model: None,
@@ -70,7 +70,7 @@ pub fn evaluate(
 
     let akaza = BigramWordViterbiEngineBuilder::new(Config {
         dicts,
-        single_term: None,
+        single_term: Default::default(),
         romkan: None,
         keymap: None,
         model: None,
@@ -14,6 +14,7 @@ use libakaza::dict::skk::read::read_skkdict;
 use libakaza::graph::graph_builder::GraphBuilder;
 use libakaza::graph::graph_resolver::GraphResolver;
 use libakaza::graph::segmenter::Segmenter;
+use libakaza::kana_kanji::hashmap_vec::HashmapVecKanaKanjiDict;
 use libakaza::kana_trie::cedarwood_kana_trie::CedarwoodKanaTrie;
 use libakaza::lm::base::{SystemBigramLM, SystemUnigramLM};
 use libakaza::lm::on_memory::on_memory_system_bigram_lm::OnMemorySystemBigramLM;
@@ -23,7 +24,8 @@ use libakaza::lm::system_unigram_lm::{MarisaSystemUnigramLM, MarisaSystemUnigram
 use libakaza::user_side_data::user_data::UserData;
 
 struct LearningService {
-    graph_builder: GraphBuilder<OnMemorySystemUnigramLM, OnMemorySystemBigramLM>,
+    graph_builder:
+        GraphBuilder<OnMemorySystemUnigramLM, OnMemorySystemBigramLM, HashmapVecKanaKanjiDict>,
     segmenter: Segmenter,
     system_unigram_lm: Rc<OnMemorySystemUnigramLM>,
     system_bigram_lm: Rc<OnMemorySystemBigramLM>,
@@ -81,8 +83,8 @@ impl LearningService {
         ));
 
         let graph_builder = GraphBuilder::new(
-            system_kana_kanji_dict,
-            HashMap::default(),
+            HashmapVecKanaKanjiDict::new(system_kana_kanji_dict),
+            HashmapVecKanaKanjiDict::new(HashMap::default()),
             Arc::new(Mutex::new(UserData::default())),
             system_unigram_lm.clone(),
             system_bigram_lm.clone(),
@@ -35,6 +35,7 @@ use libakaza::consonant::ConsonantSuffixExtractor;
 use libakaza::engine::base::HenkanEngine;
 use libakaza::engine::bigram_word_viterbi_engine::BigramWordViterbiEngine;
 use libakaza::graph::candidate::Candidate;
+use libakaza::kana_kanji::marisa_kana_kanji_dict::MarisaKanaKanjiDict;
 use libakaza::keymap::KeyState;
 use libakaza::lm::system_bigram::MarisaSystemBigramLM;
 use libakaza::lm::system_unigram_lm::MarisaSystemUnigramLM;
@@ -55,7 +56,8 @@ pub struct AkazaContext {
     keymap: KeyMap,
     romkan: RomKanConverter,
     command_map: HashMap<&'static str, IbusAkazaCommand>,
-    engine: BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM>,
+    engine:
+        BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM, MarisaKanaKanjiDict>,
     consonant_suffix_extractor: ConsonantSuffixExtractor,
 
     // ==== 現在の入力状態を保持 ====
@@ -68,7 +70,11 @@ pub struct AkazaContext {
 
 impl AkazaContext {
     pub(crate) fn new(
-        akaza: BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM>,
+        akaza: BigramWordViterbiEngine<
+            MarisaSystemUnigramLM,
+            MarisaSystemBigramLM,
+            MarisaKanaKanjiDict,
+        >,
         config: Config,
     ) -> Result<Self> {
         let input_mode = INPUT_MODE_HIRAGANA;
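After this change, the concrete engine type that ibus-akaza passes around carries three type parameters. A hypothetical type alias, shown only to illustrate the new shape of the type and not part of this diff, could keep such signatures short:

use libakaza::engine::bigram_word_viterbi_engine::BigramWordViterbiEngine;
use libakaza::kana_kanji::marisa_kana_kanji_dict::MarisaKanaKanjiDict;
use libakaza::lm::system_bigram::MarisaSystemBigramLM;
use libakaza::lm::system_unigram_lm::MarisaSystemUnigramLM;

// Hypothetical alias: the concrete engine type used by ibus-akaza after this refactoring.
type MarisaEngine =
    BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM, MarisaKanaKanjiDict>;

fn main() {
    println!("{}", std::any::type_name::<MarisaEngine>());
}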
@@ -1,7 +1,7 @@
 use alloc::ffi::CString;
 use std::collections::HashMap;
 
-use log::{error, info, trace};
+use log::{error, trace};
 
 use ibus_sys::core::{IBusModifierType_IBUS_CONTROL_MASK, IBusModifierType_IBUS_SHIFT_MASK};
 use ibus_sys::glib::guint;
@@ -57,7 +57,7 @@ impl KeyMap {
                 error!("Unknown key symbol: {} {:?}", key, key_pattern);
                 continue;
             }
-            info!("Insert: {} {} {} {:?}", modifier, keyval, key, key_pattern);
+            trace!("Insert: {} {} {} {:?}", modifier, keyval, key, key_pattern);
             for state in &key_pattern.states {
                 mapping.insert(KeyPattern::new(*state, keyval, modifier), command.clone());
             }
@@ -5,16 +5,17 @@ dicts:
   encoding: euc-jp
   dict_type: skk
 */
-use std::fs::File;
-use std::io::BufReader;
 
 use anyhow::Result;
 use log::{info, warn};
 use serde::{Deserialize, Serialize};
+use std::fs::File;
+use std::io::BufReader;
+
 #[derive(Debug, PartialEq, Serialize, Deserialize, Default)]
 pub struct Config {
     pub dicts: Vec<DictConfig>,
-    pub single_term: Option<Vec<DictConfig>>,
+    pub single_term: Vec<DictConfig>,
     /// ローマ字かな変換テーブルの指定
     /// "default", "kana", etc.
     pub romkan: Option<String>,
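With this change, `single_term` is a plain `Vec<DictConfig>` instead of `Option<Vec<DictConfig>>`, which is why the call sites earlier in this diff replace `single_term: None` with `single_term: Default::default()` (an empty vector). A minimal sketch of building the new `Config`, assuming `libakaza::config` is the crate's public path for this module; the dictionary shown is only an example:

use libakaza::config::{Config, DictConfig};

fn main() {
    // `..Default::default()` leaves single_term as an empty Vec and
    // romkan/keymap/model unset, relying on the Default derive on Config.
    let config = Config {
        dicts: vec![DictConfig {
            path: "/usr/share/skk/SKK-JISYO.L".to_string(),
            encoding: Some("euc-jp".to_string()),
            dict_type: "skk".to_string(),
        }],
        ..Default::default()
    };
    assert!(config.single_term.is_empty());
}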
@@ -57,7 +58,7 @@ impl Config {
     }
 }
 
-#[derive(Debug, PartialEq, Serialize, Deserialize, Default)]
+#[derive(Debug, PartialEq, Serialize, Deserialize, Default, Clone)]
 pub struct DictConfig {
     pub path: String,
     /// Default: UTF-8
@@ -78,7 +79,7 @@ mod tests {
             DictConfig {
                 path: "/usr/share/skk/SKK-JISYO.L".to_string(),
                 encoding: Some("euc-jp".to_string()),
-                dict_type: "skk".to_string()
+                dict_type: "skk".to_string(),
             }
         );
         Ok(())
@@ -1,15 +1,63 @@
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
+use std::time::SystemTime;
 
-use anyhow::bail;
 use anyhow::Result;
+use anyhow::{bail, Context};
 use encoding_rs::{EUC_JP, UTF_8};
 use log::{error, info};
 
 use crate::config::DictConfig;
 use crate::dict::merge_dict::merge_dict;
 use crate::dict::skk::read::read_skkdict;
+use crate::kana_kanji::marisa_kana_kanji_dict::MarisaKanaKanjiDict;
 
+fn try_get_mtime(path: &str) -> Result<u64> {
+    let file = File::open(path)?;
+    let metadata = file.metadata()?;
+    let mtime = metadata.modified()?;
+    let t = mtime.duration_since(SystemTime::UNIX_EPOCH)?;
+    Ok(t.as_secs())
+}
+
+pub fn load_dicts_ex(
+    dict_configs: &Vec<DictConfig>,
+    cache_name: &str,
+) -> Result<MarisaKanaKanjiDict> {
+    // さて、ここで、全部の依存先ファイルの mtime の max とキャッシュファイルの mtime の max を比較する
+    // 更新が必要だったら、更新する。
+    let max_dict_mtime = dict_configs
+        .iter()
+        .map(|it| try_get_mtime(&it.path).unwrap_or(0_u64))
+        .max()
+        .unwrap_or(0_u64);
+
+    // cache file のパスを得る
+    let base_dirs = xdg::BaseDirectories::with_prefix("akaza")
+        .with_context(|| "xdg directory with 'akaza' prefix")?;
+    base_dirs.create_cache_directory("")?;
+    let cache_path = base_dirs
+        .get_cache_file(cache_name)
+        .to_string_lossy()
+        .to_string();
+    let cache_mtime = try_get_mtime(&cache_path).unwrap_or(0_u64);
+
+    if cache_mtime >= max_dict_mtime {
+        info!("Cache is fresh! {:?} => {}", dict_configs, cache_path);
+        match MarisaKanaKanjiDict::load(cache_path.as_str()) {
+            Ok(dict) => return Ok(dict),
+            Err(err) => {
+                info!("Cannot load {:?}: {:?}", cache_path, err)
+            }
+        }
+    }
+
+    info!("Cache is not fresh! {:?} => {}", dict_configs, cache_path);
+    let dicts = load_dicts(dict_configs)?;
+
+    MarisaKanaKanjiDict::build(dicts, &cache_path)
+}
+
 pub fn load_dicts(dict_configs: &Vec<DictConfig>) -> Result<HashMap<String, Vec<String>>> {
     let mut dicts: Vec<HashMap<String, Vec<String>>> = Vec::new();
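`load_dicts_ex` is the new cache-aware entry point: it takes the newest mtime across the configured dictionary files, compares it with the mtime of a marisa cache file named `cache_name` under the XDG cache directory for `akaza`, reuses the cache when it is at least as new, and otherwise re-reads the SKK dictionaries and rebuilds it. A hedged usage sketch, assuming `libakaza::dict::loader` is the public path and using an illustrative dictionary path:

use anyhow::Result;
use libakaza::config::DictConfig;
use libakaza::dict::loader::load_dicts_ex;
use libakaza::kana_kanji::base::KanaKanjiDict;

fn main() -> Result<()> {
    let dict_configs = vec![DictConfig {
        path: "/usr/share/skk/SKK-JISYO.L".to_string(),
        encoding: Some("euc-jp".to_string()),
        dict_type: "skk".to_string(),
    }];

    // Returns a MarisaKanaKanjiDict backed by the cache file; the cache is
    // rebuilt only when one of the source dictionaries is newer than it.
    let dict = load_dicts_ex(&dict_configs, "kana_kanji_cache.marisa")?;

    // Lookups and the reading list go through the new kana_kanji API.
    println!("candidates: {:?}", dict.get("かんじ"));
    println!("number of readings: {}", dict.yomis().len());
    Ok(())
}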
@@ -1,23 +1,20 @@
 use std::collections::vec_deque::VecDeque;
-use std::collections::HashMap;
 use std::ops::Range;
-use std::path::Path;
 use std::rc::Rc;
 use std::sync::{Arc, Mutex};
 
 use anyhow::Result;
-use encoding_rs::UTF_8;
 
-use crate::config::Config;
-use crate::dict::loader::load_dicts;
-use crate::dict::merge_dict::merge_dict;
-use crate::dict::skk::read::read_skkdict;
+use crate::config::{Config, DictConfig};
+use crate::dict::loader::load_dicts_ex;
 use crate::engine::base::HenkanEngine;
 use crate::graph::candidate::Candidate;
 use crate::graph::graph_builder::GraphBuilder;
 use crate::graph::graph_resolver::GraphResolver;
 use crate::graph::lattice_graph::LatticeGraph;
 use crate::graph::segmenter::Segmenter;
+use crate::kana_kanji::base::KanaKanjiDict;
+use crate::kana_kanji::marisa_kana_kanji_dict::MarisaKanaKanjiDict;
 use crate::kana_trie::cedarwood_kana_trie::CedarwoodKanaTrie;
 use crate::lm::base::{SystemBigramLM, SystemUnigramLM};
 use crate::lm::system_bigram::MarisaSystemBigramLM;
@@ -28,15 +25,17 @@ use crate::user_side_data::user_data::UserData;
 
 /// バイグラムのビタビベースかな漢字変換エンジンです。
 /// 単語バイグラムを採用しています。
-pub struct BigramWordViterbiEngine<U: SystemUnigramLM, B: SystemBigramLM> {
-    graph_builder: GraphBuilder<U, B>,
+pub struct BigramWordViterbiEngine<U: SystemUnigramLM, B: SystemBigramLM, KD: KanaKanjiDict> {
+    graph_builder: GraphBuilder<U, B, KD>,
     pub segmenter: Segmenter,
     pub graph_resolver: GraphResolver,
     romkan_converter: RomKanConverter,
     pub user_data: Arc<Mutex<UserData>>,
 }
 
-impl<U: SystemUnigramLM, B: SystemBigramLM> HenkanEngine for BigramWordViterbiEngine<U, B> {
+impl<U: SystemUnigramLM, B: SystemBigramLM, KD: KanaKanjiDict> HenkanEngine
+    for BigramWordViterbiEngine<U, B, KD>
+{
     fn learn(&mut self, candidates: &[Candidate]) {
         self.user_data.lock().unwrap().record_entries(candidates);
     }
@@ -62,7 +61,7 @@ impl<U: SystemUnigramLM, B: SystemBigramLM> HenkanEngine for BigramWordViterbiEn
     }
 }
 
-impl<U: SystemUnigramLM, B: SystemBigramLM> BigramWordViterbiEngine<U, B> {
+impl<U: SystemUnigramLM, B: SystemBigramLM, KD: KanaKanjiDict> BigramWordViterbiEngine<U, B, KD> {
     pub fn resolve(&self, lattice: &LatticeGraph<U, B>) -> Result<Vec<VecDeque<Candidate>>> {
         self.graph_resolver.resolve(lattice)
     }
@@ -112,7 +111,9 @@ impl BigramWordViterbiEngineBuilder {
 
     pub fn build(
         &self,
-    ) -> Result<BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM>> {
+    ) -> Result<
+        BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM, MarisaKanaKanjiDict>,
+    > {
         let model_name = self
             .config
             .model
@@ -125,10 +126,7 @@ impl BigramWordViterbiEngineBuilder {
         let system_bigram_lm = MarisaSystemBigramLM::load(
             Self::try_load(&format!("{}/bigram.model", model_name))?.as_str(),
         )?;
-        let system_dict = read_skkdict(
-            Path::new(Self::try_load(&format!("{}/SKK-JISYO.akaza", model_name))?.as_str()),
-            UTF_8,
-        )?;
+        let system_dict = Self::try_load(&format!("{}/SKK-JISYO.akaza", model_name))?;
 
         let user_data = if let Some(d) = &self.user_data {
             d.clone()
@@ -136,22 +134,26 @@ impl BigramWordViterbiEngineBuilder {
             Arc::new(Mutex::new(UserData::default()))
         };
 
-        let dict = load_dicts(&self.config.dicts)?;
-        let dict = merge_dict(vec![system_dict, dict]);
+        let dict = {
+            let mut dicts = self.config.dicts.to_vec();
+            dicts.push(DictConfig {
+                path: system_dict,
+                dict_type: "skk".to_string(),
+                encoding: None,
+            });
 
-        let single_term = if let Some(st) = &&self.config.single_term {
-            load_dicts(st)?
-        } else {
-            HashMap::new()
+            load_dicts_ex(&dicts, "kana_kanji_cache.marisa")?
         };
 
+        let single_term = load_dicts_ex(&self.config.single_term, "single_term_cache.marisa")?;
+
         // 辞書を元に、トライを作成していく。
         let mut kana_trie = CedarwoodKanaTrie::default();
-        for yomi in dict.keys() {
+        for yomi in dict.yomis() {
             assert!(!yomi.is_empty());
             kana_trie.update(yomi.as_str());
         }
-        for yomi in single_term.keys() {
+        for yomi in single_term.yomis() {
             assert!(!yomi.is_empty());
             kana_trie.update(yomi.as_str());
         }
@@ -161,7 +163,11 @@ impl BigramWordViterbiEngineBuilder {
             user_data.lock().unwrap().kana_trie.clone(),
         ]);
 
-        let graph_builder = GraphBuilder::new_with_default_score(
+        let graph_builder: GraphBuilder<
+            MarisaSystemUnigramLM,
+            MarisaSystemBigramLM,
+            MarisaKanaKanjiDict,
+        > = GraphBuilder::new_with_default_score(
             dict,
             single_term,
             user_data.clone(),
@@ -1,5 +1,5 @@
 use std::collections::btree_map::BTreeMap;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::rc::Rc;
 use std::sync::{Arc, Mutex};
 
@@ -9,25 +9,26 @@ use log::trace;
 use crate::graph::lattice_graph::LatticeGraph;
 use crate::graph::segmenter::SegmentationResult;
 use crate::graph::word_node::WordNode;
+use crate::kana_kanji::base::KanaKanjiDict;
 use crate::lm::base::{SystemBigramLM, SystemUnigramLM};
 use crate::user_side_data::user_data::UserData;
 
-pub struct GraphBuilder<U: SystemUnigramLM, B: SystemBigramLM> {
-    system_kana_kanji_dict: HashMap<String, Vec<String>>,
-    system_single_term_dict: HashMap<String, Vec<String>>,
+pub struct GraphBuilder<U: SystemUnigramLM, B: SystemBigramLM, KD: KanaKanjiDict> {
+    system_kana_kanji_dict: KD,
+    system_single_term_dict: KD,
     user_data: Arc<Mutex<UserData>>,
     system_unigram_lm: Rc<U>,
     system_bigram_lm: Rc<B>,
 }
 
-impl<U: SystemUnigramLM, B: SystemBigramLM> GraphBuilder<U, B> {
+impl<U: SystemUnigramLM, B: SystemBigramLM, KD: KanaKanjiDict> GraphBuilder<U, B, KD> {
     pub fn new(
-        system_kana_kanji_dict: HashMap<String, Vec<String>>,
-        system_single_term_dict: HashMap<String, Vec<String>>,
+        system_kana_kanji_dict: KD,
+        system_single_term_dict: KD,
         user_data: Arc<Mutex<UserData>>,
         system_unigram_lm: Rc<U>,
        system_bigram_lm: Rc<B>,
-    ) -> GraphBuilder<U, B> {
+    ) -> GraphBuilder<U, B, KD> {
         GraphBuilder {
             system_kana_kanji_dict,
             system_single_term_dict,
@@ -38,12 +39,12 @@ impl<U: SystemUnigramLM, B: SystemBigramLM> GraphBuilder<U, B> {
     }
 
     pub fn new_with_default_score(
-        system_kana_kanji_dict: HashMap<String, Vec<String>>,
-        system_single_term_dict: HashMap<String, Vec<String>>,
+        system_kana_kanji_dict: KD,
+        system_single_term_dict: KD,
         user_data: Arc<Mutex<UserData>>,
         system_unigram_lm: Rc<U>,
         system_bigram_lm: Rc<B>,
-    ) -> GraphBuilder<U, B> {
+    ) -> GraphBuilder<U, B, KD> {
         Self::new(
             system_kana_kanji_dict,
             system_single_term_dict,
@@ -73,7 +74,7 @@ impl<U: SystemUnigramLM, B: SystemBigramLM> GraphBuilder<U, B> {
         for kanji in kanjis {
             let node = WordNode::new(
                 (end_pos - segmented_yomi.len()) as i32,
-                kanji,
+                &kanji,
                 segmented_yomi,
                 self.system_unigram_lm
                     .find((kanji.to_string() + "/" + segmented_yomi).as_str()),
@@ -106,7 +107,7 @@ impl<U: SystemUnigramLM, B: SystemBigramLM> GraphBuilder<U, B> {
         for surface in surfaces {
             let node = WordNode::new(
                 (end_pos - segmented_yomi.len()) as i32,
-                surface,
+                &surface,
                 segmented_yomi,
                 self.system_unigram_lm
                     .find((surface.to_string() + "/" + segmented_yomi).as_str()),
@@ -129,16 +130,21 @@ impl<U: SystemUnigramLM, B: SystemBigramLM> GraphBuilder<U, B> {
 
 #[cfg(test)]
 mod tests {
+    use crate::kana_kanji::hashmap_vec::HashmapVecKanaKanjiDict;
     use crate::lm::system_bigram::MarisaSystemBigramLMBuilder;
     use crate::lm::system_unigram_lm::MarisaSystemUnigramLMBuilder;
+    use std::collections::HashMap;
 
     use super::*;
 
     #[test]
     fn test_single_term() -> anyhow::Result<()> {
         let graph_builder = GraphBuilder::new_with_default_score(
-            HashMap::new(),
-            HashMap::from([("すし".to_string(), vec!["🍣".to_string()])]),
+            HashmapVecKanaKanjiDict::new(HashMap::new()),
+            HashmapVecKanaKanjiDict::new(HashMap::from([(
+                "すし".to_string(),
+                vec!["🍣".to_string()],
+            )])),
             Arc::new(Mutex::new(UserData::default())),
             Rc::new(
                 MarisaSystemUnigramLMBuilder::default()
@@ -170,8 +176,8 @@ mod tests {
     #[test]
     fn test_default_terms() -> anyhow::Result<()> {
         let graph_builder = GraphBuilder::new_with_default_score(
-            HashMap::new(),
-            HashMap::new(),
+            HashmapVecKanaKanjiDict::new(HashMap::new()),
+            HashmapVecKanaKanjiDict::new(HashMap::new()),
             Arc::new(Mutex::new(UserData::default())),
             Rc::new(
                 MarisaSystemUnigramLMBuilder::default()
@@ -200,8 +206,11 @@ mod tests {
     #[test]
     fn test_default_terms_duplicated() -> anyhow::Result<()> {
         let graph_builder = GraphBuilder::new_with_default_score(
-            HashMap::from([("す".to_string(), vec!["す".to_string(), "ス".to_string()])]),
-            HashMap::new(),
+            HashmapVecKanaKanjiDict::new(HashMap::from([(
+                "す".to_string(),
+                vec!["す".to_string(), "ス".to_string()],
+            )])),
+            HashmapVecKanaKanjiDict::new(HashMap::new()),
             Arc::new(Mutex::new(UserData::default())),
             Rc::new(
                 MarisaSystemUnigramLMBuilder::default()
@@ -127,6 +127,7 @@ mod tests {
 
     use crate::graph::graph_builder::GraphBuilder;
     use crate::graph::segmenter::{SegmentationResult, Segmenter};
+    use crate::kana_kanji::hashmap_vec::HashmapVecKanaKanjiDict;
     use crate::kana_trie::cedarwood_kana_trie::CedarwoodKanaTrie;
     use crate::lm::system_bigram::MarisaSystemBigramLMBuilder;
     use crate::lm::system_unigram_lm::MarisaSystemUnigramLMBuilder;
@@ -165,8 +166,8 @@ mod tests {
             .build()?;
         let user_data = UserData::default();
         let graph_builder = GraphBuilder::new_with_default_score(
-            HashMap::new(),
-            Default::default(),
+            HashmapVecKanaKanjiDict::new(HashMap::new()),
+            HashmapVecKanaKanjiDict::new(Default::default()),
             Arc::new(Mutex::new(user_data)),
             Rc::new(system_unigram_lm),
             Rc::new(system_bigram_lm),
@@ -219,8 +220,8 @@ mod tests {
         // 私/わたし のスコアをガッと上げる。
         user_data.record_entries(&[Candidate::new("わたし", "私", 0_f32)]);
         let graph_builder = GraphBuilder::new_with_default_score(
-            dict,
-            HashMap::new(),
+            HashmapVecKanaKanjiDict::new(dict),
+            HashmapVecKanaKanjiDict::new(HashMap::new()),
             Arc::new(Mutex::new(user_data)),
             Rc::new(system_unigram_lm),
             Rc::new(system_bigram_lm),
libakaza/src/kana_kanji/base.rs (new file, 3 lines)
@@ -0,0 +1,3 @@
+pub trait KanaKanjiDict {
+    fn get(&self, kana: &str) -> Option<Vec<String>>;
+}
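The new `KanaKanjiDict` trait is the seam that lets `GraphBuilder` and `BigramWordViterbiEngine` work against either the in-memory HashMap backend or the marisa-trie cache. It has a single lookup method, so a backend only has to map a reading to its surface candidates. A purely illustrative implementation sketch (the `EchoDict` type below is not part of this change):

use libakaza::kana_kanji::base::KanaKanjiDict;

// Hypothetical backend: every reading converts only to itself.
struct EchoDict;

impl KanaKanjiDict for EchoDict {
    fn get(&self, kana: &str) -> Option<Vec<String>> {
        Some(vec![kana.to_string()])
    }
}

fn main() {
    let dict = EchoDict;
    assert_eq!(dict.get("かな"), Some(vec!["かな".to_string()]));
}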
libakaza/src/kana_kanji/hashmap_vec.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
+use std::collections::HashMap;
+
+use crate::kana_kanji::base::KanaKanjiDict;
+
+#[derive(Default)]
+pub struct HashmapVecKanaKanjiDict {
+    map: HashMap<String, Vec<String>>,
+}
+
+impl HashmapVecKanaKanjiDict {
+    pub fn new(map: HashMap<String, Vec<String>>) -> Self {
+        HashmapVecKanaKanjiDict { map }
+    }
+}
+
+impl KanaKanjiDict for HashmapVecKanaKanjiDict {
+    fn get(&self, kana: &str) -> Option<Vec<String>> {
+        self.map.get(kana).cloned()
+    }
+}
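`HashmapVecKanaKanjiDict` simply wraps the `HashMap<String, Vec<String>>` that the SKK readers already produce, which is why the learning service and the tests in this diff only have to wrap their existing maps in `HashmapVecKanaKanjiDict::new(...)`. A short usage sketch, assuming the module paths introduced here are public:

use std::collections::HashMap;

use libakaza::kana_kanji::base::KanaKanjiDict;
use libakaza::kana_kanji::hashmap_vec::HashmapVecKanaKanjiDict;

fn main() {
    let dict = HashmapVecKanaKanjiDict::new(HashMap::from([(
        "すし".to_string(),
        vec!["寿司".to_string(), "鮨".to_string()],
    )]));

    // Lookups go through the KanaKanjiDict trait; unknown readings yield None.
    assert_eq!(
        dict.get("すし"),
        Some(vec!["寿司".to_string(), "鮨".to_string()])
    );
    assert_eq!(dict.get("そば"), None);
}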
libakaza/src/kana_kanji/marisa_kana_kanji_dict.rs (new file, 94 lines)
@@ -0,0 +1,94 @@
+use std::collections::HashMap;
+
+use log::trace;
+
+use marisa_sys::{Keyset, Marisa};
+
+use crate::kana_kanji::base::KanaKanjiDict;
+
+#[derive(Default)]
+pub struct MarisaKanaKanjiDict {
+    marisa: Marisa,
+}
+
+impl MarisaKanaKanjiDict {
+    pub(crate) fn build(
+        dicts: HashMap<String, Vec<String>>,
+        cache_path: &str,
+    ) -> anyhow::Result<MarisaKanaKanjiDict> {
+        let mut keyset = Keyset::default();
+        for (kana, surfaces) in dicts {
+            keyset.push_back(
+                [
+                    kana.as_bytes(),
+                    b"\t", // seperator
+                    surfaces.join("/").as_bytes(),
+                ]
+                .concat()
+                .as_slice(),
+            );
+        }
+
+        let mut marisa = Marisa::default();
+        marisa.build(&keyset);
+        marisa.save(cache_path)?;
+        Ok(MarisaKanaKanjiDict { marisa })
+    }
+
+    pub fn load(file_name: &str) -> anyhow::Result<MarisaKanaKanjiDict> {
+        let mut marisa = Marisa::default();
+        marisa.load(file_name)?;
+        Ok(MarisaKanaKanjiDict { marisa })
+    }
+
+    pub fn yomis(&self) -> Vec<String> {
+        let mut yomis: Vec<String> = Vec::new();
+
+        self.marisa.predictive_search("".as_bytes(), |word, _| {
+            let idx = word.iter().position(|f| *f == b'\t').unwrap();
+            yomis.push(String::from_utf8_lossy(&word[0..idx]).to_string());
+            true
+        });
+
+        yomis
+    }
+}
+
+impl KanaKanjiDict for MarisaKanaKanjiDict {
+    fn get(&self, kana: &str) -> Option<Vec<String>> {
+        let mut surfaces: Vec<String> = Vec::new();
+        let query = [kana.as_bytes(), b"\t".as_slice()].concat();
+        self.marisa.predictive_search(query.as_slice(), |word, _| {
+            let idx = word.iter().position(|f| *f == b'\t').unwrap();
+            let s = String::from_utf8_lossy(&word[idx + 1..word.len()]).to_string();
+            for s in s.split('/').collect::<Vec<_>>() {
+                surfaces.push(s.to_string());
+            }
+            false
+        });
+        trace!("Got result: {:?}, {:?}", kana, surfaces);
+        Some(surfaces)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use tempfile::NamedTempFile;
+
+    use super::*;
+
+    #[test]
+    fn write_read() -> anyhow::Result<()> {
+        let tmpfile = NamedTempFile::new().unwrap();
+        let path = tmpfile.path().to_str().unwrap().to_string();
+
+        let dict = MarisaKanaKanjiDict::build(
+            HashMap::from([("たなか".to_string(), vec!["田中".to_string()])]),
+            path.as_str(),
+        )?;
+
+        assert_eq!(dict.get("たなか"), Some(vec!["田中".to_string()]));
+
+        Ok(())
+    }
+}
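`MarisaKanaKanjiDict` flattens each entry into a single trie key of the form `<kana>\t<surface1>/<surface2>/...`: `yomis()` scans every key and keeps the part before the tab, while `get()` runs a predictive search on `<kana>\t` and splits whatever follows on `/` (so the format assumes surfaces contain neither a tab nor a slash). A std-only sketch of that key layout, shown for illustration rather than as the crate's API:

fn main() {
    let kana = "たなか";
    let surfaces = vec!["田中".to_string(), "棚か".to_string()];

    // Encoding: one marisa key per reading.
    let key = format!("{}\t{}", kana, surfaces.join("/"));
    assert_eq!(key, "たなか\t田中/棚か");

    // Decoding, as get() does after the predictive search on "たなか\t".
    let (stored_kana, tail) = key.split_once('\t').unwrap();
    let decoded: Vec<String> = tail.split('/').map(|s| s.to_string()).collect();
    assert_eq!(stored_kana, kana);
    assert_eq!(decoded, surfaces);
}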
libakaza/src/kana_kanji/mod.rs (new file, 3 lines)
@@ -0,0 +1,3 @@
+pub mod base;
+pub mod hashmap_vec;
+pub mod marisa_kana_kanji_dict;
@@ -10,6 +10,7 @@ pub mod dict;
 pub mod engine;
 pub mod extend_clause;
 pub mod graph;
+pub mod kana_kanji;
 pub mod kana_trie;
 pub mod keymap;
 pub mod lm;