mirror of
https://github.com/mii443/akaza.git
synced 2025-08-22 14:55:31 +00:00
snapshot
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,3 +3,4 @@ __pycache__
|
||||
/comb.xml
|
||||
/hello.*
|
||||
/test_graph2.py
|
||||
/akaza.xml
|
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +1,6 @@
|
||||
[submodule "model/skk-dev-dict"]
|
||||
path = model/skk-dev-dict
|
||||
url = https://github.com/skk-dev/dict.git
|
||||
[submodule "akaza-data/skk-dev-dict"]
|
||||
path = akaza-data/skk-dev-dict
|
||||
url = https://github.com/skk-dev/dict.git
|
||||
|
11
Changes.md
11
Changes.md
@ -1,3 +1,12 @@
|
||||
# 2020-09-14(Mon)
|
||||
|
||||
* comb を akaza に改名した。
|
||||
* ibus 関連部分とそれ以外を分離する。以下のようなモジュール構成を目指す。
|
||||
* ibus-akaza: ibus 連動部分。
|
||||
* akaza-core: 変換コアエンジン。ibus 関連部分と独立させることにより、fcitx との連動を可能にすることを目指す
|
||||
* skkdictutils: SKK 辞書関連ユーティリティライブラリ。単独利用可能なようにパッケージングし、単独レポジトリに独立させる予定。
|
||||
* akaza-data : システム辞書/システム言語モデル
|
||||
|
||||
# 2020-09-13(Sun)
|
||||
|
||||
* Rust で書き直そうかなぁ。。
|
||||
@ -14,7 +23,7 @@
|
||||
* kytea から得られた結果をもとに、平仮名を連結して ngram を作成したが、この結果は惨憺たるものであった。
|
||||
* DONE: 文節を伸ばす機能が死んでいる。
|
||||
* ユーザー辞書を設定できるようにしたい。
|
||||
* ~/.config/ibus-comb/user-dict.json のなかに設定をいれる。
|
||||
* ~/.config/ibus-akaza/user-dict.json のなかに設定をいれる。
|
||||
* `path/to/dict.txt;format=skk;charset=euc-jp` みたいなフォーマットでいいかなぁ。。JSONでもいいかな。。
|
||||
* 思ったより、簡単に実装できそう。
|
||||
|
||||
|
107
Makefile
107
Makefile
@ -8,79 +8,68 @@ DESTDIR ?=
|
||||
|
||||
PYTHON ?= /usr/bin/python3
|
||||
|
||||
all: comb.xml comb/config.py comb model/jawiki.1gram
|
||||
all: akaza.xml akaza/config.py akaza akaza-data/system_language_model.trie akaza-data/system_dict.trie
|
||||
|
||||
check:
|
||||
python -m py_compile ibus.py
|
||||
python -m py_compile comb/combromkan.py
|
||||
python -m py_compile comb/engine.py
|
||||
python -m py_compile comb/skkdict.py
|
||||
python -m py_compile akaza/akazaromkan.py
|
||||
python -m py_compile akaza/engine.py
|
||||
python -m py_compile akaza/skkdict.py
|
||||
pytest
|
||||
|
||||
comb.xml: comb.xml.in
|
||||
sed -e "s:@PYTHON@:$(PYTHON):g;" \
|
||||
-e "s:@DATADIR@:$(DATADIR):g" $< > $@
|
||||
|
||||
comb/config.py: comb/config.py.in
|
||||
sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \
|
||||
-e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/model:g" \
|
||||
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
|
||||
$< > $@
|
||||
|
||||
model/system_language_model.trie: model/bin/create-system_language_model-from-json.py
|
||||
akaza-data/system_language_model.trie: akaza-data/bin/create-system_language_model-from-json.py
|
||||
make -C model system_language_model.trie
|
||||
|
||||
model/system_dict.trie:
|
||||
akaza-data/system_dict.trie:
|
||||
make -C model system_dict.trie
|
||||
|
||||
install-dict: model/system_dict.trie
|
||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
|
||||
install -p -m 0644 model/system_dict.trie $(DESTDIR)$(DATADIR)/ibus-comb/dictionary/
|
||||
install-data: model/system_dict.trie
|
||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary
|
||||
install -p -m 0644 model/system_dict.trie $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary/
|
||||
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-akaza/model/
|
||||
|
||||
install: all comb/config.py model/jawiki.1gram install-dict
|
||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
|
||||
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/
|
||||
install: all akaza/config.py model/system_dict.trie install-data
|
||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/akaza $(DESTDIR)$(SYSCONFDIR)/xdg/akaza $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-akaza/model $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary
|
||||
|
||||
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
|
||||
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/graph.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/node.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/config.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/skkdict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/combromkan.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-comb
|
||||
install -m 0644 comb/engine.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/ui.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/user_language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/system_language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/system_dict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb/user_dict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
|
||||
install -m 0644 comb.xml $(DESTDIR)$(DATADIR)/ibus/component
|
||||
install -m 0644 akaza.svg $(DESTDIR)$(DATADIR)/ibus-akaza
|
||||
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-akaza
|
||||
install -m 0644 akaza.xml $(DESTDIR)$(DATADIR)/ibus/component
|
||||
|
||||
install -m 0644 akaza/__init__.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/graph.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/node.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/config.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/skkdict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/akazaromkan.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/engine.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/ui.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/user_language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/system_language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/system_dict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
install -m 0644 akaza/user_dict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
|
||||
|
||||
uninstall:
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb.svg
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/config.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/engine.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/skkdict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/combromkan.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/graph.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/language_model.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/node.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/ui.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_language_model.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_language_model.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie
|
||||
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
|
||||
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
|
||||
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus/component/akaza.xml
|
||||
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/engine.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/skkdict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/akazaromkan.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/graph.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/language_model.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/node.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ui.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/user_language_model.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/system_language_model.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/user_dict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/system_dict.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/model/system_language_model.trie
|
||||
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/akaza
|
||||
|
||||
clean:
|
||||
rm -f comb.xml
|
||||
rm -f comb/config.py
|
||||
rm -f akaza.xml
|
||||
rm -f akaza/config.py
|
||||
|
||||
.PHONY: all check install uninstall
|
||||
|
||||
|
2
Note.md
2
Note.md
@ -12,7 +12,7 @@
|
||||
* LOUDS はサイズが小さくなるが、動的な追加削除はできない。
|
||||
* 検索速度は Double Array のほうが速い
|
||||
|
||||
* comb には何に trie を作成しているのか?
|
||||
* akaza には何に trie を作成しているのか?
|
||||
* ユーザー辞書とシステム辞書とシステム言語モデルに利用している。
|
||||
* ただし、システム言語モデルは、純粋なキーからの検索しかしていない。
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
# ibus-comb
|
||||
# ibus-akaza
|
||||
|
||||
Yet another kana-kanji-converter on IBus, written in Python.
|
||||
|
||||
|
4
TODO.md
4
TODO.md
@ -7,7 +7,6 @@
|
||||
## Priority mid
|
||||
|
||||
- support 3gram(必要?)
|
||||
- 2gram のデータがデカすぎる。libkkc と同等の圧縮をすべき
|
||||
- ユーザー言語モデル学習機 from text file or web.
|
||||
- クローラーをかく?
|
||||
- ユーザー辞書機能を実装する
|
||||
@ -38,6 +37,7 @@
|
||||
- 共起的なスコアをいれたい?
|
||||
- 青空文庫をコーパスとして使う?
|
||||
- 古くさすぎるかも
|
||||
- 言語モデルを小さくできないか?
|
||||
|
||||
# DONE
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
- カタカナ語辞書の作成
|
||||
- 連文節変換用の UI を実装する
|
||||
- Function key とかのショートカットで、全部カタカナにすることができるように。
|
||||
- ibus-comb がバグってた時に便利。
|
||||
- ibus-akaza がバグってた時に便利。
|
||||
- 末尾のアルファベット一文字は、変換しない。
|
||||
- 前向きDP後ろ向きA* で候補を得る
|
||||
- 平仮名語辞書もいるのかもしれない。
|
||||
|
20
akaza-core/README.md
Normal file
20
akaza-core/README.md
Normal file
@ -0,0 +1,20 @@
|
||||
# Akaza
|
||||
|
||||
## What's this?
|
||||
|
||||
Yet another kana-kanji conversion system written in Python 3.
|
||||
|
||||
## How do I use it?
|
||||
|
||||
### Use as a library
|
||||
|
||||
system_language_model = SystemLanguageModel.create('path/to/system_language_model.trie')
|
||||
system_dict = SystemDict('path/to/system_dict.trie')
|
||||
akaza = Akaza(
|
||||
system_language_model = system_language_model,
|
||||
system_dict = system_dict,
|
||||
user_language_model = user_language_model,
|
||||
user_dict = user_dict,
|
||||
)
|
||||
print(akaza.convert('watasinonamaehanakanodesu.'))
|
||||
# → 私の名前は中野です。
|
9
akaza-core/akaza.egg-info/PKG-INFO
Normal file
9
akaza-core/akaza.egg-info/PKG-INFO
Normal file
@ -0,0 +1,9 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: akaza
|
||||
Version: 0.0.1
|
||||
Summary: UNKNOWN
|
||||
Home-page: UNKNOWN
|
||||
License: UNKNOWN
|
||||
Description: UNKNOWN
|
||||
Platform: UNKNOWN
|
||||
Provides-Extra: develop
|
6
akaza-core/akaza.egg-info/SOURCES.txt
Normal file
6
akaza-core/akaza.egg-info/SOURCES.txt
Normal file
@ -0,0 +1,6 @@
|
||||
setup.py
|
||||
akaza.egg-info/PKG-INFO
|
||||
akaza.egg-info/SOURCES.txt
|
||||
akaza.egg-info/dependency_links.txt
|
||||
akaza.egg-info/requires.txt
|
||||
akaza.egg-info/top_level.txt
|
1
akaza-core/akaza.egg-info/dependency_links.txt
Normal file
1
akaza-core/akaza.egg-info/dependency_links.txt
Normal file
@ -0,0 +1 @@
|
||||
|
6
akaza-core/akaza.egg-info/requires.txt
Normal file
6
akaza-core/akaza.egg-info/requires.txt
Normal file
@ -0,0 +1,6 @@
|
||||
marisa-trie==0.7.5
|
||||
jaconv==0.2.4
|
||||
|
||||
[develop]
|
||||
dev-packageA
|
||||
dev-packageB
|
1
akaza-core/akaza.egg-info/top_level.txt
Normal file
1
akaza-core/akaza.egg-info/top_level.txt
Normal file
@ -0,0 +1 @@
|
||||
|
3
akaza-core/akaza/__init__.py
Normal file
3
akaza-core/akaza/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from . import akaza
|
||||
|
||||
Akaza = akaza.Akaza
|
@ -6,26 +6,29 @@ from typing import List, Any, Optional
|
||||
|
||||
import jaconv
|
||||
|
||||
from comb import combromkan
|
||||
from comb.graph import graph_construct, viterbi, lookup
|
||||
from comb.language_model import LanguageModel
|
||||
from comb.node import Node
|
||||
from comb.system_dict import SystemDict
|
||||
from comb.system_language_model import SystemLanguageModel
|
||||
from comb.user_dict import UserDict
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
from akaza import akazaromkan
|
||||
from akaza.graph import graph_construct, viterbi, lookup
|
||||
from akaza.language_model import LanguageModel
|
||||
from akaza.node import Node
|
||||
from akaza.system_dict import SystemDict
|
||||
from akaza.system_language_model import SystemLanguageModel
|
||||
from akaza.user_dict import UserDict
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
|
||||
# 子音だが、N は NN だと「ん」になるので処理しない。
|
||||
TRAILING_CONSONANT_PATTERN = re.compile(r'^(.*?)([qwrtypsdfghjklzxcvbm]+)$')
|
||||
|
||||
|
||||
class Comb:
|
||||
class Akaza:
|
||||
user_dict: Optional[UserDict]
|
||||
logger: Logger
|
||||
dictionaries: List[Any]
|
||||
|
||||
def __init__(self, user_language_model: UserLanguageModel, system_dict: SystemDict,
|
||||
user_dict: Optional[UserDict],
|
||||
def __init__(self,
|
||||
system_language_model: SystemLanguageModel,
|
||||
system_dict: SystemDict,
|
||||
user_language_model: UserLanguageModel,
|
||||
user_dict: Optional[UserDict] = None,
|
||||
logger: Logger = logging.getLogger(__name__)):
|
||||
assert user_language_model
|
||||
self.logger = logger
|
||||
@ -34,8 +37,6 @@ class Comb:
|
||||
self.system_dict = system_dict
|
||||
self.user_dict = user_dict
|
||||
|
||||
system_language_model = SystemLanguageModel.create()
|
||||
|
||||
self.language_model = LanguageModel(system_language_model, user_language_model)
|
||||
|
||||
# 連文節変換するバージョン。
|
||||
@ -52,7 +53,7 @@ class Comb:
|
||||
)
|
||||
]]
|
||||
|
||||
hiragana: str = combromkan.to_hiragana(src)
|
||||
hiragana: str = akazaromkan.to_hiragana(src)
|
||||
|
||||
# 末尾の子音を変換対象外とする。
|
||||
m = TRAILING_CONSONANT_PATTERN.match(hiragana)
|
@ -5,11 +5,11 @@ from typing import Dict, List, Optional
|
||||
|
||||
import jaconv
|
||||
|
||||
from comb.language_model import LanguageModel
|
||||
from comb.node import Node
|
||||
from comb.system_dict import SystemDict
|
||||
from comb.user_dict import UserDict
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
from akaza.language_model import LanguageModel
|
||||
from akaza.node import Node
|
||||
from akaza.system_dict import SystemDict
|
||||
from akaza.user_dict import UserDict
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
|
||||
|
||||
class Graph:
|
@ -4,9 +4,9 @@ import math
|
||||
|
||||
import marisa_trie
|
||||
|
||||
from comb.node import Node
|
||||
from comb.system_language_model import SystemLanguageModel
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
from akaza.node import Node
|
||||
from akaza.system_language_model import SystemLanguageModel
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
|
||||
|
||||
|
@ -1,11 +1,8 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
import marisa_trie
|
||||
from marisa_trie import BytesTrie
|
||||
|
||||
from comb.config import DICTIONARY_DIR
|
||||
|
||||
|
||||
class SystemDict:
|
||||
_trie: BytesTrie
|
||||
@ -18,11 +15,6 @@ class SystemDict:
|
||||
trie.mmap(path)
|
||||
self._trie = trie
|
||||
|
||||
@staticmethod
|
||||
def create():
|
||||
path = os.path.join(DICTIONARY_DIR, 'system_dict.trie')
|
||||
return SystemDict(path)
|
||||
|
||||
def prefixes(self, key):
|
||||
return self._trie.prefixes(key)
|
||||
|
29
akaza-core/akaza/system_language_model.py
Normal file
29
akaza-core/akaza/system_language_model.py
Normal file
@ -0,0 +1,29 @@
|
||||
import math
|
||||
|
||||
import marisa_trie
|
||||
|
||||
from akaza.node import Node
|
||||
|
||||
DEFAULT_SCORE = [(math.log10(0.00000000001),)]
|
||||
|
||||
|
||||
class SystemLanguageModel:
|
||||
def __init__(self, score: marisa_trie.RecordTrie, default_score=None):
|
||||
self.default_score = DEFAULT_SCORE if default_score is None else default_score
|
||||
self.score = score
|
||||
|
||||
@staticmethod
|
||||
def create(path: str, default_score=None):
|
||||
score = marisa_trie.RecordTrie('@f')
|
||||
score.mmap(path)
|
||||
|
||||
return SystemLanguageModel(
|
||||
score=score,
|
||||
default_score=DEFAULT_SCORE if default_score is None else default_score
|
||||
)
|
||||
|
||||
def get_unigram_cost(self, key: str) -> float:
|
||||
return self.score.get(key, self.default_score)[0][0]
|
||||
|
||||
def get_bigram_cost(self, key1: str, key2: str) -> float:
|
||||
return self.score.get(key1 + "\t" + key2, self.default_score)[0][0]
|
@ -3,7 +3,7 @@ from typing import List, Dict
|
||||
|
||||
import marisa_trie
|
||||
|
||||
from comb.skkdict import parse_skkdict, merge_skkdict, ari2nasi
|
||||
from skkdictutils import parse_skkdict, merge_skkdict, ari2nasi
|
||||
|
||||
|
||||
class UserDict:
|
||||
@ -41,7 +41,6 @@ def load_user_dict_from_json_config(path: str) -> UserDict:
|
||||
t = []
|
||||
for k, v in merged.items():
|
||||
t.append((k, '/'.join(v).encode('utf-8')))
|
||||
print(t)
|
||||
trie = marisa_trie.BytesTrie(t)
|
||||
|
||||
return UserDict(trie)
|
@ -5,7 +5,7 @@ from typing import List, Dict, Optional
|
||||
|
||||
from atomicwrites import atomic_write
|
||||
|
||||
from comb.node import Node
|
||||
from akaza.node import Node
|
||||
|
||||
|
||||
# ユーザーの言語モデル。
|
@ -1,2 +1,3 @@
|
||||
marisa-trie==0.7.5
|
||||
jaconv==0.2.4
|
||||
skkdictutils==0.0.2
|
12
akaza-core/setup.py
Normal file
12
akaza-core/setup.py
Normal file
@ -0,0 +1,12 @@
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="akaza",
|
||||
version="0.0.1",
|
||||
install_requires=["marisa-trie==0.7.5", "jaconv==0.2.4"],
|
||||
extras_require={
|
||||
"develop": ["dev-packageA", "dev-packageB"]
|
||||
},
|
||||
entry_points={
|
||||
}
|
||||
)
|
@ -1,6 +1,6 @@
|
||||
import pytest
|
||||
|
||||
from comb.combromkan import to_hiragana
|
||||
from akaza.akazaromkan import to_hiragana
|
||||
|
||||
|
||||
def test_foo():
|
@ -1,16 +1,26 @@
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from comb.engine import Comb
|
||||
from comb.system_dict import SystemDict
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
from akaza import Akaza
|
||||
from akaza.system_dict import SystemDict
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
from akaza.system_language_model import SystemLanguageModel
|
||||
|
||||
tmpfile = NamedTemporaryFile(delete=False)
|
||||
user_language_model = UserLanguageModel(tmpfile.name)
|
||||
system_dict = SystemDict.create()
|
||||
system_dict = SystemDict('../akaza-data/system_dict.trie')
|
||||
|
||||
comb = Comb(user_language_model=user_language_model, system_dict=system_dict, user_dict=None)
|
||||
system_language_model = SystemLanguageModel.create('../akaza-data/system_language_model.trie')
|
||||
|
||||
akaza = Akaza(
|
||||
user_language_model=user_language_model,
|
||||
system_dict=system_dict,
|
||||
user_dict=None,
|
||||
system_language_model=system_language_model
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('src, expected', [
|
||||
@ -25,13 +35,13 @@ comb = Comb(user_language_model=user_language_model, system_dict=system_dict, us
|
||||
('IME', 'IME'),
|
||||
])
|
||||
def test_wnn(src, expected):
|
||||
clauses = comb.convert(src)
|
||||
clauses = akaza.convert(src)
|
||||
got = ''.join([clause[0].word for clause in clauses])
|
||||
assert got == expected
|
||||
|
||||
|
||||
def test_wnn2():
|
||||
clauses = comb.convert("わたし")
|
||||
clauses = akaza.convert("わたし")
|
||||
hiragana_len = len([True for node in clauses[0] if node.word == 'わたし'])
|
||||
for node in clauses[0]:
|
||||
print(node)
|
@ -1,30 +1,22 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from comb.combromkan import to_hiragana
|
||||
import pytest
|
||||
import marisa_trie
|
||||
from comb.system_dict import SystemDict
|
||||
from comb.graph import lookup, graph_construct, viterbi
|
||||
from comb.language_model import LanguageModel
|
||||
import logging
|
||||
from tempfile import TemporaryDirectory
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from comb.system_language_model import SystemLanguageModel
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
from akaza.graph import lookup, graph_construct, viterbi
|
||||
from akaza.language_model import LanguageModel
|
||||
from akaza.system_dict import SystemDict
|
||||
from akaza.system_language_model import SystemLanguageModel
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
|
||||
unigram_score = marisa_trie.RecordTrie('@f')
|
||||
unigram_score.load('model/jawiki.1gram')
|
||||
|
||||
bigram_score = marisa_trie.RecordTrie('@f')
|
||||
bigram_score.load('model/jawiki.2gram')
|
||||
|
||||
system_language_model = SystemLanguageModel(unigram_score, bigram_score)
|
||||
system_language_model = SystemLanguageModel.create('../akaza-data/system_language_model.trie')
|
||||
|
||||
tmpdir = TemporaryDirectory()
|
||||
user_language_model = UserLanguageModel(tmpdir.name)
|
||||
|
||||
language_model = LanguageModel(system_language_model, user_language_model=user_language_model)
|
||||
|
||||
system_dict = SystemDict('model/system_dict.trie')
|
||||
system_dict = SystemDict('../akaza-data/system_dict.trie')
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
@ -1,6 +1,6 @@
|
||||
from comb.system_dict import SystemDict
|
||||
from akaza.system_dict import SystemDict
|
||||
|
||||
system_dict = SystemDict('model/system_dict.trie')
|
||||
system_dict = SystemDict('../akaza-data/system_dict.trie')
|
||||
|
||||
|
||||
def test_system_dict():
|
@ -1,14 +1,7 @@
|
||||
from tempfile import NamedTemporaryFile, TemporaryDirectory
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from comb.node import Node
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
import marisa_trie
|
||||
|
||||
unigram_score = marisa_trie.RecordTrie('@f')
|
||||
unigram_score.load('model/jawiki.1gram')
|
||||
|
||||
bigram_score = marisa_trie.RecordTrie('@f')
|
||||
bigram_score.load('model/jawiki.2gram')
|
||||
from akaza.node import Node
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
|
||||
|
||||
def test_read():
|
@ -1,4 +1,4 @@
|
||||
all: jawiki.1gram system_dict.trie
|
||||
all: system_dict.trie system_language_model.trie
|
||||
|
||||
jawiki-latest-pages-articles.xml.bz2:
|
||||
wget --no-verbose --no-clobber -O jawiki-latest-pages-articles.xml.bz2 https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2
|
||||
@ -32,4 +32,3 @@ system_dict.trie: jawiki.vocab
|
||||
python bin/make-system-dict.py
|
||||
|
||||
.PHONY: all
|
||||
|
14
akaza-data/README.md
Normal file
14
akaza-data/README.md
Normal file
@ -0,0 +1,14 @@
|
||||
# akaza-data
|
||||
|
||||
## What's this?
|
||||
|
||||
System dictionary/language model package for Akaza.
|
||||
|
||||
## PyPI's size limit
|
||||
|
||||
*The default size limit on PyPI is 60MB*
|
||||
|
||||
* [unidic-lite](https://www.dampfkraft.com/code/distributing-large-files-with-pypi.html)
|
||||
|
||||
## See also
|
||||
|
@ -5,7 +5,7 @@ import marisa_trie
|
||||
|
||||
sys.path.append('../')
|
||||
|
||||
from comb.skkdict import parse_skkdict, merge_skkdict, ari2nasi
|
||||
from skkdictutils import parse_skkdict, merge_skkdict, ari2nasi
|
||||
|
||||
# jawiki.vocab から system_dict.trie を作成する。
|
||||
|
@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from Mykytea import Mykytea
|
||||
import re
|
||||
@ -68,6 +69,9 @@ def main():
|
||||
total = len(sys.argv[1:])
|
||||
for ifile in sys.argv[1:]:
|
||||
ofile = ifile.replace('text/', 'dat/')
|
||||
|
||||
pathlib.Path(ofile).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logging.info(f"[{os.getpid()}] {ifile} -> {ofile} ({count}/{total})")
|
||||
with open(ifile, 'r') as rfp, \
|
||||
open(ofile, 'w') as wfp:
|
1
akaza-data/requirements.txt
Normal file
1
akaza-data/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
skkdictutils>=0.0.2
|
@ -1,3 +0,0 @@
|
||||
SYS_CONF_DIR = '@SYSCONFDIR@'
|
||||
MODEL_DIR = '@MODELDIR@'
|
||||
DICTIONARY_DIR = '@DICTIONARYDIR@'
|
@ -1,29 +0,0 @@
|
||||
import math
|
||||
|
||||
import marisa_trie
|
||||
|
||||
from comb.config import MODEL_DIR
|
||||
from comb.node import Node
|
||||
|
||||
DEFAULT_SCORE = [(math.log10(0.00000000001),)]
|
||||
|
||||
|
||||
class SystemLanguageModel:
|
||||
def __init__(self, score: marisa_trie.RecordTrie):
|
||||
self.score = score
|
||||
|
||||
@staticmethod
|
||||
def create():
|
||||
score = marisa_trie.RecordTrie('@f')
|
||||
score.mmap(f"{MODEL_DIR}/system_language_model.trie")
|
||||
|
||||
return SystemLanguageModel(score)
|
||||
|
||||
def get_unigram_cost(self, key: str) -> float:
|
||||
return self.score.get(key, DEFAULT_SCORE)[0][0]
|
||||
|
||||
def get_bigram_cost(self, node1: Node, node2: Node) -> float:
|
||||
key1 = node1.get_key()
|
||||
key2 = node2.get_key()
|
||||
key = key1 + "\t" + key2
|
||||
return self.score.get(key, DEFAULT_SCORE)[0][0]
|
1
ibus-akaza/.gitignore
vendored
Normal file
1
ibus-akaza/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/akaza.xml
|
45
ibus-akaza/Makefile
Normal file
45
ibus-akaza/Makefile
Normal file
@ -0,0 +1,45 @@
|
||||
PREFIX ?= /usr
|
||||
SYSCONFDIR ?= /etc
|
||||
DATADIR ?= $(PREFIX)/share
|
||||
DESTDIR ?=
|
||||
|
||||
PYTHON ?= /usr/bin/python3
|
||||
|
||||
all: akaza.xml ibus_akaza/config.py
|
||||
|
||||
install: ibus_akaza/config.py akaza.xml
|
||||
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza $(DESTDIR)$(SYSCONFDIR)/xdg/akaza $(DESTDIR)$(DATADIR)/ibus/component
|
||||
|
||||
install -m 0644 akaza.svg $(DESTDIR)$(DATADIR)/ibus-akaza
|
||||
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-akaza
|
||||
install -m 0644 ibus_akaza/ui.py $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/
|
||||
install -m 0644 ibus_akaza/config.py $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/
|
||||
install -m 0644 akaza.xml $(DESTDIR)$(DATADIR)/ibus/component
|
||||
|
||||
ibus_akaza/config.py: ibus_akaza/config.py.in
|
||||
sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \
|
||||
-e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/ibus-akaza/model:g" \
|
||||
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-akaza/dictionary:g" \
|
||||
$< > $@
|
||||
|
||||
akaza.xml: akaza.xml.in
|
||||
sed -e "s:@PYTHON@:$(PYTHON):g;" \
|
||||
-e "s:@DATADIR@:$(DATADIR):g" $< > $@
|
||||
|
||||
check:
|
||||
python -m py_compile ibus.py
|
||||
python -m py_compile ibus_akaza/ui.py
|
||||
|
||||
uninstall:
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza.svg
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/ui.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/config.py
|
||||
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/ibus.py
|
||||
rmdir $(DESTDIR)$(DATADIR)/ibus-akaza
|
||||
|
||||
|
||||
clean:
|
||||
rm -f akaza.xml
|
||||
|
||||
.PHONY: all check install uninstall clean
|
||||
|
@ -1,23 +1,23 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!-- filename: comb.xml -->
|
||||
<!-- filename: akaza.xml -->
|
||||
<component>
|
||||
<name>org.freedesktop.IBus.Comb</name>
|
||||
<description>Comb - kana kanji converter</description>
|
||||
<name>org.freedesktop.IBus.Akaza</name>
|
||||
<description>Akaza - kana kanji converter</description>
|
||||
<version>0.0.1</version>
|
||||
<license>GPL</license>
|
||||
<author>Tokuhiro Matsuno <tokuhirom@gmail.com></author>
|
||||
<homepage>https://github.com/tokuhirom/ibus-comb</homepage>
|
||||
<exec>@PYTHON@ @DATADIR@/ibus-comb/ibus.py --ibus</exec>
|
||||
<textdomain>comb</textdomain>
|
||||
<homepage>https://github.com/tokuhirom/ibus-akaza</homepage>
|
||||
<exec>@PYTHON@ @DATADIR@/ibus-akaza/ibus.py --ibus</exec>
|
||||
<textdomain>akaza</textdomain>
|
||||
<engines>
|
||||
<engine>
|
||||
<name>comb</name>
|
||||
<longname>comb</longname>
|
||||
<description>Comb - Kana Kanji Converter</description>
|
||||
<name>akaza</name>
|
||||
<longname>akaza</longname>
|
||||
<description>Akaza - Kana Kanji Converter</description>
|
||||
<language>ja</language>
|
||||
<license>GPL</license>
|
||||
<author>Tokuhiro Matsuno <tokuhirom@gmail.com></author>
|
||||
<icon>@DATADIR@/ibus-comb/comb.svg</icon>
|
||||
<icon>@DATADIR@/ibus-akaza/akaza.svg</icon>
|
||||
<layout>us</layout>
|
||||
<layout_variant></layout_variant>
|
||||
<layout_option></layout_option>
|
Before Width: | Height: | Size: 3.4 KiB After Width: | Height: | Size: 3.4 KiB |
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# ibus-comb: ibus engine for japanese characters
|
||||
# ibus-akaza: ibus engine for japanese characters
|
||||
#
|
||||
# Copyright (c) 2020 Tokuhiro Matsuno <tokuhirom@gmail.com>
|
||||
#
|
||||
@ -33,10 +33,11 @@ import getopt
|
||||
import locale
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG, filename='/tmp/ibus-comb.log', filemode='w')
|
||||
logging.info("Loading ibus-comb")
|
||||
# TODO: remove log file generation
|
||||
logging.basicConfig(level=logging.DEBUG, filename='/tmp/ibus-akaza.log', filemode='w')
|
||||
logging.info("Loading ibus-akaza")
|
||||
|
||||
libpath = os.path.join(os.path.dirname(__file__), "comb")
|
||||
libpath = os.path.join(os.path.dirname(__file__), "akaza")
|
||||
logging.info(f"library path: {libpath}")
|
||||
sys.path.append(libpath)
|
||||
|
||||
@ -56,23 +57,23 @@ class IMApp:
|
||||
|
||||
logging.info("Loading IMApp")
|
||||
|
||||
from comb.ui import CombIBusEngine
|
||||
from akaza.ui import AkazaIBusEngine
|
||||
|
||||
self.mainloop = GLib.MainLoop()
|
||||
self.bus = IBus.Bus()
|
||||
self.bus.connect("disconnected", self.bus_disconnected_cb)
|
||||
self.factory = IBus.Factory.new(self.bus.get_connection())
|
||||
self.factory.add_engine("comb", GObject.type_from_name("CombIBusEngine"))
|
||||
self.factory.add_engine("akaza", GObject.type_from_name("AkazaIBusEngine"))
|
||||
|
||||
if exec_by_ibus:
|
||||
self.bus.request_name("org.freedesktop.IBus.Comb", 0)
|
||||
else:
|
||||
xml_path = os.path.join(__base_dir__, 'comb.xml')
|
||||
xml_path = os.path.join(__base_dir__, 'AkazaIBusEngine.xml')
|
||||
if os.path.exists(xml_path):
|
||||
component = IBus.Component.new_from_file(xml_path)
|
||||
else:
|
||||
xml_path = os.path.join(os.path.dirname(__base_dir__),
|
||||
'ibus', 'component', 'comb.xml')
|
||||
'ibus', 'component', 'akaza.xml')
|
||||
component = IBus.Component.new_from_file(xml_path)
|
||||
self.bus.register_component(component)
|
||||
|
1
ibus-akaza/ibus_akaza/.gitignore
vendored
Normal file
1
ibus-akaza/ibus_akaza/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/config.py
|
5
ibus-akaza/ibus_akaza/config.py.in
Normal file
5
ibus-akaza/ibus_akaza/config.py.in
Normal file
@ -0,0 +1,5 @@
|
||||
import os
|
||||
|
||||
SYS_CONF_DIR = os.environ.get('AKAZA_SYSCONF_DIR', '@SYSCONFDIR@')
|
||||
MODEL_DIR = os.environ.get('AKAZA_MODEL_DIR', '@MODELDIR@')
|
||||
DICTIONARY_DIR = os.environ.get('AKAZA_DICTIONARY_DIR', '@DICTIONARYDIR@')
|
@ -15,12 +15,13 @@ import pathlib
|
||||
|
||||
from jaconv import jaconv
|
||||
|
||||
from comb import combromkan
|
||||
from comb.engine import Comb
|
||||
from comb.node import Node
|
||||
from comb.user_language_model import UserLanguageModel
|
||||
from comb.system_dict import SystemDict
|
||||
from comb.user_dict import load_user_dict_from_json_config
|
||||
from akaza import akazaromkan
|
||||
from akaza.engine import Comb
|
||||
from akaza.node import Node
|
||||
from akaza.user_language_model import UserLanguageModel
|
||||
from akaza.system_dict import SystemDict
|
||||
from akaza.user_dict import load_user_dict_from_json_config
|
||||
from akaza.config import MODEL_DIR
|
||||
|
||||
MODE_KANA = 1
|
||||
MODE_ALPHA = 2
|
||||
@ -37,13 +38,14 @@ for n in range(1, 10):
|
||||
numpad_keys.append(getattr(IBus, 'KP_0'))
|
||||
del n
|
||||
|
||||
configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-comb')
|
||||
configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-akaza')
|
||||
pathlib.Path(os.path.join(configdir, 'user-dict')).mkdir(parents=True, exist_ok=True)
|
||||
logging.info(f"Loading user dictionary: {configdir}")
|
||||
user_language_model = UserLanguageModel(os.path.join(configdir, 'user-dict'))
|
||||
logging.info("Loaded user dictionary")
|
||||
|
||||
system_dict = SystemDict.create()
|
||||
system_dict_path = os.path.join(DICTIONARY_DIR, 'system_dict.trie')
|
||||
system_dict = SystemDict(system_dict_path)
|
||||
logging.info("Loaded system dictionary")
|
||||
|
||||
try:
|
||||
@ -60,7 +62,10 @@ try:
|
||||
else:
|
||||
logging.info(f"'{user_dict_conf_path}' does not exist.")
|
||||
|
||||
comb = Comb(user_language_model, system_dict, user_dict)
|
||||
system_language_model_path = f"{MODEL_DIR}/system_language_model.trie"
|
||||
system_language_model = SystemLanguageModel.create(system_language_model_path)
|
||||
|
||||
akaza = Comb(user_language_model, system_dict, user_dict, system_language_model)
|
||||
logging.info("Finished Comb.")
|
||||
except:
|
||||
logging.error("Cannot initialize.", exc_info=True)
|
||||
@ -71,27 +76,27 @@ except:
|
||||
# the engine
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
class CombIBusEngine(IBus.Engine):
|
||||
class AkazaIBusEngine(IBus.Engine):
|
||||
user_language_model: UserLanguageModel
|
||||
current_clause: int
|
||||
node_selected: Dict[int, int]
|
||||
clauses: List[List[Node]]
|
||||
prop_list: IBus.PropList
|
||||
comb: Comb
|
||||
akaza: Comb
|
||||
mode: int
|
||||
force_selected_clause: List[slice]
|
||||
|
||||
__gtype_name__ = 'CombIBusEngine'
|
||||
# GObject type name under which this engine class is registered. It must be
# exactly the string the factory resolves with
# GObject.type_from_name("AkazaIBusEngine"), otherwise the lookup fails.
__gtype_name__ = 'AkazaIBusEngine'
|
||||
|
||||
def __init__(self):
|
||||
super(CombIBusEngine, self).__init__()
|
||||
super(AkazaIBusEngine, self).__init__()
|
||||
self.is_invalidate = False
|
||||
# 未確定文字列。
|
||||
self.preedit_string = ''
|
||||
# 候補文字列
|
||||
self.lookup_table = IBus.LookupTable.new(page_size=10, cursor_pos=0, cursor_visible=True, round=True)
|
||||
self.prop_list = IBus.PropList()
|
||||
self.comb = comb
|
||||
self.akaza = akaza
|
||||
self.user_language_model = user_language_model
|
||||
self.user_dict = user_dict
|
||||
self.logger = logging.getLogger(__name__)
|
||||
@ -168,7 +173,7 @@ class CombIBusEngine(IBus.Engine):
|
||||
self.commit_candidate()
|
||||
else:
|
||||
# 無変換状態では、ひらがなに変換してコミットします。
|
||||
self.commit_string(combromkan.to_hiragana(self.preedit_string))
|
||||
self.commit_string(akazaromkan.to_hiragana(self.preedit_string))
|
||||
return True
|
||||
elif keyval == IBus.Escape:
|
||||
self.preedit_string = ''
|
||||
@ -289,7 +294,7 @@ class CombIBusEngine(IBus.Engine):
|
||||
self.logger.info("Convert to full katakana")
|
||||
|
||||
# カタカナ候補のみを表示するようにする。
|
||||
hira = combromkan.to_hiragana(self.preedit_string)
|
||||
hira = akazaromkan.to_hiragana(self.preedit_string)
|
||||
kata = jaconv.hira2kata(hira)
|
||||
|
||||
self.convert_to_single(hira, kata)
|
||||
@ -298,14 +303,14 @@ class CombIBusEngine(IBus.Engine):
|
||||
self.logger.info("Convert to full hiragana")
|
||||
|
||||
# Show only the hiragana candidate.
|
||||
hira = combromkan.to_hiragana(self.preedit_string)
|
||||
hira = akazaromkan.to_hiragana(self.preedit_string)
|
||||
self.convert_to_single(hira, hira)
|
||||
|
||||
def convert_to_half_katakana(self):
|
||||
self.logger.info("Convert to half katakana")
|
||||
|
||||
# 半角カタカナ候補のみを表示するようにする。
|
||||
hira = combromkan.to_hiragana(self.preedit_string)
|
||||
hira = akazaromkan.to_hiragana(self.preedit_string)
|
||||
kata = jaconv.hira2kata(hira)
|
||||
kata = jaconv.z2h(kata)
|
||||
|
||||
@ -315,7 +320,7 @@ class CombIBusEngine(IBus.Engine):
|
||||
self.logger.info("Convert to half romaji")
|
||||
|
||||
# Show only the half-width romaji candidate.
|
||||
hira = combromkan.to_hiragana(self.preedit_string)
|
||||
hira = akazaromkan.to_hiragana(self.preedit_string)
|
||||
romaji = jaconv.z2h(self.preedit_string)
|
||||
|
||||
self.convert_to_single(hira, romaji)
|
||||
@ -323,7 +328,7 @@ class CombIBusEngine(IBus.Engine):
|
||||
def convert_to_full_romaji(self):
|
||||
self.logger.info("Convert to full romaji")
|
||||
|
||||
hira = combromkan.to_hiragana(self.preedit_string)
|
||||
hira = akazaromkan.to_hiragana(self.preedit_string)
|
||||
romaji = jaconv.h2z(self.preedit_string, kana=True, digit=True, ascii=True)
|
||||
|
||||
self.convert_to_single(hira, romaji)
|
||||
@ -530,7 +535,7 @@ class CombIBusEngine(IBus.Engine):
|
||||
def _update_candidates(self):
|
||||
if len(self.preedit_string) > 0:
|
||||
# 変換をかける
|
||||
self.clauses = self.comb.convert(self.preedit_string, self.force_selected_clause)
|
||||
self.clauses = self.akaza.convert(self.preedit_string, self.force_selected_clause)
|
||||
else:
|
||||
self.clauses = []
|
||||
self.create_lookup_table()
|
||||
@ -588,7 +593,7 @@ class CombIBusEngine(IBus.Engine):
|
||||
return
|
||||
|
||||
# 平仮名にする。
|
||||
text = combromkan.to_hiragana(self.preedit_string)
|
||||
text = akazaromkan.to_hiragana(self.preedit_string)
|
||||
self.clauses = [
|
||||
# One node spanning the entire (unconverted) hiragana text, so it starts at
# offset 0 of the reading.
[Node(word=text, yomi=text, start_pos=0)]
|
||||
]
|
@ -1,8 +1,13 @@
|
||||
from comb.ui import CombIBusEngine
|
||||
import os
|
||||
|
||||
os.environ['AKAZA_DICTIONARY_DIR'] = 'model/'
|
||||
os.environ['AKAZA_MODEL_DIR'] = 'model/'
|
||||
|
||||
from akaza.ui import AkazaIBusEngine
|
||||
|
||||
|
||||
def test_extend_clause_right():
|
||||
ui = CombIBusEngine()
|
||||
ui = AkazaIBusEngine()
|
||||
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
|
||||
ui.update_candidates()
|
||||
|
||||
@ -32,7 +37,7 @@ def test_extend_clause_right():
|
||||
|
||||
|
||||
def test_extend_clause_right_most_right():
|
||||
ui = CombIBusEngine()
|
||||
ui = AkazaIBusEngine()
|
||||
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
|
||||
ui.update_candidates()
|
||||
|
||||
@ -56,7 +61,7 @@ def test_extend_clause_right_most_right():
|
||||
|
||||
|
||||
def test_extend_clause_left():
|
||||
ui = CombIBusEngine()
|
||||
ui = AkazaIBusEngine()
|
||||
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
|
||||
ui.update_candidates()
|
||||
|
||||
@ -89,7 +94,7 @@ def test_extend_clause_left():
|
||||
|
||||
|
||||
def test_extend_clause_left_most_left():
|
||||
ui = CombIBusEngine()
|
||||
ui = AkazaIBusEngine()
|
||||
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
|
||||
ui.update_candidates()
|
||||
|
@ -1,3 +0,0 @@
|
||||
[pytest]
|
||||
log_format = %(asctime)s %(levelname)s %(message)s
|
||||
log_date_format = %Y-%m-%d %H:%M:%S
|
@ -1,29 +0,0 @@
|
||||
from comb.engine import parse_skkdict
|
||||
import marisa_trie
|
||||
|
||||
dictionary = parse_skkdict('/usr/share/skk/SKK-JISYO.L', encoding='euc-jp')
|
||||
|
||||
print("START")
|
||||
|
||||
t = []
|
||||
for k, v in dictionary.items():
|
||||
vvv = '/'.join(v).encode('utf-8')
|
||||
t.append((k, vvv))
|
||||
|
||||
trie = marisa_trie.BytesTrie(t)
|
||||
|
||||
print("LOADED")
|
||||
|
||||
|
||||
def gen_latice(s):
|
||||
for n in range(len(s) - 1):
|
||||
print(n)
|
||||
word = s[0:n]
|
||||
print(word)
|
||||
|
||||
|
||||
src = 'ひつようなことは'
|
||||
for prefix in reversed(trie.prefixes(src)):
|
||||
kanjis = trie[prefix][0].decode('utf-8').split('/')
|
||||
for kanji in kanjis:
|
||||
print(kanji + src[len(prefix):])
|
@ -1,28 +0,0 @@
|
||||
from comb.engine import parse_skkdict
|
||||
import pygtrie
|
||||
|
||||
dictionary = parse_skkdict('/usr/share/skk/SKK-JISYO.L', encoding='euc-jp')
|
||||
|
||||
t = pygtrie.CharTrie()
|
||||
|
||||
for k, v in dictionary.items():
|
||||
vvv = '/'.join(v).encode('utf-8')
|
||||
t[k] = v
|
||||
|
||||
print("LOADED")
|
||||
|
||||
|
||||
def gen_latice(s):
|
||||
for n in range(len(s) - 1):
|
||||
print(n)
|
||||
word = s[0:n]
|
||||
print(word)
|
||||
|
||||
|
||||
src = 'じゅうかきんぜい'
|
||||
# print(t.get('じゅうか'))
|
||||
for s in t.prefixes('たんげつ'):
|
||||
print(s)
|
||||
|
||||
# f = src[0]
|
||||
# print(gen_latice(src))
|
4
skkdictutils/.gitignore
vendored
Normal file
4
skkdictutils/.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
/.pytest_cache
|
||||
/dist/
|
||||
*.egg-info
|
||||
build/
|
33
skkdictutils/README.md
Normal file
33
skkdictutils/README.md
Normal file
@ -0,0 +1,33 @@
|
||||
# skkdictutils
|
||||
|
||||
## Current status
|
||||
|
||||
**This library is under development.**
|
||||
|
||||
The API is unstable.
|
||||
|
||||
## Release process
|
||||
|
||||
See https://packaging.python.org/tutorials/packaging-projects/
|
||||
|
||||
## LICENSE
|
||||
|
||||
Copyright (c) 2020 Tokuhiro Matsuno
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
18
skkdictutils/setup.py
Normal file
18
skkdictutils/setup.py
Normal file
@ -0,0 +1,18 @@
|
||||
import setuptools
|
||||
|
||||
setuptools.setup(
|
||||
name="skkdictutils",
|
||||
version="0.0.2",
|
||||
install_requires=['romkan==0.2.1'],
|
||||
extras_require={
|
||||
},
|
||||
entry_points={
|
||||
},
|
||||
packages=setuptools.find_packages(),
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
],
|
||||
python_requires='>=3.6',
|
||||
)
|
@ -1,11 +1,13 @@
|
||||
import re
|
||||
from typing import Dict, List
|
||||
|
||||
from comb import combromkan
|
||||
import romkan
|
||||
|
||||
BOIN = ['a', 'i', 'u', 'e', 'o']
|
||||
__all__ = ['parse_skkdict', 'merge_skkdict', 'ari2nasi', 'write_skkdict', 'expand_okuri']
|
||||
|
||||
LOWER_PATTERN = re.compile('[a-z]')
|
||||
_BOIN = ['a', 'i', 'u', 'e', 'o']
|
||||
|
||||
_LOWER_PATTERN = re.compile('[a-z]')
|
||||
|
||||
|
||||
def parse_skkdict(path: str, encoding: str = 'euc-jp'):
|
||||
@ -64,13 +66,13 @@ def merge_skkdict(dicts: List[Dict[str, List[str]]]) -> Dict[str, List[str]]:
|
||||
|
||||
def expand_okuri(kana: str, kanjis: List[str]):
|
||||
if kana[-1].isalpha():
|
||||
if kana[-1] in BOIN:
|
||||
okuri = combromkan.to_hiragana(kana[-1])
|
||||
if kana[-1] in _BOIN:
|
||||
okuri = romkan.to_hiragana(kana[-1])
|
||||
yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis]
|
||||
else:
|
||||
for b in BOIN:
|
||||
okuri = combromkan.to_hiragana(kana[-1] + b)
|
||||
if LOWER_PATTERN.match(okuri):
|
||||
for b in _BOIN:
|
||||
okuri = romkan.to_hiragana(kana[-1] + b)
|
||||
if _LOWER_PATTERN.match(okuri):
|
||||
# wu のように、変換できないものは無視する。
|
||||
continue
|
||||
yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis]
|
@ -1,4 +1,4 @@
|
||||
from comb.skkdict import merge_skkdict, expand_okuri
|
||||
from skkdictutils import merge_skkdict, expand_okuri
|
||||
|
||||
|
||||
def test_merge_skkdict():
|
Reference in New Issue
Block a user