diff --git a/.gitignore b/.gitignore index 620314f..4a1047b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ __pycache__ /comb.xml /hello.* -/test_graph2.py \ No newline at end of file +/test_graph2.py +/akaza.xml \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 2d8d44e..4150c84 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "model/skk-dev-dict"] path = model/skk-dev-dict url = https://github.com/skk-dev/dict.git +[submodule "akaza-data/skk-dev-dict"] + path = akaza-data/skk-dev-dict + url = https://github.com/skk-dev/dict.git diff --git a/Changes.md b/Changes.md index 2e9876e..7ae3cc9 100644 --- a/Changes.md +++ b/Changes.md @@ -1,3 +1,12 @@ +# 2020-09-14(Mon) + +* comb を akaza に改名した。 +* ibus 関連部分とそれ以外を分離する。以下のようなモジュール構成を目指す。 + * ibus-akaza: ibus 連動部分。 + * akaza-core: 変換コアエンジン。ibus 関連部分と独立させることにより、fcitx との連動を可能にすることを目指す + * skkdictutils: SKK 辞書関連ユーティリティライブラリ。単独利用可能なようにパッケージングし、単独レポジトリに独立させる予定。 + * akaza-data : システム辞書/システム言語モデル + # 2020-09-13(Sun) * Rust で書き直そうかなぁ。。 @@ -14,7 +23,7 @@ * kytea から得られた結果をもとに、平仮名を連結して ngram を作成したが、この結果は惨憺たるものであった。 * DONE: 文節を伸ばす機能が死んでいる。 * ユーザー辞書を設定できるようにしたい。 - * ~/.config/ibus-comb/user-dict.json のなかに設定をいれる。 + * ~/.config/ibus-akaza/user-dict.json のなかに設定をいれる。 * `path/to/dict.txt;format=skk;charset=euc-jp` みたいなフォーマットでいいかなぁ。。JSONでもいいかな。。 * 思ったより、簡単に実装できそう。 diff --git a/Makefile b/Makefile index d4a26fc..786090f 100644 --- a/Makefile +++ b/Makefile @@ -8,79 +8,68 @@ DESTDIR ?= PYTHON ?= /usr/bin/python3 -all: comb.xml comb/config.py comb model/jawiki.1gram +all: akaza.xml akaza/config.py akaza akaza-data/system_language_model.trie akaza-data/system_dict.trie check: python -m py_compile ibus.py - python -m py_compile comb/combromkan.py - python -m py_compile comb/engine.py - python -m py_compile comb/skkdict.py + python -m py_compile akaza/akazaromkan.py + python -m py_compile akaza/engine.py + python -m py_compile akaza/skkdict.py pytest -comb.xml: comb.xml.in - sed -e "s:@PYTHON@:$(PYTHON):g;" \ - -e "s:@DATADIR@:$(DATADIR):g" $< > $@ - -comb/config.py: comb/config.py.in - sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \ - -e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/model:g" \ - -e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \ - $< > $@ - -model/system_language_model.trie: model/bin/create-system_language_model-from-json.py +akaza-data/system_language_model.trie: akaza-data/bin/create-system_language_model-from-json.py make -C model system_language_model.trie -model/system_dict.trie: +akaza-data/system_dict.trie: make -C model system_dict.trie -install-dict: model/system_dict.trie - install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/dictionary - install -p -m 0644 model/system_dict.trie $(DESTDIR)$(DATADIR)/ibus-comb/dictionary/ +install-data: model/system_dict.trie + install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary + install -p -m 0644 model/system_dict.trie $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary/ + install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-akaza/model/ -install: all comb/config.py model/jawiki.1gram install-dict - install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary - install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/ +install: all akaza/config.py model/system_dict.trie install-data + install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/akaza $(DESTDIR)$(SYSCONFDIR)/xdg/akaza $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-akaza/model $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary - install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb - install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/graph.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/node.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/config.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/skkdict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/combromkan.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-comb - install -m 0644 comb/engine.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/ui.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/user_language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/system_language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/system_dict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb/user_dict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/ - install -m 0644 comb.xml $(DESTDIR)$(DATADIR)/ibus/component + install -m 0644 akaza.svg $(DESTDIR)$(DATADIR)/ibus-akaza + install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-akaza + install -m 0644 akaza.xml $(DESTDIR)$(DATADIR)/ibus/component + + install -m 0644 akaza/__init__.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/graph.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/node.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/config.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/skkdict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/akazaromkan.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/engine.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/ui.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/user_language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/system_language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/system_dict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ + install -m 0644 akaza/user_dict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ uninstall: - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb.svg - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/config.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/engine.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/skkdict.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/combromkan.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/graph.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/language_model.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/node.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/ui.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_language_model.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_language_model.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py - rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie - rmdir $(DESTDIR)$(DATADIR)/ibus-comb - rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb - rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml + + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza + rm -f $(DESTDIR)$(DATADIR)/ibus/component/akaza.xml + + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/engine.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/skkdict.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/akazaromkan.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/graph.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/language_model.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/node.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ui.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/user_language_model.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/system_language_model.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/user_dict.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/system_dict.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/model/system_language_model.trie + rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/akaza clean: - rm -f comb.xml - rm -f comb/config.py + rm -f akaza.xml + rm -f akaza/config.py .PHONY: all check install uninstall - diff --git a/Note.md b/Note.md index a6d4c27..79fe7d0 100644 --- a/Note.md +++ b/Note.md @@ -12,7 +12,7 @@ * LOUDS はサイズが小さくなるが、動的な追加削除はできない。 * 検索速度は Double Array のほうが速い - * comb には何に trie を作成しているのか? + * akaza には何に trie を作成しているのか? * ユーザー辞書とシステム辞書とシステム言語モデルに利用している。 * ただし、システム言語モデルは、純粋なキーからの検索しかしていない。 diff --git a/README.md b/README.md index a429b64..c772805 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# ibus-comb +# ibus-akaza Yet another kana-kanji-converter on IBus, written in Python. diff --git a/TODO.md b/TODO.md index b2bdee7..5069a99 100644 --- a/TODO.md +++ b/TODO.md @@ -7,7 +7,6 @@ ## Priority mid - support 3gram(必要?) -- 2gram のデータがデカすぎる。libkkc と同等の圧縮をすべき - ユーザー言語モデル学習機 from text file or web. - クローラーをかく? - ユーザー辞書機能を実装する @@ -38,6 +37,7 @@ - 共起的なスコアをいれたい? - 青空文庫をコーパスとして使う? - 古くさすぎるかも +- 言語モデルを小さくできないか? # DONE @@ -45,7 +45,7 @@ - カタカナ語辞書の作成 - 連文節変換用の UI を実装する - Function key とかのショートカットで、全部カタカナにすることができるように。 - - ibus-comb がバグってた時に便利。 + - ibus-akaza がバグってた時に便利。 - 末尾のアルファベット一文字は、変換しない。 - 前向きDP後ろ向きA* で候補を得る - 平仮名語辞書もいるのかもしれない。 diff --git a/akaza-core/README.md b/akaza-core/README.md new file mode 100644 index 0000000..bfea4e3 --- /dev/null +++ b/akaza-core/README.md @@ -0,0 +1,20 @@ +# Akaza + +## What's this? + +Yet another kana-kanji conversion system written in Python 3. + +## How do I use it? + +### Use as a library + + system_language_model = SystemLanguageModel.create('path/to/system_language_model.trie') + system_dict = SystemDictionary.create('path/to/system_language_model.trie') + akaza = Akaza( + system_language_model = system_language_model, + system_dict: system_dict, + user_language_model: user_language_model, + user_dict: user_dict, + ) + print(akaza.convert('watasinonamaehanakanodesu.')) + # → 私の名前は中野です。 diff --git a/akaza-core/akaza.egg-info/PKG-INFO b/akaza-core/akaza.egg-info/PKG-INFO new file mode 100644 index 0000000..fb34afd --- /dev/null +++ b/akaza-core/akaza.egg-info/PKG-INFO @@ -0,0 +1,9 @@ +Metadata-Version: 2.1 +Name: akaza +Version: 0.0.1 +Summary: UNKNOWN +Home-page: UNKNOWN +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN +Provides-Extra: develop diff --git a/akaza-core/akaza.egg-info/SOURCES.txt b/akaza-core/akaza.egg-info/SOURCES.txt new file mode 100644 index 0000000..f148955 --- /dev/null +++ b/akaza-core/akaza.egg-info/SOURCES.txt @@ -0,0 +1,6 @@ +setup.py +akaza.egg-info/PKG-INFO +akaza.egg-info/SOURCES.txt +akaza.egg-info/dependency_links.txt +akaza.egg-info/requires.txt +akaza.egg-info/top_level.txt \ No newline at end of file diff --git a/akaza-core/akaza.egg-info/dependency_links.txt b/akaza-core/akaza.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/akaza-core/akaza.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/akaza-core/akaza.egg-info/requires.txt b/akaza-core/akaza.egg-info/requires.txt new file mode 100644 index 0000000..909700e --- /dev/null +++ b/akaza-core/akaza.egg-info/requires.txt @@ -0,0 +1,6 @@ +marisa-trie==0.7.5 +jaconv==0.2.4 + +[develop] +dev-packageA +dev-packageB diff --git a/akaza-core/akaza.egg-info/top_level.txt b/akaza-core/akaza.egg-info/top_level.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/akaza-core/akaza.egg-info/top_level.txt @@ -0,0 +1 @@ + diff --git a/comb/.gitignore b/akaza-core/akaza/.gitignore similarity index 100% rename from comb/.gitignore rename to akaza-core/akaza/.gitignore diff --git a/akaza-core/akaza/__init__.py b/akaza-core/akaza/__init__.py new file mode 100644 index 0000000..77f1d2a --- /dev/null +++ b/akaza-core/akaza/__init__.py @@ -0,0 +1,3 @@ +from . import akaza + +Akaza = akaza.Akaza diff --git a/comb/engine.py b/akaza-core/akaza/akaza.py similarity index 78% rename from comb/engine.py rename to akaza-core/akaza/akaza.py index e00a50c..6d20ff4 100644 --- a/comb/engine.py +++ b/akaza-core/akaza/akaza.py @@ -6,26 +6,29 @@ from typing import List, Any, Optional import jaconv -from comb import combromkan -from comb.graph import graph_construct, viterbi, lookup -from comb.language_model import LanguageModel -from comb.node import Node -from comb.system_dict import SystemDict -from comb.system_language_model import SystemLanguageModel -from comb.user_dict import UserDict -from comb.user_language_model import UserLanguageModel +from akaza import akazaromkan +from akaza.graph import graph_construct, viterbi, lookup +from akaza.language_model import LanguageModel +from akaza.node import Node +from akaza.system_dict import SystemDict +from akaza.system_language_model import SystemLanguageModel +from akaza.user_dict import UserDict +from akaza.user_language_model import UserLanguageModel # 子音だが、N は NN だと「ん」になるので処理しない。 TRAILING_CONSONANT_PATTERN = re.compile(r'^(.*?)([qwrtypsdfghjklzxcvbm]+)$') -class Comb: +class Akaza: user_dict: Optional[UserDict] logger: Logger dictionaries: List[Any] - def __init__(self, user_language_model: UserLanguageModel, system_dict: SystemDict, - user_dict: Optional[UserDict], + def __init__(self, + system_language_model: SystemLanguageModel, + system_dict: SystemDict, + user_language_model: UserLanguageModel, + user_dict: Optional[UserDict] = None, logger: Logger = logging.getLogger(__name__)): assert user_language_model self.logger = logger @@ -34,8 +37,6 @@ class Comb: self.system_dict = system_dict self.user_dict = user_dict - system_language_model = SystemLanguageModel.create() - self.language_model = LanguageModel(system_language_model, user_language_model) # 連文節変換するバージョン。 @@ -52,7 +53,7 @@ class Comb: ) ]] - hiragana: str = combromkan.to_hiragana(src) + hiragana: str = akazaromkan.to_hiragana(src) # 末尾の子音を変換対象外とする。 m = TRAILING_CONSONANT_PATTERN.match(hiragana) diff --git a/comb/combromkan.py b/akaza-core/akaza/akazaromkan.py similarity index 100% rename from comb/combromkan.py rename to akaza-core/akaza/akazaromkan.py diff --git a/comb/graph.py b/akaza-core/akaza/graph.py similarity index 97% rename from comb/graph.py rename to akaza-core/akaza/graph.py index ac81256..75518c5 100644 --- a/comb/graph.py +++ b/akaza-core/akaza/graph.py @@ -5,11 +5,11 @@ from typing import Dict, List, Optional import jaconv -from comb.language_model import LanguageModel -from comb.node import Node -from comb.system_dict import SystemDict -from comb.user_dict import UserDict -from comb.user_language_model import UserLanguageModel +from akaza.language_model import LanguageModel +from akaza.node import Node +from akaza.system_dict import SystemDict +from akaza.user_dict import UserDict +from akaza.user_language_model import UserLanguageModel class Graph: diff --git a/comb/language_model.py b/akaza-core/akaza/language_model.py similarity index 90% rename from comb/language_model.py rename to akaza-core/akaza/language_model.py index 0d8e07f..acad352 100644 --- a/comb/language_model.py +++ b/akaza-core/akaza/language_model.py @@ -4,9 +4,9 @@ import math import marisa_trie -from comb.node import Node -from comb.system_language_model import SystemLanguageModel -from comb.user_language_model import UserLanguageModel +from akaza.node import Node +from akaza.system_language_model import SystemLanguageModel +from akaza.user_language_model import UserLanguageModel diff --git a/comb/node.py b/akaza-core/akaza/node.py similarity index 100% rename from comb/node.py rename to akaza-core/akaza/node.py diff --git a/comb/system_dict.py b/akaza-core/akaza/system_dict.py similarity index 74% rename from comb/system_dict.py rename to akaza-core/akaza/system_dict.py index abe0274..8d8c719 100644 --- a/comb/system_dict.py +++ b/akaza-core/akaza/system_dict.py @@ -1,11 +1,8 @@ import logging -import os import marisa_trie from marisa_trie import BytesTrie -from comb.config import DICTIONARY_DIR - class SystemDict: _trie: BytesTrie @@ -18,11 +15,6 @@ class SystemDict: trie.mmap(path) self._trie = trie - @staticmethod - def create(): - path = os.path.join(DICTIONARY_DIR, 'system_dict.trie') - return SystemDict(path) - def prefixes(self, key): return self._trie.prefixes(key) diff --git a/akaza-core/akaza/system_language_model.py b/akaza-core/akaza/system_language_model.py new file mode 100644 index 0000000..ac04b1e --- /dev/null +++ b/akaza-core/akaza/system_language_model.py @@ -0,0 +1,29 @@ +import math + +import marisa_trie + +from akaza.node import Node + +DEFAULT_SCORE = [(math.log10(0.00000000001),)] + + +class SystemLanguageModel: + def __init__(self, score: marisa_trie.RecordTrie, default_score=None): + self.default_score = DEFAULT_SCORE if default_score is None else default_score + self.score = score + + @staticmethod + def create(path: str, default_score=None): + score = marisa_trie.RecordTrie('@f') + score.mmap(path) + + return SystemLanguageModel( + score=score, + default_score=DEFAULT_SCORE if default_score is None else default_score + ) + + def get_unigram_cost(self, key: str) -> float: + return self.score.get(key, self.default_score)[0][0] + + def get_bigram_cost(self, key1: str, key2: str) -> float: + return self.score.get(key1 + "\t" + key2, self.default_score)[0][0] diff --git a/comb/user_dict.py b/akaza-core/akaza/user_dict.py similarity index 93% rename from comb/user_dict.py rename to akaza-core/akaza/user_dict.py index b16d76b..5fad62f 100644 --- a/comb/user_dict.py +++ b/akaza-core/akaza/user_dict.py @@ -3,7 +3,7 @@ from typing import List, Dict import marisa_trie -from comb.skkdict import parse_skkdict, merge_skkdict, ari2nasi +from skkdictutils import parse_skkdict, merge_skkdict, ari2nasi class UserDict: @@ -41,7 +41,6 @@ def load_user_dict_from_json_config(path: str) -> UserDict: t = [] for k, v in merged.items(): t.append((k, '/'.join(v).encode('utf-8'))) - print(t) trie = marisa_trie.BytesTrie(t) return UserDict(trie) diff --git a/comb/user_language_model.py b/akaza-core/akaza/user_language_model.py similarity index 99% rename from comb/user_language_model.py rename to akaza-core/akaza/user_language_model.py index e48aa0b..0200323 100644 --- a/comb/user_language_model.py +++ b/akaza-core/akaza/user_language_model.py @@ -5,7 +5,7 @@ from typing import List, Dict, Optional from atomicwrites import atomic_write -from comb.node import Node +from akaza.node import Node # ユーザーの言語モデル。 diff --git a/requires.txt b/akaza-core/requires.txt similarity index 62% rename from requires.txt rename to akaza-core/requires.txt index b013650..9a8b3ac 100644 --- a/requires.txt +++ b/akaza-core/requires.txt @@ -1,2 +1,3 @@ marisa-trie=0.7.5 jaconv==0.2.4 +skkdictutils=0.0.2 diff --git a/akaza-core/setup.py b/akaza-core/setup.py new file mode 100644 index 0000000..4409fbe --- /dev/null +++ b/akaza-core/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup + +setup( + name="akaza", + version="0.0.1", + install_requires=["marisa-trie==0.7.5", "jaconv==0.2.4"], + extras_require={ + "develop": ["dev-packageA", "dev-packageB"] + }, + entry_points={ + } +) diff --git a/comb/__init__.py b/akaza-core/tests/__init__.py similarity index 100% rename from comb/__init__.py rename to akaza-core/tests/__init__.py diff --git a/tests/test_combromkan.py b/akaza-core/tests/test_combromkan.py similarity index 94% rename from tests/test_combromkan.py rename to akaza-core/tests/test_combromkan.py index 7dd5ee0..725e320 100644 --- a/tests/test_combromkan.py +++ b/akaza-core/tests/test_combromkan.py @@ -1,6 +1,6 @@ import pytest -from comb.combromkan import to_hiragana +from akaza.akazaromkan import to_hiragana def test_foo(): diff --git a/tests/test_engine.py b/akaza-core/tests/test_engine.py similarity index 59% rename from tests/test_engine.py rename to akaza-core/tests/test_engine.py index 9d98bce..2cb32c1 100644 --- a/tests/test_engine.py +++ b/akaza-core/tests/test_engine.py @@ -1,16 +1,26 @@ from tempfile import NamedTemporaryFile +import os + import pytest -from comb.engine import Comb -from comb.system_dict import SystemDict -from comb.user_language_model import UserLanguageModel +from akaza import Akaza +from akaza.system_dict import SystemDict +from akaza.user_language_model import UserLanguageModel +from akaza.system_language_model import SystemLanguageModel tmpfile = NamedTemporaryFile(delete=False) user_language_model = UserLanguageModel(tmpfile.name) -system_dict = SystemDict.create() +system_dict = SystemDict('../akaza-data/system_dict.trie') -comb = Comb(user_language_model=user_language_model, system_dict=system_dict, user_dict=None) +system_language_model = SystemLanguageModel.create('../akaza-data/system_language_model.trie') + +akaza = Akaza( + user_language_model=user_language_model, + system_dict=system_dict, + user_dict=None, + system_language_model=system_language_model +) @pytest.mark.parametrize('src, expected', [ @@ -25,13 +35,13 @@ comb = Comb(user_language_model=user_language_model, system_dict=system_dict, us ('IME', 'IME'), ]) def test_wnn(src, expected): - clauses = comb.convert(src) + clauses = akaza.convert(src) got = ''.join([clause[0].word for clause in clauses]) assert got == expected def test_wnn2(): - clauses = comb.convert("わたし") + clauses = akaza.convert("わたし") hiragana_len = len([True for node in clauses[0] if node.word == 'わたし']) for node in clauses[0]: print(node) diff --git a/tests/test_graph.py b/akaza-core/tests/test_graph.py similarity index 74% rename from tests/test_graph.py rename to akaza-core/tests/test_graph.py index cda0112..fcab35c 100644 --- a/tests/test_graph.py +++ b/akaza-core/tests/test_graph.py @@ -1,30 +1,22 @@ -from tempfile import TemporaryDirectory - -from comb.combromkan import to_hiragana -import pytest -import marisa_trie -from comb.system_dict import SystemDict -from comb.graph import lookup, graph_construct, viterbi -from comb.language_model import LanguageModel import logging +from tempfile import TemporaryDirectory +import os +import pytest -from comb.system_language_model import SystemLanguageModel -from comb.user_language_model import UserLanguageModel +from akaza.graph import lookup, graph_construct, viterbi +from akaza.language_model import LanguageModel +from akaza.system_dict import SystemDict +from akaza.system_language_model import SystemLanguageModel +from akaza.user_language_model import UserLanguageModel -unigram_score = marisa_trie.RecordTrie('@f') -unigram_score.load('model/jawiki.1gram') - -bigram_score = marisa_trie.RecordTrie('@f') -bigram_score.load('model/jawiki.2gram') - -system_language_model = SystemLanguageModel(unigram_score, bigram_score) +system_language_model = SystemLanguageModel.create('../akaza-data/system_language_model.trie') tmpdir = TemporaryDirectory() user_language_model = UserLanguageModel(tmpdir.name) language_model = LanguageModel(system_language_model, user_language_model=user_language_model) -system_dict = SystemDict('model/system_dict.trie') +system_dict = SystemDict('../akaza-data/system_dict.trie') logging.basicConfig(level=logging.DEBUG) diff --git a/tests/test_system_dict.py b/akaza-core/tests/test_system_dict.py similarity index 62% rename from tests/test_system_dict.py rename to akaza-core/tests/test_system_dict.py index bf4f6ae..361a89b 100644 --- a/tests/test_system_dict.py +++ b/akaza-core/tests/test_system_dict.py @@ -1,6 +1,6 @@ -from comb.system_dict import SystemDict +from akaza.system_dict import SystemDict -system_dict = SystemDict('model/system_dict.trie') +system_dict = SystemDict('../akaza-data/system_dict.trie') def test_system_dict(): diff --git a/tests/test_user_dict.py b/akaza-core/tests/test_user_dict.py similarity index 80% rename from tests/test_user_dict.py rename to akaza-core/tests/test_user_dict.py index a9eaddb..400ba1b 100644 --- a/tests/test_user_dict.py +++ b/akaza-core/tests/test_user_dict.py @@ -1,14 +1,7 @@ -from tempfile import NamedTemporaryFile, TemporaryDirectory +from tempfile import TemporaryDirectory -from comb.node import Node -from comb.user_language_model import UserLanguageModel -import marisa_trie - -unigram_score = marisa_trie.RecordTrie('@f') -unigram_score.load('model/jawiki.1gram') - -bigram_score = marisa_trie.RecordTrie('@f') -bigram_score.load('model/jawiki.2gram') +from akaza.node import Node +from akaza.user_language_model import UserLanguageModel def test_read(): diff --git a/model/.gitignore b/akaza-data/.gitignore similarity index 100% rename from model/.gitignore rename to akaza-data/.gitignore diff --git a/model/Makefile b/akaza-data/Makefile similarity index 95% rename from model/Makefile rename to akaza-data/Makefile index 03fca75..e2777ed 100644 --- a/model/Makefile +++ b/akaza-data/Makefile @@ -1,4 +1,4 @@ -all: jawiki.1gram system_dict.trie +all: system_dict.trie system_language_model.trie jawiki-latest-pages-articles.xml.bz2: wget --no-verbose --no-clobber -O jawiki-latest-pages-articles.xml.bz2 https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 @@ -32,4 +32,3 @@ system_dict.trie: jawiki.vocab python bin/make-system-dict.py .PHONY: all - diff --git a/akaza-data/README.md b/akaza-data/README.md new file mode 100644 index 0000000..ae8b43f --- /dev/null +++ b/akaza-data/README.md @@ -0,0 +1,14 @@ +# akaza-data + +## What's this? + +System dictionary/language model package for Akaza. + +## PyPI's size limit + +*The default size limit on PyPI is 60MB* + + * [unidic-lite](https://www.dampfkraft.com/code/distributing-large-files-with-pypi.html) + +## See also + diff --git a/model/bin/create-system_language_model-from-json.py b/akaza-data/bin/create-system_language_model-from-json.py similarity index 100% rename from model/bin/create-system_language_model-from-json.py rename to akaza-data/bin/create-system_language_model-from-json.py diff --git a/model/bin/dumpngram.py b/akaza-data/bin/dumpngram.py similarity index 100% rename from model/bin/dumpngram.py rename to akaza-data/bin/dumpngram.py diff --git a/model/bin/make-system-dict.py b/akaza-data/bin/make-system-dict.py similarity index 96% rename from model/bin/make-system-dict.py rename to akaza-data/bin/make-system-dict.py index d77610a..234df65 100644 --- a/model/bin/make-system-dict.py +++ b/akaza-data/bin/make-system-dict.py @@ -5,7 +5,7 @@ import marisa_trie sys.path.append('../') -from comb.skkdict import parse_skkdict, merge_skkdict, ari2nasi +from skkdictutils import parse_skkdict, merge_skkdict, ari2nasi # jawiki.vocab から system_dict.trie を作成する。 diff --git a/model/bin/wiki2text-runner.py b/akaza-data/bin/wiki2text-runner.py similarity index 100% rename from model/bin/wiki2text-runner.py rename to akaza-data/bin/wiki2text-runner.py diff --git a/model/bin/wiki2text.py b/akaza-data/bin/wiki2text.py similarity index 96% rename from model/bin/wiki2text.py rename to akaza-data/bin/wiki2text.py index 4069ff4..e493e03 100644 --- a/model/bin/wiki2text.py +++ b/akaza-data/bin/wiki2text.py @@ -1,5 +1,6 @@ import logging import os +import pathlib import sys from Mykytea import Mykytea import re @@ -68,6 +69,9 @@ def main(): total = len(sys.argv[1:]) for ifile in sys.argv[1:]: ofile = ifile.replace('text/', 'dat/') + + pathlib.Path(ofile).parent.mkdir(parents=True, exist_ok=True) + logging.info(f"[{os.getpid()}] {ifile} -> {ofile} ({count}/{total})") with open(ifile, 'r') as rfp, \ open(ofile, 'w') as wfp: diff --git a/akaza-data/requirements.txt b/akaza-data/requirements.txt new file mode 100644 index 0000000..363c6b1 --- /dev/null +++ b/akaza-data/requirements.txt @@ -0,0 +1 @@ +skkdictutils>=0.0.2 diff --git a/model/skk-dev-dict b/akaza-data/skk-dev-dict similarity index 100% rename from model/skk-dev-dict rename to akaza-data/skk-dev-dict diff --git a/model/utils/dump-system-dict.py b/akaza-data/utils/dump-system-dict.py similarity index 100% rename from model/utils/dump-system-dict.py rename to akaza-data/utils/dump-system-dict.py diff --git a/model/utils/dump2gram.py b/akaza-data/utils/dump2gram.py similarity index 100% rename from model/utils/dump2gram.py rename to akaza-data/utils/dump2gram.py diff --git a/comb/config.py.in b/comb/config.py.in deleted file mode 100644 index 54e0c07..0000000 --- a/comb/config.py.in +++ /dev/null @@ -1,3 +0,0 @@ -SYS_CONF_DIR = '@SYSCONFDIR@' -MODEL_DIR = '@MODELDIR@' -DICTIONARY_DIR = '@DICTIONARYDIR@' diff --git a/comb/system_language_model.py b/comb/system_language_model.py deleted file mode 100644 index 23ce14e..0000000 --- a/comb/system_language_model.py +++ /dev/null @@ -1,29 +0,0 @@ -import math - -import marisa_trie - -from comb.config import MODEL_DIR -from comb.node import Node - -DEFAULT_SCORE = [(math.log10(0.00000000001),)] - - -class SystemLanguageModel: - def __init__(self, score: marisa_trie.RecordTrie): - self.score = score - - @staticmethod - def create(): - score = marisa_trie.RecordTrie('@f') - score.mmap(f"{MODEL_DIR}/system_language_model.trie") - - return SystemLanguageModel(score) - - def get_unigram_cost(self, key: str) -> float: - return self.score.get(key, DEFAULT_SCORE)[0][0] - - def get_bigram_cost(self, node1: Node, node2: Node) -> float: - key1 = node1.get_key() - key2 = node2.get_key() - key = key1 + "\t" + key2 - return self.score.get(key, DEFAULT_SCORE)[0][0] diff --git a/ibus-akaza/.gitignore b/ibus-akaza/.gitignore new file mode 100644 index 0000000..d9c3e9f --- /dev/null +++ b/ibus-akaza/.gitignore @@ -0,0 +1 @@ +/akaza.xml \ No newline at end of file diff --git a/ibus-akaza/Makefile b/ibus-akaza/Makefile new file mode 100644 index 0000000..be612d4 --- /dev/null +++ b/ibus-akaza/Makefile @@ -0,0 +1,45 @@ +PREFIX ?= /usr +SYSCONFDIR ?= /etc +DATADIR ?= $(PREFIX)/share +DESTDIR ?= + +PYTHON ?= /usr/bin/python3 + +all: akaza.xml ibus_akaza/config.py + +install: ibus_akaza/config.py akaza.xml + install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza $(DESTDIR)$(SYSCONFDIR)/xdg/akaza $(DESTDIR)$(DATADIR)/ibus/component + + install -m 0644 akaza.svg $(DESTDIR)$(DATADIR)/ibus-akaza + install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-akaza + install -m 0644 ibus_akaza/ui.py $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/ + install -m 0644 ibus_akaza/config.py $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/ + install -m 0644 akaza.xml $(DESTDIR)$(DATADIR)/ibus/component + +ibus_akaza/config.py: ibus_akaza/config.py.in + sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \ + -e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/ibus-akaza/model:g" \ + -e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-akaza/dictionary:g" \ + $< > $@ + +akaza.xml: akaza.xml.in + sed -e "s:@PYTHON@:$(PYTHON):g;" \ + -e "s:@DATADIR@:$(DATADIR):g" $< > $@ + +check: + python -m py_compile ibus.py + python -m py_compile ibus_akaza/ui.py + +uninstall: + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza.svg + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/ui.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/ibus_akaza/config.py + rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/ibus.py + rmdir $(DESTDIR)$(DATADIR)/ibus-akaza + + +clean: + rm -f akaza.xml + +.PHONY: all check install uninstall clean + diff --git a/comb.xml.in b/ibus-akaza/akaza.xml.in similarity index 59% rename from comb.xml.in rename to ibus-akaza/akaza.xml.in index 4549b5f..061b858 100644 --- a/comb.xml.in +++ b/ibus-akaza/akaza.xml.in @@ -1,23 +1,23 @@ - + - org.freedesktop.IBus.Comb - Comb - kana kanji converter + org.freedesktop.IBus.Akaza + Akaza - kana kanji converter 0.0.1 GPL Tokuhiro Matsuno <tokuhirom@gmail.com> - https://github.com/tokuhirom/ibus-comb - @PYTHON@ @DATADIR@/ibus-comb/ibus.py --ibus - comb + https://github.com/tokuhirom/ibus-akaza + @PYTHON@ @DATADIR@/ibus-akaza/ibus.py --ibus + akaza - comb - comb - Comb - Kana Kanji Converter + akaza + akaza + Akaza - Kana Kanji Converter ja GPL Tokuhiro Matsuno <tokuhirom@gmail.com> - @DATADIR@/ibus-comb/comb.svg + @DATADIR@/ibus-akaza/akaza.svg us diff --git a/comb.svg b/ibus-akaza/comb.svg similarity index 100% rename from comb.svg rename to ibus-akaza/comb.svg diff --git a/ibus.py b/ibus-akaza/ibus.py similarity index 86% rename from ibus.py rename to ibus-akaza/ibus.py index cd35ac4..67aac54 100755 --- a/ibus.py +++ b/ibus-akaza/ibus.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# ibus-comb: ibus engine for japanese characters +# ibus-akaza: ibus engine for japanese characters # # Copyright (c) 2020 Tokuhiro Matsuno # @@ -33,10 +33,11 @@ import getopt import locale import logging -logging.basicConfig(level=logging.DEBUG, filename='/tmp/ibus-comb.log', filemode='w') -logging.info("Loading ibus-comb") +# TODO: remove log file generation +logging.basicConfig(level=logging.DEBUG, filename='/tmp/ibus-akaza.log', filemode='w') +logging.info("Loading ibus-akaza") -libpath = os.path.join(os.path.dirname(__file__), "comb") +libpath = os.path.join(os.path.dirname(__file__), "akaza") logging.info(f"library path: {libpath}") sys.path.append(libpath) @@ -56,23 +57,23 @@ class IMApp: logging.info("Loading IMApp") - from comb.ui import CombIBusEngine + from akaza.ui import AkazaIBusEngine self.mainloop = GLib.MainLoop() self.bus = IBus.Bus() self.bus.connect("disconnected", self.bus_disconnected_cb) self.factory = IBus.Factory.new(self.bus.get_connection()) - self.factory.add_engine("comb", GObject.type_from_name("CombIBusEngine")) + self.factory.add_engine("akaza", GObject.type_from_name("AkazaIBusEngine")) if exec_by_ibus: self.bus.request_name("org.freedesktop.IBus.Comb", 0) else: - xml_path = os.path.join(__base_dir__, 'comb.xml') + xml_path = os.path.join(__base_dir__, 'AkazaIBusEngine.xml') if os.path.exists(xml_path): component = IBus.Component.new_from_file(xml_path) else: xml_path = os.path.join(os.path.dirname(__base_dir__), - 'ibus', 'component', 'comb.xml') + 'ibus', 'component', 'akaza.xml') component = IBus.Component.new_from_file(xml_path) self.bus.register_component(component) diff --git a/ibus-akaza/ibus_akaza/.gitignore b/ibus-akaza/ibus_akaza/.gitignore new file mode 100644 index 0000000..fe9e6f5 --- /dev/null +++ b/ibus-akaza/ibus_akaza/.gitignore @@ -0,0 +1 @@ +/config.py \ No newline at end of file diff --git a/ibus-akaza/ibus_akaza/config.py.in b/ibus-akaza/ibus_akaza/config.py.in new file mode 100644 index 0000000..4d6e1ba --- /dev/null +++ b/ibus-akaza/ibus_akaza/config.py.in @@ -0,0 +1,5 @@ +import os + +SYS_CONF_DIR = os.environ.get('AKAZA_SYSCONF_DIR', '@SYSCONFDIR@') +MODEL_DIR = os.environ.get('AKAZA_MODEL_DIR', '@MODELDIR@') +DICTIONARY_DIR = os.environ.get('AKAZA_DICTIONARY_DIR', '@DICTIONARYDIR@') diff --git a/comb/ui.py b/ibus-akaza/ibus_akaza/ui.py similarity index 94% rename from comb/ui.py rename to ibus-akaza/ibus_akaza/ui.py index 8f4f1fc..f6d2271 100644 --- a/comb/ui.py +++ b/ibus-akaza/ibus_akaza/ui.py @@ -15,12 +15,13 @@ import pathlib from jaconv import jaconv -from comb import combromkan -from comb.engine import Comb -from comb.node import Node -from comb.user_language_model import UserLanguageModel -from comb.system_dict import SystemDict -from comb.user_dict import load_user_dict_from_json_config +from akaza import akazaromkan +from akaza.engine import Comb +from akaza.node import Node +from akaza.user_language_model import UserLanguageModel +from akaza.system_dict import SystemDict +from akaza.user_dict import load_user_dict_from_json_config +from akaza.config import MODEL_DIR MODE_KANA = 1 MODE_ALPHA = 2 @@ -37,13 +38,14 @@ for n in range(1, 10): numpad_keys.append(getattr(IBus, 'KP_0')) del n -configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-comb') +configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-akaza') pathlib.Path(os.path.join(configdir, 'user-dict')).mkdir(parents=True, exist_ok=True) logging.info(f"Loading user dictionary: {configdir}") user_language_model = UserLanguageModel(os.path.join(configdir, 'user-dict')) logging.info("Loaded user dictionary") -system_dict = SystemDict.create() +system_dict_path = os.path.join(DICTIONARY_DIR, 'system_dict.trie') +system_dict = SystemDict(system_dict_path) logging.info("Loaded system dictionary") try: @@ -60,7 +62,10 @@ try: else: logging.info(f"'{user_dict_conf_path}' does not exist.") - comb = Comb(user_language_model, system_dict, user_dict) + system_language_model_path = f"{MODEL_DIR}/system_language_model.trie" + system_language_model = SystemLanguageModel.create(system_language_model_path) + + akaza = Comb(user_language_model, system_dict, user_dict, system_language_model) logging.info("Finished Comb.") except: logging.error("Cannot initialize.", exc_info=True) @@ -71,27 +76,27 @@ except: # the engine # ---------------------------------------------------------------------- -class CombIBusEngine(IBus.Engine): +class AkazaIBusEngine(IBus.Engine): user_language_model: UserLanguageModel current_clause: int node_selected: Dict[int, int] clauses: List[List[Node]] prop_list: IBus.PropList - comb: Comb + akaza: Comb mode: int force_selected_clause: List[slice] - __gtype_name__ = 'CombIBusEngine' + __gtype_name__ = 'kaza' def __init__(self): - super(CombIBusEngine, self).__init__() + super(AkazaIBusEngine, self).__init__() self.is_invalidate = False # 未確定文字列。 self.preedit_string = '' # 候補文字列 self.lookup_table = IBus.LookupTable.new(page_size=10, cursor_pos=0, cursor_visible=True, round=True) self.prop_list = IBus.PropList() - self.comb = comb + self.akaza = akaza self.user_language_model = user_language_model self.user_dict = user_dict self.logger = logging.getLogger(__name__) @@ -168,7 +173,7 @@ class CombIBusEngine(IBus.Engine): self.commit_candidate() else: # 無変換状態では、ひらがなに変換してコミットします。 - self.commit_string(combromkan.to_hiragana(self.preedit_string)) + self.commit_string(akazaromkan.to_hiragana(self.preedit_string)) return True elif keyval == IBus.Escape: self.preedit_string = '' @@ -289,7 +294,7 @@ class CombIBusEngine(IBus.Engine): self.logger.info("Convert to full katakana") # カタカナ候補のみを表示するようにする。 - hira = combromkan.to_hiragana(self.preedit_string) + hira = akazaromkan.to_hiragana(self.preedit_string) kata = jaconv.hira2kata(hira) self.convert_to_single(hira, kata) @@ -298,14 +303,14 @@ class CombIBusEngine(IBus.Engine): self.logger.info("Convert to full hiragana") # カタカナ候補のみを表示するようにする。 - hira = combromkan.to_hiragana(self.preedit_string) + hira = akazaromkan.to_hiragana(self.preedit_string) self.convert_to_single(hira, hira) def convert_to_half_katakana(self): self.logger.info("Convert to half katakana") # 半角カタカナ候補のみを表示するようにする。 - hira = combromkan.to_hiragana(self.preedit_string) + hira = akazaromkan.to_hiragana(self.preedit_string) kata = jaconv.hira2kata(hira) kata = jaconv.z2h(kata) @@ -315,7 +320,7 @@ class CombIBusEngine(IBus.Engine): self.logger.info("Convert to half romaji") # 半角カタカナ候補のみを表示するようにする。 - hira = combromkan.to_hiragana(self.preedit_string) + hira = akazaromkan.to_hiragana(self.preedit_string) romaji = jaconv.z2h(self.preedit_string) self.convert_to_single(hira, romaji) @@ -323,7 +328,7 @@ class CombIBusEngine(IBus.Engine): def convert_to_full_romaji(self): self.logger.info("Convert to full romaji") - hira = combromkan.to_hiragana(self.preedit_string) + hira = akazaromkan.to_hiragana(self.preedit_string) romaji = jaconv.h2z(self.preedit_string, kana=True, digit=True, ascii=True) self.convert_to_single(hira, romaji) @@ -530,7 +535,7 @@ class CombIBusEngine(IBus.Engine): def _update_candidates(self): if len(self.preedit_string) > 0: # 変換をかける - self.clauses = self.comb.convert(self.preedit_string, self.force_selected_clause) + self.clauses = self.akaza.convert(self.preedit_string, self.force_selected_clause) else: self.clauses = [] self.create_lookup_table() @@ -588,7 +593,7 @@ class CombIBusEngine(IBus.Engine): return # 平仮名にする。 - text = combromkan.to_hiragana(self.preedit_string) + text = akazaromkan.to_hiragana(self.preedit_string) self.clauses = [ [Node(word=text, yomi=text, start_pos=3)] ] diff --git a/tests/test_ui.py b/ibus-akaza/test_ui.py similarity index 92% rename from tests/test_ui.py rename to ibus-akaza/test_ui.py index 43d4a0f..3dcb69c 100644 --- a/tests/test_ui.py +++ b/ibus-akaza/test_ui.py @@ -1,8 +1,13 @@ -from comb.ui import CombIBusEngine +import os + +os.environ['AKAZA_DICTIONARY_DIR'] = 'model/' +os.environ['AKAZA_MODEL_DIR'] = 'model/' + +from akaza.ui import AkazaIBusEngine def test_extend_clause_right(): - ui = CombIBusEngine() + ui = AkazaIBusEngine() ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず ui.update_candidates() @@ -32,7 +37,7 @@ def test_extend_clause_right(): def test_extend_clause_right_most_right(): - ui = CombIBusEngine() + ui = AkazaIBusEngine() ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず ui.update_candidates() @@ -56,7 +61,7 @@ def test_extend_clause_right_most_right(): def test_extend_clause_left(): - ui = CombIBusEngine() + ui = AkazaIBusEngine() ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず ui.update_candidates() @@ -89,7 +94,7 @@ def test_extend_clause_left(): def test_extend_clause_left_most_left(): - ui = CombIBusEngine() + ui = AkazaIBusEngine() ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず ui.update_candidates() diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 96735eb..0000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -log_format = %(asctime)s %(levelname)s %(message)s -log_date_format = %Y-%m-%d %H:%M:%S diff --git a/scratch/marisa-trie-latice.py b/scratch/marisa-trie-latice.py deleted file mode 100644 index 13354a3..0000000 --- a/scratch/marisa-trie-latice.py +++ /dev/null @@ -1,29 +0,0 @@ -from comb.engine import parse_skkdict -import marisa_trie - -dictionary = parse_skkdict('/usr/share/skk/SKK-JISYO.L', encoding='euc-jp') - -print("START") - -t = [] -for k, v in dictionary.items(): - vvv = '/'.join(v).encode('utf-8') - t.append((k, vvv)) - -trie = marisa_trie.BytesTrie(t) - -print("LOADED") - - -def gen_latice(s): - for n in range(len(s) - 1): - print(n) - word = s[0:n] - print(word) - - -src = 'ひつようなことは' -for prefix in reversed(trie.prefixes(src)): - kanjis = trie[prefix][0].decode('utf-8').split('/') - for kanji in kanjis: - print(kanji + src[len(prefix):]) diff --git a/scratch/pygtrie-latice.py b/scratch/pygtrie-latice.py deleted file mode 100644 index 5b43607..0000000 --- a/scratch/pygtrie-latice.py +++ /dev/null @@ -1,28 +0,0 @@ -from comb.engine import parse_skkdict -import pygtrie - -dictionary = parse_skkdict('/usr/share/skk/SKK-JISYO.L', encoding='euc-jp') - -t = pygtrie.CharTrie() - -for k, v in dictionary.items(): - vvv = '/'.join(v).encode('utf-8') - t[k] = v - -print("LOADED") - - -def gen_latice(s): - for n in range(len(s) - 1): - print(n) - word = s[0:n] - print(word) - - -src = 'じゅうかきんぜい' -# print(t.get('じゅうか')) -for s in t.prefixes('たんげつ'): - print(s) - -# f = src[0] -# print(gen_latice(src)) diff --git a/skkdictutils/.gitignore b/skkdictutils/.gitignore new file mode 100644 index 0000000..c73499d --- /dev/null +++ b/skkdictutils/.gitignore @@ -0,0 +1,4 @@ +/.pytest_cache +/dist/ +*.egg-info +build/ \ No newline at end of file diff --git a/skkdictutils/README.md b/skkdictutils/README.md new file mode 100644 index 0000000..dbf546e --- /dev/null +++ b/skkdictutils/README.md @@ -0,0 +1,33 @@ +# skkdictutils + +## Current status + +**This library is under development.** + +API is unstable. + +## Release process + +See https://packaging.python.org/tutorials/packaging-projects/ + +## LICENSE + +Copyright (c) 2020 Tokuhiro Matsuno + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skkdictutils/setup.py b/skkdictutils/setup.py new file mode 100644 index 0000000..5dcfcc2 --- /dev/null +++ b/skkdictutils/setup.py @@ -0,0 +1,18 @@ +import setuptools + +setuptools.setup( + name="skkdictutils", + version="0.0.2", + install_requires=['romkan==0.2.1'], + extras_require={ + }, + entry_points={ + }, + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/comb/skkdict.py b/skkdictutils/skkdictutils/__init__.py similarity index 85% rename from comb/skkdict.py rename to skkdictutils/skkdictutils/__init__.py index 7e260a0..b502318 100644 --- a/comb/skkdict.py +++ b/skkdictutils/skkdictutils/__init__.py @@ -1,11 +1,13 @@ import re from typing import Dict, List -from comb import combromkan +import romkan -BOIN = ['a', 'i', 'u', 'e', 'o'] +__all__ = ['parse_skkdict', 'merge_skkdict', 'ari2nasi', 'write_skkdict', 'expand_okuri'] -LOWER_PATTERN = re.compile('[a-z]') +_BOIN = ['a', 'i', 'u', 'e', 'o'] + +_LOWER_PATTERN = re.compile('[a-z]') def parse_skkdict(path: str, encoding: str = 'euc-jp'): @@ -64,13 +66,13 @@ def merge_skkdict(dicts: List[Dict[str, List[str]]]) -> Dict[str, List[str]]: def expand_okuri(kana: str, kanjis: List[str]): if kana[-1].isalpha(): - if kana[-1] in BOIN: - okuri = combromkan.to_hiragana(kana[-1]) + if kana[-1] in _BOIN: + okuri = romkan.to_hiragana(kana[-1]) yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis] else: - for b in BOIN: - okuri = combromkan.to_hiragana(kana[-1] + b) - if LOWER_PATTERN.match(okuri): + for b in _BOIN: + okuri = romkan.to_hiragana(kana[-1] + b) + if _LOWER_PATTERN.match(okuri): # wu のように、変換できないものは無視する。 continue yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis] diff --git a/tests/test_skkdict.py b/skkdictutils/test_skkdict.py similarity index 91% rename from tests/test_skkdict.py rename to skkdictutils/test_skkdict.py index 5b15679..902ff54 100644 --- a/tests/test_skkdict.py +++ b/skkdictutils/test_skkdict.py @@ -1,4 +1,4 @@ -from comb.skkdict import merge_skkdict, expand_okuri +from skkdictutils import merge_skkdict, expand_okuri def test_merge_skkdict(): diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000