This commit is contained in:
Tokuhiro Matsuno
2020-09-14 18:36:22 +09:00
parent 1c061b367d
commit 3ee8b9574f
64 changed files with 408 additions and 285 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ __pycache__
/comb.xml
/hello.*
/test_graph2.py
/akaza.xml

3
.gitmodules vendored
View File

@ -1,3 +1,6 @@
[submodule "model/skk-dev-dict"]
path = model/skk-dev-dict
url = https://github.com/skk-dev/dict.git
[submodule "akaza-data/skk-dev-dict"]
path = akaza-data/skk-dev-dict
url = https://github.com/skk-dev/dict.git

View File

@ -1,3 +1,12 @@
# 2020-09-14(Mon)
* comb を akaza に改名した。
* ibus 関連部分とそれ以外を分離する。以下のようなモジュール構成を目指す。
* ibus-akaza: ibus 連動部分。
* akaza-core: 変換コアエンジン。ibus 関連部分と独立させることにより、fcitx との連動を可能にすることを目指す
* skkdictutils: SKK 辞書関連ユーティリティライブラリ。単独利用可能なようにパッケージングし、単独レポジトリに独立させる予定。
* akaza-data : システム辞書/システム言語モデル
# 2020-09-13(Sun)
* Rust で書き直そうかなぁ。。
@ -14,7 +23,7 @@
* kytea から得られた結果をもとに、平仮名を連結して ngram を作成したが、この結果は惨憺たるものであった。
* DONE: 文節を伸ばす機能が死んでいる。
* ユーザー辞書を設定できるようにしたい。
* ~/.config/ibus-comb/user-dict.json のなかに設定をいれる。
* ~/.config/ibus-akaza/user-dict.json のなかに設定をいれる。
* `path/to/dict.txt;format=skk;charset=euc-jp` みたいなフォーマットでいいかなぁ。。JSONでもいいかな。。
* 思ったより、簡単に実装できそう。

107
Makefile
View File

@ -8,79 +8,68 @@ DESTDIR ?=
PYTHON ?= /usr/bin/python3
all: comb.xml comb/config.py comb model/jawiki.1gram
all: akaza.xml akaza/config.py akaza akaza-data/system_language_model.trie akaza-data/system_dict.trie
check:
python -m py_compile ibus.py
python -m py_compile comb/combromkan.py
python -m py_compile comb/engine.py
python -m py_compile comb/skkdict.py
python -m py_compile akaza/akazaromkan.py
python -m py_compile akaza/engine.py
python -m py_compile akaza/skkdict.py
pytest
comb.xml: comb.xml.in
sed -e "s:@PYTHON@:$(PYTHON):g;" \
-e "s:@DATADIR@:$(DATADIR):g" $< > $@
comb/config.py: comb/config.py.in
sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \
-e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/model:g" \
-e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-comb/dictionary:g" \
$< > $@
model/system_language_model.trie: model/bin/create-system_language_model-from-json.py
akaza-data/system_language_model.trie: akaza-data/bin/create-system_language_model-from-json.py
make -C model system_language_model.trie
model/system_dict.trie:
akaza-data/system_dict.trie:
make -C model system_dict.trie
install-dict: model/system_dict.trie
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
install -p -m 0644 model/system_dict.trie $(DESTDIR)$(DATADIR)/ibus-comb/dictionary/
install-data: model/system_dict.trie
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary
install -p -m 0644 model/system_dict.trie $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary/
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-akaza/model/
install: all comb/config.py model/jawiki.1gram install-dict
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-comb/comb $(DESTDIR)$(SYSCONFDIR)/xdg/comb $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-comb/model $(DESTDIR)$(DATADIR)/ibus-comb/dictionary
install -m 0644 model/system_language_model.trie $(DESTDIR)$(DATADIR)/ibus-comb/model/
install: all akaza/config.py model/system_dict.trie install-data
install -m 0755 -d $(DESTDIR)$(DATADIR)/ibus-akaza/akaza $(DESTDIR)$(SYSCONFDIR)/xdg/akaza $(DESTDIR)$(DATADIR)/ibus/component $(DESTDIR)$(DATADIR)/ibus-akaza/model $(DESTDIR)$(DATADIR)/ibus-akaza/dictionary
install -m 0644 comb.svg $(DESTDIR)$(DATADIR)/ibus-comb
install -m 0644 comb/__init__.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/graph.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/node.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/config.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/skkdict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/combromkan.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-comb
install -m 0644 comb/engine.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/ui.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/user_language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/system_language_model.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/system_dict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb/user_dict.py $(DESTDIR)$(DATADIR)/ibus-comb/comb/
install -m 0644 comb.xml $(DESTDIR)$(DATADIR)/ibus/component
install -m 0644 akaza.svg $(DESTDIR)$(DATADIR)/ibus-akaza
install -m 0644 ibus.py $(DESTDIR)$(DATADIR)/ibus-akaza
install -m 0644 akaza.xml $(DESTDIR)$(DATADIR)/ibus/component
install -m 0644 akaza/__init__.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/graph.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/node.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/config.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/skkdict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/akazaromkan.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/engine.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/ui.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/user_language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/system_language_model.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/system_dict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
install -m 0644 akaza/user_dict.py $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/
uninstall:
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb.svg
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/config.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/engine.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/skkdict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/combromkan.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/graph.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/language_model.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/node.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/ui.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_language_model.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_language_model.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/user_dict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/comb/system_dict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/ibus.py
rm -f $(DESTDIR)$(DATADIR)/ibus-comb/model/system_language_model.trie
rmdir $(DESTDIR)$(DATADIR)/ibus-comb
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/comb
rm -f $(DESTDIR)$(DATADIR)/ibus/component/comb.xml
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza
rm -f $(DESTDIR)$(DATADIR)/ibus/component/akaza.xml
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/engine.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/skkdict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/akazaromkan.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/graph.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/language_model.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/node.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/ui.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/user_language_model.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/system_language_model.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/user_dict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/akaza/system_dict.py
rm -f $(DESTDIR)$(DATADIR)/ibus-akaza/model/system_language_model.trie
rmdir $(DESTDIR)$(SYSCONFDIR)/xdg/akaza
clean:
rm -f comb.xml
rm -f comb/config.py
rm -f akaza.xml
rm -f akaza/config.py
.PHONY: all check install uninstall

View File

@ -12,7 +12,7 @@
* LOUDS はサイズが小さくなるが、動的な追加削除はできない。
* 検索速度は Double Array のほうが速い
* comb には何に trie を作成しているのか?
* akaza には何に trie を作成しているのか?
* ユーザー辞書とシステム辞書とシステム言語モデルに利用している。
* ただし、システム言語モデルは、純粋なキーからの検索しかしていない。

View File

@ -1,4 +1,4 @@
# ibus-comb
# ibus-akaza
Yet another kana-kanji-converter on IBus, written in Python.

View File

@ -7,7 +7,6 @@
## Priority mid
- support 3gram(必要?)
- 2gram のデータがデカすぎる。libkkc と同等の圧縮をすべき
- ユーザー言語モデル学習機 from text file or web.
- クローラーをかく?
- ユーザー辞書機能を実装する
@ -38,6 +37,7 @@
- 共起的なスコアをいれたい?
- 青空文庫をコーパスとして使う?
- 古くさすぎるかも
- 言語モデルを小さくできないか?
# DONE
@ -45,7 +45,7 @@
- カタカナ語辞書の作成
- 連文節変換用の UI を実装する
- Function key とかのショートカットで、全部カタカナにすることができるように。
- ibus-comb がバグってた時に便利。
- ibus-akaza がバグってた時に便利。
- 末尾のアルファベット一文字は、変換しない。
- 前向きDP後ろ向きA* で候補を得る
- 平仮名語辞書もいるのかもしれない。

20
akaza-core/README.md Normal file
View File

@ -0,0 +1,20 @@
# Akaza
## What's this?
Yet another kana-kanji conversion system written in Python 3.
## How do I use it?
### Use as a library
system_language_model = SystemLanguageModel.create('path/to/system_language_model.trie')
system_dict = SystemDict('path/to/system_dict.trie')
akaza = Akaza(
    system_language_model=system_language_model,
    system_dict=system_dict,
    user_language_model=user_language_model,
    user_dict=user_dict,
)
print(akaza.convert('watasinonamaehanakanodesu.'))
# → 私の名前は中野です。

View File

@ -0,0 +1,9 @@
Metadata-Version: 2.1
Name: akaza
Version: 0.0.1
Summary: UNKNOWN
Home-page: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
Provides-Extra: develop

View File

@ -0,0 +1,6 @@
setup.py
akaza.egg-info/PKG-INFO
akaza.egg-info/SOURCES.txt
akaza.egg-info/dependency_links.txt
akaza.egg-info/requires.txt
akaza.egg-info/top_level.txt

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,6 @@
marisa-trie==0.7.5
jaconv==0.2.4
[develop]
dev-packageA
dev-packageB

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,3 @@
from . import akaza
Akaza = akaza.Akaza

View File

@ -6,26 +6,29 @@ from typing import List, Any, Optional
import jaconv
from comb import combromkan
from comb.graph import graph_construct, viterbi, lookup
from comb.language_model import LanguageModel
from comb.node import Node
from comb.system_dict import SystemDict
from comb.system_language_model import SystemLanguageModel
from comb.user_dict import UserDict
from comb.user_language_model import UserLanguageModel
from akaza import akazaromkan
from akaza.graph import graph_construct, viterbi, lookup
from akaza.language_model import LanguageModel
from akaza.node import Node
from akaza.system_dict import SystemDict
from akaza.system_language_model import SystemLanguageModel
from akaza.user_dict import UserDict
from akaza.user_language_model import UserLanguageModel
# 子音だが、N は NN だと「ん」になるので処理しない。
TRAILING_CONSONANT_PATTERN = re.compile(r'^(.*?)([qwrtypsdfghjklzxcvbm]+)$')
class Comb:
class Akaza:
user_dict: Optional[UserDict]
logger: Logger
dictionaries: List[Any]
def __init__(self, user_language_model: UserLanguageModel, system_dict: SystemDict,
user_dict: Optional[UserDict],
def __init__(self,
system_language_model: SystemLanguageModel,
system_dict: SystemDict,
user_language_model: UserLanguageModel,
user_dict: Optional[UserDict] = None,
logger: Logger = logging.getLogger(__name__)):
assert user_language_model
self.logger = logger
@ -34,8 +37,6 @@ class Comb:
self.system_dict = system_dict
self.user_dict = user_dict
system_language_model = SystemLanguageModel.create()
self.language_model = LanguageModel(system_language_model, user_language_model)
# 連文節変換するバージョン。
@ -52,7 +53,7 @@ class Comb:
)
]]
hiragana: str = combromkan.to_hiragana(src)
hiragana: str = akazaromkan.to_hiragana(src)
# 末尾の子音を変換対象外とする。
m = TRAILING_CONSONANT_PATTERN.match(hiragana)

View File

@ -5,11 +5,11 @@ from typing import Dict, List, Optional
import jaconv
from comb.language_model import LanguageModel
from comb.node import Node
from comb.system_dict import SystemDict
from comb.user_dict import UserDict
from comb.user_language_model import UserLanguageModel
from akaza.language_model import LanguageModel
from akaza.node import Node
from akaza.system_dict import SystemDict
from akaza.user_dict import UserDict
from akaza.user_language_model import UserLanguageModel
class Graph:

View File

@ -4,9 +4,9 @@ import math
import marisa_trie
from comb.node import Node
from comb.system_language_model import SystemLanguageModel
from comb.user_language_model import UserLanguageModel
from akaza.node import Node
from akaza.system_language_model import SystemLanguageModel
from akaza.user_language_model import UserLanguageModel

View File

@ -1,11 +1,8 @@
import logging
import os
import marisa_trie
from marisa_trie import BytesTrie
from comb.config import DICTIONARY_DIR
class SystemDict:
_trie: BytesTrie
@ -18,11 +15,6 @@ class SystemDict:
trie.mmap(path)
self._trie = trie
@staticmethod
def create():
path = os.path.join(DICTIONARY_DIR, 'system_dict.trie')
return SystemDict(path)
def prefixes(self, key):
return self._trie.prefixes(key)

View File

@ -0,0 +1,29 @@
import math
import marisa_trie
from akaza.node import Node
DEFAULT_SCORE = [(math.log10(0.00000000001),)]
class SystemLanguageModel:
    """Cost lookup over a marisa ``RecordTrie`` of n-gram scores.

    Unigram entries are looked up by the bare key; bigram entries are looked
    up by the two keys joined with a tab character. When a key is missing,
    ``default_score`` is returned in its place.
    """

    def __init__(self, score: marisa_trie.RecordTrie, default_score=None):
        """Wrap an already loaded trie.

        :param score: trie whose records hold the cost as their first field.
        :param default_score: record list used for unknown keys; falls back
            to the module-level ``DEFAULT_SCORE`` when ``None``.
        """
        if default_score is None:
            default_score = DEFAULT_SCORE
        self.default_score = default_score
        self.score = score

    @staticmethod
    def create(path: str, default_score=None):
        """Memory-map the trie stored at ``path`` and wrap it in a model.

        :param path: location of the trie file (record format ``'@f'``).
        :param default_score: see ``__init__``.
        :return: a new ``SystemLanguageModel``.
        """
        trie = marisa_trie.RecordTrie('@f')
        trie.mmap(path)
        fallback = default_score if default_score is not None else DEFAULT_SCORE
        return SystemLanguageModel(score=trie, default_score=fallback)

    def get_unigram_cost(self, key: str) -> float:
        """Return the cost stored for ``key``, or the default cost."""
        records = self.score.get(key, self.default_score)
        # Only the first field of the first record is used.
        return records[0][0]

    def get_bigram_cost(self, key1: str, key2: str) -> float:
        """Return the cost stored for the pair ``key1``/``key2``, or the default."""
        bigram_key = "\t".join((key1, key2))
        records = self.score.get(bigram_key, self.default_score)
        return records[0][0]

View File

@ -3,7 +3,7 @@ from typing import List, Dict
import marisa_trie
from comb.skkdict import parse_skkdict, merge_skkdict, ari2nasi
from skkdictutils import parse_skkdict, merge_skkdict, ari2nasi
class UserDict:
@ -41,7 +41,6 @@ def load_user_dict_from_json_config(path: str) -> UserDict:
t = []
for k, v in merged.items():
t.append((k, '/'.join(v).encode('utf-8')))
print(t)
trie = marisa_trie.BytesTrie(t)
return UserDict(trie)

View File

@ -5,7 +5,7 @@ from typing import List, Dict, Optional
from atomicwrites import atomic_write
from comb.node import Node
from akaza.node import Node
# ユーザーの言語モデル。

View File

@ -1,2 +1,3 @@
marisa-trie==0.7.5
jaconv==0.2.4
skkdictutils==0.0.2

12
akaza-core/setup.py Normal file
View File

@ -0,0 +1,12 @@
from setuptools import find_packages, setup

# Packaging metadata for the akaza conversion core.
setup(
    name="akaza",
    version="0.0.1",
    # Declare the packages explicitly so the akaza modules are actually
    # included in the built distribution; test code is not shipped.
    packages=find_packages(exclude=["tests", "tests.*"]),
    # Every third-party module imported by the akaza package must be listed
    # here, otherwise a plain `pip install` yields an unimportable package.
    install_requires=[
        "marisa-trie==0.7.5",
        "jaconv==0.2.4",
        "skkdictutils==0.0.2",  # imported by akaza.user_dict
        "atomicwrites",  # imported by akaza.user_language_model
    ],
    extras_require={
        # Tooling needed to run the test suite.
        "develop": ["pytest"],
    },
    entry_points={},
)

View File

@ -1,6 +1,6 @@
import pytest
from comb.combromkan import to_hiragana
from akaza.akazaromkan import to_hiragana
def test_foo():

View File

@ -1,16 +1,26 @@
from tempfile import NamedTemporaryFile
import os
import pytest
from comb.engine import Comb
from comb.system_dict import SystemDict
from comb.user_language_model import UserLanguageModel
from akaza import Akaza
from akaza.system_dict import SystemDict
from akaza.user_language_model import UserLanguageModel
from akaza.system_language_model import SystemLanguageModel
tmpfile = NamedTemporaryFile(delete=False)
user_language_model = UserLanguageModel(tmpfile.name)
system_dict = SystemDict.create()
system_dict = SystemDict('../akaza-data/system_dict.trie')
comb = Comb(user_language_model=user_language_model, system_dict=system_dict, user_dict=None)
system_language_model = SystemLanguageModel.create('../akaza-data/system_language_model.trie')
akaza = Akaza(
user_language_model=user_language_model,
system_dict=system_dict,
user_dict=None,
system_language_model=system_language_model
)
@pytest.mark.parametrize('src, expected', [
@ -25,13 +35,13 @@ comb = Comb(user_language_model=user_language_model, system_dict=system_dict, us
('IME', 'IME'),
])
def test_wnn(src, expected):
clauses = comb.convert(src)
clauses = akaza.convert(src)
got = ''.join([clause[0].word for clause in clauses])
assert got == expected
def test_wnn2():
clauses = comb.convert("わたし")
clauses = akaza.convert("わたし")
hiragana_len = len([True for node in clauses[0] if node.word == 'わたし'])
for node in clauses[0]:
print(node)

View File

@ -1,30 +1,22 @@
from tempfile import TemporaryDirectory
from comb.combromkan import to_hiragana
import pytest
import marisa_trie
from comb.system_dict import SystemDict
from comb.graph import lookup, graph_construct, viterbi
from comb.language_model import LanguageModel
import logging
from tempfile import TemporaryDirectory
import os
import pytest
from comb.system_language_model import SystemLanguageModel
from comb.user_language_model import UserLanguageModel
from akaza.graph import lookup, graph_construct, viterbi
from akaza.language_model import LanguageModel
from akaza.system_dict import SystemDict
from akaza.system_language_model import SystemLanguageModel
from akaza.user_language_model import UserLanguageModel
unigram_score = marisa_trie.RecordTrie('@f')
unigram_score.load('model/jawiki.1gram')
bigram_score = marisa_trie.RecordTrie('@f')
bigram_score.load('model/jawiki.2gram')
system_language_model = SystemLanguageModel(unigram_score, bigram_score)
system_language_model = SystemLanguageModel.create('../akaza-data/system_language_model.trie')
tmpdir = TemporaryDirectory()
user_language_model = UserLanguageModel(tmpdir.name)
language_model = LanguageModel(system_language_model, user_language_model=user_language_model)
system_dict = SystemDict('model/system_dict.trie')
system_dict = SystemDict('../akaza-data/system_dict.trie')
logging.basicConfig(level=logging.DEBUG)

View File

@ -1,6 +1,6 @@
from comb.system_dict import SystemDict
from akaza.system_dict import SystemDict
system_dict = SystemDict('model/system_dict.trie')
system_dict = SystemDict('../akaza-data/system_dict.trie')
def test_system_dict():

View File

@ -1,14 +1,7 @@
from tempfile import NamedTemporaryFile, TemporaryDirectory
from tempfile import TemporaryDirectory
from comb.node import Node
from comb.user_language_model import UserLanguageModel
import marisa_trie
unigram_score = marisa_trie.RecordTrie('@f')
unigram_score.load('model/jawiki.1gram')
bigram_score = marisa_trie.RecordTrie('@f')
bigram_score.load('model/jawiki.2gram')
from akaza.node import Node
from akaza.user_language_model import UserLanguageModel
def test_read():

View File

@ -1,4 +1,4 @@
all: jawiki.1gram system_dict.trie
all: system_dict.trie system_language_model.trie
jawiki-latest-pages-articles.xml.bz2:
wget --no-verbose --no-clobber -O jawiki-latest-pages-articles.xml.bz2 https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2
@ -32,4 +32,3 @@ system_dict.trie: jawiki.vocab
python bin/make-system-dict.py
.PHONY: all

14
akaza-data/README.md Normal file
View File

@ -0,0 +1,14 @@
# akaza-data
## What's this?
System dictionary/language model package for Akaza.
## PyPI's size limit
*The default size limit on PyPI is 60MB*
* [unidic-lite](https://www.dampfkraft.com/code/distributing-large-files-with-pypi.html)
## See also

View File

@ -5,7 +5,7 @@ import marisa_trie
sys.path.append('../')
from comb.skkdict import parse_skkdict, merge_skkdict, ari2nasi
from skkdictutils import parse_skkdict, merge_skkdict, ari2nasi
# jawiki.vocab から system_dict.trie を作成する。

View File

@ -1,5 +1,6 @@
import logging
import os
import pathlib
import sys
from Mykytea import Mykytea
import re
@ -68,6 +69,9 @@ def main():
total = len(sys.argv[1:])
for ifile in sys.argv[1:]:
ofile = ifile.replace('text/', 'dat/')
pathlib.Path(ofile).parent.mkdir(parents=True, exist_ok=True)
logging.info(f"[{os.getpid()}] {ifile} -> {ofile} ({count}/{total})")
with open(ifile, 'r') as rfp, \
open(ofile, 'w') as wfp:

View File

@ -0,0 +1 @@
skkdictutils>=0.0.2

View File

@ -1,3 +0,0 @@
SYS_CONF_DIR = '@SYSCONFDIR@'
MODEL_DIR = '@MODELDIR@'
DICTIONARY_DIR = '@DICTIONARYDIR@'

View File

@ -1,29 +0,0 @@
import math
import marisa_trie
from comb.config import MODEL_DIR
from comb.node import Node
DEFAULT_SCORE = [(math.log10(0.00000000001),)]
class SystemLanguageModel:
def __init__(self, score: marisa_trie.RecordTrie):
self.score = score
@staticmethod
def create():
score = marisa_trie.RecordTrie('@f')
score.mmap(f"{MODEL_DIR}/system_language_model.trie")
return SystemLanguageModel(score)
def get_unigram_cost(self, key: str) -> float:
return self.score.get(key, DEFAULT_SCORE)[0][0]
def get_bigram_cost(self, node1: Node, node2: Node) -> float:
key1 = node1.get_key()
key2 = node2.get_key()
key = key1 + "\t" + key2
return self.score.get(key, DEFAULT_SCORE)[0][0]

1
ibus-akaza/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/akaza.xml

45
ibus-akaza/Makefile Normal file
View File

@ -0,0 +1,45 @@
# Build, install and uninstall the IBus front end of Akaza.
#
# The usual knobs can be overridden on the command line, e.g.
#   make install PREFIX=/usr/local DESTDIR=/tmp/stage
PREFIX ?= /usr
SYSCONFDIR ?= /etc
DATADIR ?= $(PREFIX)/share
DESTDIR ?=
PYTHON ?= /usr/bin/python3

# Staged installation directories. DESTDIR is a staging prefix only and must
# never leak into the contents of generated files.
pkgdatadir := $(DESTDIR)$(DATADIR)/ibus-akaza
componentdir := $(DESTDIR)$(DATADIR)/ibus/component
xdgconfdir := $(DESTDIR)$(SYSCONFDIR)/xdg/akaza

# Generated files are written through a shell redirect; remove a half-written
# target when its recipe fails so it is not mistaken for an up-to-date one.
.DELETE_ON_ERROR:

all: akaza.xml ibus_akaza/config.py

install: ibus_akaza/config.py akaza.xml
	install -m 0755 -d $(pkgdatadir)/ibus_akaza $(xdgconfdir) $(componentdir)
	install -m 0644 akaza.svg $(pkgdatadir)
	install -m 0644 ibus.py $(pkgdatadir)
	install -m 0644 ibus_akaza/ui.py $(pkgdatadir)/ibus_akaza/
	install -m 0644 ibus_akaza/config.py $(pkgdatadir)/ibus_akaza/
	install -m 0644 akaza.xml $(componentdir)

# The paths substituted here are read at run time, so they are the final
# installed locations (no DESTDIR).
ibus_akaza/config.py: ibus_akaza/config.py.in
	sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \
	    -e "s:@MODELDIR@:$(DATADIR)/ibus-akaza/model:g" \
	    -e "s:@DICTIONARYDIR@:$(DATADIR)/ibus-akaza/dictionary:g" \
	    $< > $@

akaza.xml: akaza.xml.in
	sed -e "s:@PYTHON@:$(PYTHON):g;" \
	    -e "s:@DATADIR@:$(DATADIR):g" $< > $@

# Byte-compile with the same interpreter that akaza.xml will launch.
check:
	$(PYTHON) -m py_compile ibus.py
	$(PYTHON) -m py_compile ibus_akaza/ui.py

# Removes every file installed by `install`. Directory removal is best effort
# (leading `-`): the directories may still hold files this Makefile did not
# install, such as the model/ and dictionary/ data referenced by config.py.
uninstall:
	rm -f $(pkgdatadir)/akaza.svg
	rm -f $(pkgdatadir)/ibus.py
	rm -f $(pkgdatadir)/ibus_akaza/ui.py
	rm -f $(pkgdatadir)/ibus_akaza/config.py
	rm -f $(componentdir)/akaza.xml
	-rmdir $(pkgdatadir)/ibus_akaza
	-rmdir $(pkgdatadir)
	-rmdir $(xdgconfdir)

# Remove everything this Makefile generates.
clean:
	rm -f akaza.xml
	rm -f ibus_akaza/config.py

.PHONY: all check install uninstall clean

View File

@ -1,23 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- filename: comb.xml -->
<!-- filename: akaza.xml -->
<component>
<name>org.freedesktop.IBus.Comb</name>
<description>Comb - kana kanji converter</description>
<name>org.freedesktop.IBus.Akaza</name>
<description>Akaza - kana kanji converter</description>
<version>0.0.1</version>
<license>GPL</license>
<author>Tokuhiro Matsuno &lt;tokuhirom@gmail.com&gt;</author>
<homepage>https://github.com/tokuhirom/ibus-comb</homepage>
<exec>@PYTHON@ @DATADIR@/ibus-comb/ibus.py --ibus</exec>
<textdomain>comb</textdomain>
<homepage>https://github.com/tokuhirom/ibus-akaza</homepage>
<exec>@PYTHON@ @DATADIR@/ibus-akaza/ibus.py --ibus</exec>
<textdomain>akaza</textdomain>
<engines>
<engine>
<name>comb</name>
<longname>comb</longname>
<description>Comb - Kana Kanji Converter</description>
<name>akaza</name>
<longname>akaza</longname>
<description>Akaza - Kana Kanji Converter</description>
<language>ja</language>
<license>GPL</license>
<author>Tokuhiro Matsuno &lt;tokuhirom@gmail.com&gt;</author>
<icon>@DATADIR@/ibus-comb/comb.svg</icon>
<icon>@DATADIR@/ibus-akaza/akaza.svg</icon>
<layout>us</layout>
<layout_variant></layout_variant>
<layout_option></layout_option>

View File

Before

Width:  |  Height:  |  Size: 3.4 KiB

After

Width:  |  Height:  |  Size: 3.4 KiB

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ibus-comb: ibus engine for japanese characters
# ibus-akaza: ibus engine for japanese characters
#
# Copyright (c) 2020 Tokuhiro Matsuno <tokuhirom@gmail.com>
#
@ -33,10 +33,11 @@ import getopt
import locale
import logging
logging.basicConfig(level=logging.DEBUG, filename='/tmp/ibus-comb.log', filemode='w')
logging.info("Loading ibus-comb")
# TODO: remove log file generation
logging.basicConfig(level=logging.DEBUG, filename='/tmp/ibus-akaza.log', filemode='w')
logging.info("Loading ibus-akaza")
libpath = os.path.join(os.path.dirname(__file__), "comb")
libpath = os.path.join(os.path.dirname(__file__), "akaza")
logging.info(f"library path: {libpath}")
sys.path.append(libpath)
@ -56,23 +57,23 @@ class IMApp:
logging.info("Loading IMApp")
from comb.ui import CombIBusEngine
from akaza.ui import AkazaIBusEngine
self.mainloop = GLib.MainLoop()
self.bus = IBus.Bus()
self.bus.connect("disconnected", self.bus_disconnected_cb)
self.factory = IBus.Factory.new(self.bus.get_connection())
self.factory.add_engine("comb", GObject.type_from_name("CombIBusEngine"))
self.factory.add_engine("akaza", GObject.type_from_name("AkazaIBusEngine"))
if exec_by_ibus:
self.bus.request_name("org.freedesktop.IBus.Comb", 0)
else:
xml_path = os.path.join(__base_dir__, 'comb.xml')
xml_path = os.path.join(__base_dir__, 'AkazaIBusEngine.xml')
if os.path.exists(xml_path):
component = IBus.Component.new_from_file(xml_path)
else:
xml_path = os.path.join(os.path.dirname(__base_dir__),
'ibus', 'component', 'comb.xml')
'ibus', 'component', 'akaza.xml')
component = IBus.Component.new_from_file(xml_path)
self.bus.register_component(component)

1
ibus-akaza/ibus_akaza/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/config.py

View File

@ -0,0 +1,5 @@
import os
# Directory settings for ibus-akaza.
# Each value is taken from the named environment variable when it is set;
# otherwise the literal @...@ placeholder text is used.
# NOTE(review): the placeholders are presumably replaced by the Makefile's sed
# rule when config.py is generated from this template — confirm.
SYS_CONF_DIR = os.environ.get('AKAZA_SYSCONF_DIR', '@SYSCONFDIR@')
MODEL_DIR = os.environ.get('AKAZA_MODEL_DIR', '@MODELDIR@')
DICTIONARY_DIR = os.environ.get('AKAZA_DICTIONARY_DIR', '@DICTIONARYDIR@')

View File

@ -15,12 +15,13 @@ import pathlib
from jaconv import jaconv
from comb import combromkan
from comb.engine import Comb
from comb.node import Node
from comb.user_language_model import UserLanguageModel
from comb.system_dict import SystemDict
from comb.user_dict import load_user_dict_from_json_config
from akaza import akazaromkan
from akaza.engine import Comb
from akaza.node import Node
from akaza.user_language_model import UserLanguageModel
from akaza.system_dict import SystemDict
from akaza.user_dict import load_user_dict_from_json_config
from akaza.config import MODEL_DIR
MODE_KANA = 1
MODE_ALPHA = 2
@ -37,13 +38,14 @@ for n in range(1, 10):
numpad_keys.append(getattr(IBus, 'KP_0'))
del n
configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-comb')
configdir = os.path.join(GLib.get_user_config_dir(), 'ibus-akaza')
pathlib.Path(os.path.join(configdir, 'user-dict')).mkdir(parents=True, exist_ok=True)
logging.info(f"Loading user dictionary: {configdir}")
user_language_model = UserLanguageModel(os.path.join(configdir, 'user-dict'))
logging.info("Loaded user dictionary")
system_dict = SystemDict.create()
system_dict_path = os.path.join(DICTIONARY_DIR, 'system_dict.trie')
system_dict = SystemDict(system_dict_path)
logging.info("Loaded system dictionary")
try:
@ -60,7 +62,10 @@ try:
else:
logging.info(f"'{user_dict_conf_path}' does not exist.")
comb = Comb(user_language_model, system_dict, user_dict)
system_language_model_path = f"{MODEL_DIR}/system_language_model.trie"
system_language_model = SystemLanguageModel.create(system_language_model_path)
akaza = Comb(user_language_model, system_dict, user_dict, system_language_model)
logging.info("Finished Comb.")
except:
logging.error("Cannot initialize.", exc_info=True)
@ -71,27 +76,27 @@ except:
# the engine
# ----------------------------------------------------------------------
class CombIBusEngine(IBus.Engine):
class AkazaIBusEngine(IBus.Engine):
user_language_model: UserLanguageModel
current_clause: int
node_selected: Dict[int, int]
clauses: List[List[Node]]
prop_list: IBus.PropList
comb: Comb
akaza: Comb
mode: int
force_selected_clause: List[slice]
__gtype_name__ = 'CombIBusEngine'
__gtype_name__ = 'kaza'
def __init__(self):
super(CombIBusEngine, self).__init__()
super(AkazaIBusEngine, self).__init__()
self.is_invalidate = False
# 未確定文字列。
self.preedit_string = ''
# 候補文字列
self.lookup_table = IBus.LookupTable.new(page_size=10, cursor_pos=0, cursor_visible=True, round=True)
self.prop_list = IBus.PropList()
self.comb = comb
self.akaza = akaza
self.user_language_model = user_language_model
self.user_dict = user_dict
self.logger = logging.getLogger(__name__)
@ -168,7 +173,7 @@ class CombIBusEngine(IBus.Engine):
self.commit_candidate()
else:
# 無変換状態では、ひらがなに変換してコミットします。
self.commit_string(combromkan.to_hiragana(self.preedit_string))
self.commit_string(akazaromkan.to_hiragana(self.preedit_string))
return True
elif keyval == IBus.Escape:
self.preedit_string = ''
@ -289,7 +294,7 @@ class CombIBusEngine(IBus.Engine):
self.logger.info("Convert to full katakana")
# カタカナ候補のみを表示するようにする。
hira = combromkan.to_hiragana(self.preedit_string)
hira = akazaromkan.to_hiragana(self.preedit_string)
kata = jaconv.hira2kata(hira)
self.convert_to_single(hira, kata)
@ -298,14 +303,14 @@ class CombIBusEngine(IBus.Engine):
self.logger.info("Convert to full hiragana")
# カタカナ候補のみを表示するようにする。
hira = combromkan.to_hiragana(self.preedit_string)
hira = akazaromkan.to_hiragana(self.preedit_string)
self.convert_to_single(hira, hira)
def convert_to_half_katakana(self):
self.logger.info("Convert to half katakana")
# 半角カタカナ候補のみを表示するようにする。
hira = combromkan.to_hiragana(self.preedit_string)
hira = akazaromkan.to_hiragana(self.preedit_string)
kata = jaconv.hira2kata(hira)
kata = jaconv.z2h(kata)
@ -315,7 +320,7 @@ class CombIBusEngine(IBus.Engine):
self.logger.info("Convert to half romaji")
# 半角カタカナ候補のみを表示するようにする。
hira = combromkan.to_hiragana(self.preedit_string)
hira = akazaromkan.to_hiragana(self.preedit_string)
romaji = jaconv.z2h(self.preedit_string)
self.convert_to_single(hira, romaji)
@ -323,7 +328,7 @@ class CombIBusEngine(IBus.Engine):
def convert_to_full_romaji(self):
self.logger.info("Convert to full romaji")
hira = combromkan.to_hiragana(self.preedit_string)
hira = akazaromkan.to_hiragana(self.preedit_string)
romaji = jaconv.h2z(self.preedit_string, kana=True, digit=True, ascii=True)
self.convert_to_single(hira, romaji)
@ -530,7 +535,7 @@ class CombIBusEngine(IBus.Engine):
def _update_candidates(self):
if len(self.preedit_string) > 0:
# 変換をかける
self.clauses = self.comb.convert(self.preedit_string, self.force_selected_clause)
self.clauses = self.akaza.convert(self.preedit_string, self.force_selected_clause)
else:
self.clauses = []
self.create_lookup_table()
@ -588,7 +593,7 @@ class CombIBusEngine(IBus.Engine):
return
# 平仮名にする。
text = combromkan.to_hiragana(self.preedit_string)
text = akazaromkan.to_hiragana(self.preedit_string)
self.clauses = [
[Node(word=text, yomi=text, start_pos=3)]
]

View File

@ -1,8 +1,13 @@
from comb.ui import CombIBusEngine
import os
os.environ['AKAZA_DICTIONARY_DIR'] = 'model/'
os.environ['AKAZA_MODEL_DIR'] = 'model/'
from akaza.ui import AkazaIBusEngine
def test_extend_clause_right():
ui = CombIBusEngine()
ui = AkazaIBusEngine()
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
ui.update_candidates()
@ -32,7 +37,7 @@ def test_extend_clause_right():
def test_extend_clause_right_most_right():
ui = CombIBusEngine()
ui = AkazaIBusEngine()
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
ui.update_candidates()
@ -56,7 +61,7 @@ def test_extend_clause_right_most_right():
def test_extend_clause_left():
ui = CombIBusEngine()
ui = AkazaIBusEngine()
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
ui.update_candidates()
@ -89,7 +94,7 @@ def test_extend_clause_left():
def test_extend_clause_left_most_left():
ui = CombIBusEngine()
ui = AkazaIBusEngine()
ui.preedit_string = "tanosiijikan" # 楽し/い/時間 になるはず
ui.update_candidates()

View File

@ -1,3 +0,0 @@
[pytest]
log_format = %(asctime)s %(levelname)s %(message)s
log_date_format = %Y-%m-%d %H:%M:%S

View File

@ -1,29 +0,0 @@
from comb.engine import parse_skkdict
import marisa_trie
dictionary = parse_skkdict('/usr/share/skk/SKK-JISYO.L', encoding='euc-jp')
print("START")
t = []
for k, v in dictionary.items():
vvv = '/'.join(v).encode('utf-8')
t.append((k, vvv))
trie = marisa_trie.BytesTrie(t)
print("LOADED")
def gen_latice(s):
for n in range(len(s) - 1):
print(n)
word = s[0:n]
print(word)
src = 'ひつようなことは'
for prefix in reversed(trie.prefixes(src)):
kanjis = trie[prefix][0].decode('utf-8').split('/')
for kanji in kanjis:
print(kanji + src[len(prefix):])

View File

@ -1,28 +0,0 @@
from comb.engine import parse_skkdict
import pygtrie
dictionary = parse_skkdict('/usr/share/skk/SKK-JISYO.L', encoding='euc-jp')
t = pygtrie.CharTrie()
for k, v in dictionary.items():
vvv = '/'.join(v).encode('utf-8')
t[k] = v
print("LOADED")
def gen_latice(s):
for n in range(len(s) - 1):
print(n)
word = s[0:n]
print(word)
src = 'じゅうかきんぜい'
# print(t.get('じゅうか'))
for s in t.prefixes('たんげつ'):
print(s)
# f = src[0]
# print(gen_latice(src))

4
skkdictutils/.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
/.pytest_cache
/dist/
*.egg-info
build/

33
skkdictutils/README.md Normal file
View File

@ -0,0 +1,33 @@
# skkdictutils
## Current status
**This library is under development.**
API is unstable.
## Release process
See https://packaging.python.org/tutorials/packaging-projects/
## LICENSE
Copyright (c) 2020 Tokuhiro Matsuno
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

18
skkdictutils/setup.py Normal file
View File

@ -0,0 +1,18 @@
import setuptools
# Packaging metadata for the standalone skkdictutils distribution.
setuptools.setup(
name="skkdictutils",
version="0.0.2",
# romkan is the only declared runtime dependency.
install_requires=['romkan==0.2.1'],
# No optional dependency groups are defined.
extras_require={
},
# No console scripts or plugin entry points are defined.
entry_points={
},
# Include every package that setuptools discovers next to this file.
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
)

View File

@ -1,11 +1,13 @@
import re
from typing import Dict, List
from comb import combromkan
import romkan
BOIN = ['a', 'i', 'u', 'e', 'o']
__all__ = ['parse_skkdict', 'merge_skkdict', 'ari2nasi', 'write_skkdict', 'expand_okuri']
LOWER_PATTERN = re.compile('[a-z]')
_BOIN = ['a', 'i', 'u', 'e', 'o']
_LOWER_PATTERN = re.compile('[a-z]')
def parse_skkdict(path: str, encoding: str = 'euc-jp'):
@ -64,13 +66,13 @@ def merge_skkdict(dicts: List[Dict[str, List[str]]]) -> Dict[str, List[str]]:
def expand_okuri(kana: str, kanjis: List[str]):
if kana[-1].isalpha():
if kana[-1] in BOIN:
okuri = combromkan.to_hiragana(kana[-1])
if kana[-1] in _BOIN:
okuri = romkan.to_hiragana(kana[-1])
yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis]
else:
for b in BOIN:
okuri = combromkan.to_hiragana(kana[-1] + b)
if LOWER_PATTERN.match(okuri):
for b in _BOIN:
okuri = romkan.to_hiragana(kana[-1] + b)
if _LOWER_PATTERN.match(okuri):
# wu のように、変換できないものは無視する。
continue
yield kana[:-1] + okuri, [kanji + okuri for kanji in kanjis]

View File

@ -1,4 +1,4 @@
from comb.skkdict import merge_skkdict, expand_okuri
from skkdictutils import merge_skkdict, expand_okuri
def test_merge_skkdict():

View File