mirror of
https://github.com/mii443/AzooKeyKanaKanjiConverter.git
synced 2025-08-22 15:05:26 +00:00
Merge pull request #213 from azooKey/refactor/unify_no_typo_correction_case
This commit is contained in:
@ -242,36 +242,18 @@ public final class DicdataStore {
|
||||
return [louds.searchNodeIndex(chars: charIDs)].compactMap {$0}
|
||||
}
|
||||
|
||||
/// Searches the LOUDS dictionaries for every prefix of the given queries in one bulk pass.
///
/// For a query such as 「アイウ」, the prefixes 「ア」, 「アイ」 and 「アイウ」 are all looked up.
/// - Parameters:
///   - group: Queries keyed by the identifier of the dictionary to search. Each query is a pair of
///     the reading as `Character`s and the same reading converted to charIDs; only the charIDs are used here.
///   - depth: Range of prefix depths to collect, forwarded unchanged to `byfixNodeIndices(targets:depth:)`.
/// - Returns: For each dictionary identifier, the set of node indices that were found.
///   A dictionary that cannot be loaded contributes an empty set.
private func movingTowardPrefixSearch(group: [String: [([Character], [UInt8])]], depth: Range<Int>) -> [(key: String, indices: Set<Int>)] {
    group.map { entry -> (key: String, indices: Set<Int>) in
        let identifier = entry.key
        guard let louds = self.loadLOUDS(query: identifier) else {
            return (key: identifier, indices: [])
        }
        // Delegate to the bulk implementation, which handles all queries at once.
        let charIDQueries = entry.value.map { $0.1 }
        let found = louds.byfixNodeIndices(targets: charIDQueries, depth: depth)
        return (key: identifier, indices: Set(found))
    }
}
|
||||
|
||||
func movingTowardPrefixSearch(
|
||||
inputs: [ComposingText.InputElement],
|
||||
leftIndex: Int,
|
||||
rightIndexRange: Range<Int>,
|
||||
useMemory: Bool
|
||||
useMemory: Bool,
|
||||
needTypoCorrection: Bool
|
||||
) -> (
|
||||
stringToInfo: [[Character]: (endIndex: Int, penalty: PValue)],
|
||||
indices: [(key: String, indices: [Int])],
|
||||
temporaryMemoryDicdata: [DicdataElement]
|
||||
) {
|
||||
var generator = TypoCorrectionGenerator(inputs: inputs, leftIndex: leftIndex, rightIndexRange: rightIndexRange)
|
||||
var generator = TypoCorrectionGenerator(inputs: inputs, leftIndex: leftIndex, rightIndexRange: rightIndexRange, needTypoCorrection: needTypoCorrection)
|
||||
var targetLOUDS: [String: LOUDS.MovingTowardPrefixSearchHelper] = [:]
|
||||
var stringToInfo: [([Character], (endIndex: Int, penalty: PValue))] = []
|
||||
|
||||
@ -329,7 +311,6 @@ public final class DicdataStore {
|
||||
}
|
||||
}
|
||||
let minCount = stringToInfo.map {$0.0.count}.min() ?? 0
|
||||
print(#function, minCount, stringToInfo.map{$0.0})
|
||||
return (
|
||||
Dictionary(stringToInfo, uniquingKeysWith: {$0.penalty < $1.penalty ? $1 : $0}),
|
||||
targetLOUDS.map { ($0.key, $0.value.indicesInDepth(depth: minCount - 1 ..< .max) )},
|
||||
@ -381,9 +362,6 @@ public final class DicdataStore {
|
||||
/// - from: 起点
|
||||
/// - toIndexRange: `from ..< (toIndexRange)`の範囲で辞書ルックアップを行う。
|
||||
public func getLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil, needTypoCorrection: Bool = true) -> [LatticeNode] {
|
||||
if !needTypoCorrection {
|
||||
return self.getFrozenLOUDSDataInRange(inputData: inputData, from: fromIndex, toIndexRange: toIndexRange)
|
||||
}
|
||||
let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
|
||||
let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
|
||||
if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
|
||||
@ -395,7 +373,7 @@ public final class DicdataStore {
|
||||
segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
|
||||
}
|
||||
// MARK: 誤り訂正の対象を列挙する。非常に重い処理。
|
||||
var (stringToInfo, indices, dicdata) = self.movingTowardPrefixSearch(inputs: inputData.input, leftIndex: fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight, useMemory: self.learningManager.enabled)
|
||||
var (stringToInfo, indices, dicdata) = self.movingTowardPrefixSearch(inputs: inputData.input, leftIndex: fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight, useMemory: self.learningManager.enabled, needTypoCorrection: needTypoCorrection)
|
||||
// MARK: 検索によって得たindicesから辞書データを実際に取り出していく
|
||||
for (identifier, value) in indices {
|
||||
let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
|
||||
@ -452,173 +430,6 @@ public final class DicdataStore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the lattice nodes that start at `fromIndex`, without any typo correction.
///
/// Referenced from kana2lattice.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: Index of the input element at which every returned node starts.
///   - toIndexRange: Candidate end indices; dictionary lookup covers `from ..< (toIndexRange)`.
///     When `nil`, the range runs from `fromIndex` to the end of the input. The upper bound is
///     always capped at `fromIndex + maxlength`.
/// - Returns: A node for every collected dictionary entry whose ruby equals one of the enumerated
///   readings, followed by a single-character fallback node. Returns an empty array when the
///   index range is invalid.
private func getFrozenLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil) -> [LatticeNode] {
    let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
    let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
    debug(#function, fromIndex, toIndexRange?.description ?? "nil", toIndexLeft, toIndexRight)
    if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
        debug(#function, "index is wrong")
        return []
    }
    // Nodes that begin at index 0 are linked to the BOS node.
    let startsAtBeginning = fromIndex == .zero

    // Fallback node made of the single (katakana) character at `fromIndex`.
    let character = String(inputData.input[fromIndex].character.toKatakana())
    let characterNode = LatticeNode(data: DicdataElement(word: character, ruby: character, cid: CIDData.一般名詞.cid, mid: MIDData.一般.mid, value: -10), inputRange: fromIndex ..< fromIndex + 1)
    if startsAtBeginning {
        characterNode.prevs.append(.BOSNode())
    }

    // MARK: No typo correction
    let stringToEndIndex = TypoCorrection.getRangesWithoutTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
    // MARK: Enumerate the search targets
    guard let (minString, maxString) = stringToEndIndex.keys.minAndMax(by: {$0.count < $1.count}) else {
        debug(#function, "minString/maxString is nil", stringToEndIndex)
        return [characterNode]
    }
    // The dictionary to search is chosen by the first character of a reading. Previously this was
    // force-unwrapped twice and would trap on an empty reading; fall back to the character node instead.
    // NOTE(review): the head is taken from an arbitrary key while the query is `maxString`;
    // this presumably assumes all readings share the same first character — confirm.
    guard let headCharacter = stringToEndIndex.keys.first?.first else {
        debug(#function, "first key is empty", stringToEndIndex)
        return [characterNode]
    }
    let maxIDs = maxString.map(self.character2charId)
    var group: [String: [([Character], [UInt8])]] = [
        String(headCharacter): [(maxString, maxIDs)],
        "user": [(maxString, maxIDs)],
    ]
    if learningManager.enabled {
        group["memory"] = group["user"]
    }
    // MARK: Fetch the actual dictionary data for the indices found by the search
    var dicdata: [DicdataElement] = []
    let depth = minString.count - 1 ..< maxString.count
    for (identifier, indices) in self.movingTowardPrefixSearch(group: group, depth: depth) {
        dicdata.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: identifier, indices: indices))
    }
    if learningManager.enabled {
        // Also add the entries held in the temporary learning memory.
        dicdata.append(
            contentsOf: self.learningManager.movingTowardPrefixSearchOnTemporaryMemory(charIDs: consume maxIDs, depth: depth).dicdata.flatMap { $0.value }
        )
    }
    for (key, value) in stringToEndIndex {
        let convertTarget = String(key)
        dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: convertTarget, inputData: inputData, inputRange: fromIndex ..< value + 1))
        dicdata.append(contentsOf: self.getMatchDynamicUserDict(convertTarget))
    }

    // Keep only entries whose ruby is one of the enumerated readings; the reading decides the node's range.
    let nodes: [LatticeNode] = dicdata.compactMap { element in
        guard let endIndex = stringToEndIndex[Array(element.ruby)] else {
            return nil
        }
        let node = LatticeNode(data: element, inputRange: fromIndex ..< endIndex + 1)
        if startsAtBeginning {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
    return nodes + [characterNode]
}
|
||||
|
||||
/// Builds the lattice nodes covering exactly `fromIndex ... toIndex`. LOUDS version.
///
/// Referenced from kana2lattice.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: Start index (inclusive).
///   - toIndex: End index (inclusive).
///   - needTypoCorrection: When `false`, only readings with zero penalty are searched.
/// - Returns: One node per dictionary entry found, each spanning `fromIndex ..< toIndex + 1`.
///   Returns an empty array when the span is longer than `maxlength` or `fromIndex > toIndex`.
public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int, needTypoCorrection: Bool) -> [LatticeNode] {
    if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
        return []
    }
    let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()

    // TODO: room for optimization
    let string2penalty = TypoCorrection.getRangeWithTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndex: toIndex).filter {
        needTypoCorrection || $0.value == 0.0
    }

    // MARK: Fetch the actual dictionary data for the indices found by the search
    let strings = string2penalty.keys.map {
        (key: $0, charIDs: $0.map(self.character2charId))
    }
    // First character → the set of readings to search in that character's dictionary.
    let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})

    var indices: [(String, Set<Int>)] = group.map {dic in
        let head = String(dic.key)
        let set = dic.value.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: head, charIDs: charIDs)
        }
        return (head, set)
    }
    do {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: "user", charIDs: charIDs)
        }
        indices.append(("user", set))
    }
    if learningManager.enabled {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: "memory", charIDs: charIDs)
        }
        indices.append(("memory", set))
    }
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = string2penalty[rubyArray, default: .zero]
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2   // negative value
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }
    // Add the temporary learning results, applying the typo penalty.
    for (characters, charIds) in consume strings {
        for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
            // This is a perfect match, so Array(data.ruby) equals `characters`.
            let penalty = string2penalty[characters, default: .zero]
            if penalty.isZero {
                dicdata.append(data)
                // Without this `continue` the entry fell through and was appended a second
                // time below with a zero adjustment.
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2   // negative value
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }

    dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: fromIndex ..< toIndex + 1))
    for segment in string2penalty.keys {
        dicdata.append(contentsOf: self.getMatchDynamicUserDict(String(segment)))
    }

    // Nodes that begin at index 0 are linked to the BOS node.
    let startsAtBeginning = fromIndex == .zero
    return dicdata.map { element in
        let node = LatticeNode(data: element, inputRange: fromIndex ..< toIndex + 1)
        if startsAtBeginning {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
}
|
||||
|
||||
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
|
||||
do {
|
||||
let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)
|
||||
@ -895,15 +706,6 @@ public final class DicdataStore {
|
||||
return self.mmValue[former * self.midCount + latter]
|
||||
}
|
||||
|
||||
private static let possibleLOUDS: Set<Character> = [
|
||||
" ", " ̄", "‐", "―", "〜", "・", "、", "…", "‥", "。", "‘", "’", "“", "”", "〈", "〉", "《", "》", "「", "」", "『", "』", "【", "】", "〔", "〕", "‖", "*", "′", "〃", "※", "´", "¨", "゛", "゜", "←", "→", "↑", "↓", "─", "■", "□", "▲", "△", "▼", "▽", "◆", "◇", "○", "◎", "●", "★", "☆", "々", "ゝ", "ヽ", "ゞ", "ヾ", "ー", "〇", "ァ", "ア", "ィ", "イ", "ゥ", "ウ", "ヴ", "ェ", "エ", "ォ", "オ", "ヵ", "カ", "ガ", "キ", "ギ", "ク", "グ", "ヶ", "ケ", "ゲ", "コ", "ゴ", "サ", "ザ", "シ", "ジ", "〆", "ス", "ズ", "セ", "ゼ", "ソ", "ゾ", "タ", "ダ", "チ", "ヂ", "ッ", "ツ", "ヅ", "テ", "デ", "ト", "ド", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "バ", "パ", "ヒ", "ビ", "ピ", "フ", "ブ", "プ", "ヘ", "ベ", "ペ", "ホ", "ボ", "ポ", "マ", "ミ", "ム", "メ", "モ", "ヤ", "ユ", "ョ", "ヨ", "ラ", "リ", "ル", "レ", "ロ", "ヮ", "ワ", "ヰ", "ヱ", "ヲ", "ン", "仝", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "!", "?", "(", ")", "#", "%", "&", "^", "_", "'", "\""
|
||||
]
|
||||
|
||||
/// Reports whether `character` is listed in `possibleLOUDS`.
///
/// Used while building typo-correction candidates: checking up front whether a dictionary file
/// exists for the character lets the caller abandon the candidate early and skip further work.
static func existLOUDS(for character: Character) -> Bool {
    let isKnownHead = possibleLOUDS.contains(character)
    return isKnownHead
}
|
||||
|
||||
/*
|
||||
文節の切れ目とは
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
import SwiftUtils
|
||||
|
||||
struct TypoCorrectionGenerator {
|
||||
init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>) {
|
||||
struct TypoCorrectionGenerator: Sendable {
|
||||
init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) {
|
||||
self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0
|
||||
self.inputs = inputs
|
||||
self.left = left
|
||||
self.rightIndexRange = rightIndexRange
|
||||
@ -9,12 +10,12 @@ struct TypoCorrectionGenerator {
|
||||
let count = rightIndexRange.endIndex - left
|
||||
self.count = count
|
||||
self.nodes = (0..<count).map {(i: Int) in
|
||||
TypoCorrection.lengths.flatMap {(k: Int) -> [TypoCorrection.TypoCandidate] in
|
||||
Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
|
||||
let j = i + k
|
||||
if count <= j {
|
||||
return []
|
||||
}
|
||||
return TypoCorrection.getTypo(inputs[left + i ... left + j])
|
||||
return Self.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
|
||||
}
|
||||
}
|
||||
// 深さ優先で列挙する
|
||||
@ -33,11 +34,11 @@ struct TypoCorrectionGenerator {
|
||||
}
|
||||
}
|
||||
|
||||
let maxPenalty: PValue = 3.5 * 3
|
||||
let maxPenalty: PValue
|
||||
let inputs: [ComposingText.InputElement]
|
||||
let left: Int
|
||||
let rightIndexRange: Range<Int>
|
||||
let nodes: [[TypoCorrection.TypoCandidate]]
|
||||
let nodes: [[TypoCandidate]]
|
||||
let count: Int
|
||||
|
||||
var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
|
||||
@ -51,7 +52,6 @@ struct TypoCorrectionGenerator {
|
||||
case .direct:
|
||||
stablePrefix.append(contentsOf: item.string)
|
||||
case .roman2kana:
|
||||
// TODO: impl
|
||||
var stableIndex = item.string.endIndex
|
||||
for suffix in Roman2Kana.unstableSuffixes {
|
||||
if item.string.hasSuffix(suffix) {
|
||||
@ -115,9 +115,6 @@ struct TypoCorrectionGenerator {
|
||||
for element in $0.inputElements {
|
||||
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
|
||||
}
|
||||
if TypoCorrection.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
|
||||
return nil
|
||||
}
|
||||
return (
|
||||
convertTargetElements: convertTargetElements,
|
||||
lastElement: $0.inputElements.last!,
|
||||
@ -133,204 +130,27 @@ struct TypoCorrectionGenerator {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: 誤り訂正用のAPI
|
||||
enum TypoCorrection {
|
||||
/// Decides whether a candidate can be discarded before any dictionary lookup.
///
/// Only the first character of the first element (converted to katakana) is inspected.
/// - Returns: `true` when that character is neither a roman letter nor a character for which
///   `DicdataStore.existLOUDS(for:)` reports a dictionary; `false` when `components` yields no character.
fileprivate static func shouldBeRemovedForDicdataStore(components: [ComposingText.ConvertTargetElement]) -> Bool {
    guard let head = components.first?.string.first?.toKatakana() else {
        return false
    }
    // The candidate stays if its head is still a roman letter or has a dictionary of its own.
    let isSearchable = CharacterUtils.isRomanLetter(head) || DicdataStore.existLOUDS(for: head)
    return !isSearchable
}
|
||||
|
||||
/// Enumerates the readings obtainable from `inputs` without typo correction, for several end indices at once.
///
/// The end indices are treated as closed ranges: with `left=4, rightIndexRange=6..<10` the ranges
/// `4...6, 4...7, 4...8, 4...9` are evaluated. `left <= rightIndexRange.startIndex` is expected to hold.
/// - Parameters:
///   - inputs: The input elements of the composing text.
///   - left: Index of the first input element of every range.
///   - rightIndexRange: The (inclusive) end indices to evaluate.
/// - Returns: A map from reading (katakana characters) to the end index it covers. When the same
///   reading is produced more than once, the larger end index is kept.
static func getRangesWithoutTypos(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>) -> [[Character]: Int] {
    typealias Frame = (convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int)

    let count = rightIndexRange.endIndex - left
    debug(#function, left, rightIndexRange, count)
    let nodes = (0..<count).map {(i: Int) in
        Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
            let j = i + k
            if count <= j {
                return []
            }
            // `frozen: true` is passed, so no typo candidates are included.
            return Self.getTypo(inputs[left + i ... left + j], frozen: true)
        }
    }
    // Nothing to enumerate when the span is empty (previously `nodes[0]` would trap).
    guard let firstCandidates = nodes.first else {
        return [:]
    }

    // Performance tuning note: building an Array first and converting it to a Dictionary at the
    // end is faster than filling the Dictionary directly.
    var stringToInfo: [([Character], Int)] = []

    // Depth-first enumeration.
    var stack: [Frame] = firstCandidates.compactMap { typoCandidate -> Frame? in
        guard let firstElement = typoCandidate.inputElements.first,
              let lastElement = typoCandidate.inputElements.last,
              ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: left) else {
            return nil
        }
        var convertTargetElements = [ComposingText.ConvertTargetElement]()
        for element in typoCandidate.inputElements {
            ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
        }
        return (convertTargetElements: convertTargetElements, lastElement: lastElement, count: typoCandidate.inputElements.count)
    }
    while let (convertTargetElements, lastElement, consumed) = stack.popLast() {
        let endIndex = consumed + left - 1
        if rightIndexRange.contains(endIndex),
           let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: consumed + left, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
            stringToInfo.append((convertTarget, endIndex))
        }
        // Stop expanding once every input element has been consumed.
        if nodes.endIndex <= consumed {
            continue
        }
        stack.append(contentsOf: nodes[consumed].compactMap { candidate -> Frame? in
            guard let candidateLast = candidate.inputElements.last,
                  consumed + candidate.inputElements.count <= nodes.endIndex else {
                return nil
            }
            // Work on a per-candidate copy. Mutating the loop-level array here would leak one
            // candidate's elements into its siblings (getRangeWithTypos copies the same way).
            var extended = convertTargetElements
            for element in candidate.inputElements {
                ComposingText.updateConvertTargetElements(currentElements: &extended, newElement: element)
            }
            if Self.shouldBeRemovedForDicdataStore(components: extended) {
                return nil
            }
            return (
                convertTargetElements: extended,
                lastElement: candidateLast,
                count: consumed + candidate.inputElements.count
            )
        })
    }
    return Dictionary(stringToInfo, uniquingKeysWith: {$0 < $1 ? $1 : $0})
}
|
||||
|
||||
|
||||
/// Enumerates the readings obtainable from `inputs[left...right]`, including typo-corrected ones.
///
/// - Parameters:
///   - inputs: The input elements of the composing text.
///   - left: Index of the first input element (inclusive).
///   - right: Index of the last input element (inclusive).
/// - Returns: A map from reading (katakana characters) to its accumulated penalty. When the same
///   reading is produced more than once, the larger penalty is kept.
static func getRangeWithTypos(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndex right: Int) -> [[Character]: PValue] {
    typealias Frame = (convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)

    // List the candidates that start at each offset.
    // For example, with input = [d(あ), r(s), r(i), r(t), r(s), d(は), d(は), d(れ)] this gives
    // nodes = [[d(あ)], [r(s)], [r(i)], [r(t), [r(t), r(a)]], [r(s)], [d(は), d(ば), d(ぱ)], [d(れ)]]
    let count = right - left + 1
    let nodes: [[TypoCandidate]] = (0..<count).map { (offset: Int) in
        Self.lengths.flatMap { (extra: Int) -> [TypoCandidate] in
            let lastOffset = offset + extra
            guard lastOffset < count else {
                return []
            }
            return Self.getTypo(inputs[left + offset ... left + lastOffset])
        }
    }

    let maxPenalty: PValue = 3.5 * 3

    // Depth-first enumeration, seeded with the candidates at offset 0 whose left side is valid.
    var stack: [Frame] = nodes[0].compactMap { seed -> Frame? in
        guard let head = seed.inputElements.first else {
            return nil
        }
        guard ComposingText.isLeftSideValid(first: head, of: inputs, from: left) else {
            return nil
        }
        var built = [ComposingText.ConvertTargetElement]()
        for element in seed.inputElements {
            ComposingText.updateConvertTargetElements(currentElements: &built, newElement: element)
        }
        return (convertTargetElements: built, lastElement: seed.inputElements.last!, count: seed.inputElements.count, penalty: seed.weight)
    }

    var stringToPenalty: [([Character], PValue)] = []

    while let (elements, lastElement, consumed, penalty) = stack.popLast() {
        // Reached the right edge: record the reading if its right side is valid, then stop this path.
        if consumed + left - 1 == right {
            if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: consumed + left, convertTargetElements: elements)?.map({$0.toKatakana()}) {
                stringToPenalty.append((convertTarget, penalty))
            }
            continue
        }
        // Stop expanding once every input element has been consumed.
        if nodes.endIndex <= consumed {
            continue
        }
        // Correction limit (three corrections): beyond it, only the input as typed is followed.
        if penalty >= maxPenalty {
            let typed = inputs[left + consumed]
            let verbatim = ComposingText.InputElement(character: typed.character.toKatakana(), inputStyle: typed.inputStyle)
            if consumed + 1 > nodes.endIndex {
                continue
            }
            var extended = elements
            ComposingText.updateConvertTargetElements(currentElements: &extended, newElement: verbatim)
            stack.append((extended, verbatim, consumed + 1, penalty))
            continue
        }
        let expansions: [Frame] = nodes[consumed].compactMap { candidate -> Frame? in
            let nextCount = consumed + candidate.inputElements.count
            if nextCount > nodes.endIndex {
                return nil
            }
            var extended = elements
            for element in candidate.inputElements {
                ComposingText.updateConvertTargetElements(currentElements: &extended, newElement: element)
            }
            if Self.shouldBeRemovedForDicdataStore(components: extended) {
                return nil
            }
            return (
                convertTargetElements: extended,
                lastElement: candidate.inputElements.last!,
                count: nextCount,
                penalty: penalty + candidate.weight
            )
        }
        stack.append(contentsOf: expansions)
    }
    return Dictionary(stringToPenalty, uniquingKeysWith: max)
}
|
||||
|
||||
fileprivate static func getTypo(_ elements: some Collection<ComposingText.InputElement>, frozen: Bool = false) -> [TypoCandidate] {
|
||||
let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana()
|
||||
let key = elements.reduce(into: "") {$0.append($1.character.toKatakana())}
|
||||
|
||||
if (elements.allSatisfy {$0.inputStyle == .direct}) {
|
||||
let dictionary: [String: [TypoUnit]] = frozen ? [:] : Self.directPossibleTypo
|
||||
let dictionary: [String: [TypoCandidate]] = frozen ? [:] : Self.directPossibleTypo
|
||||
if key.count > 1 {
|
||||
return dictionary[key, default: []].map {
|
||||
TypoCandidate(
|
||||
inputElements: $0.value.map {ComposingText.InputElement(character: $0, inputStyle: .direct)},
|
||||
weight: $0.weight
|
||||
)
|
||||
}
|
||||
return dictionary[key, default: []]
|
||||
} else if key.count == 1 {
|
||||
var result = dictionary[key, default: []].map {
|
||||
TypoCandidate(
|
||||
inputElements: $0.value.map {ComposingText.InputElement(character: $0, inputStyle: .direct)},
|
||||
weight: $0.weight
|
||||
)
|
||||
}
|
||||
var result = dictionary[key, default: []]
|
||||
// そのまま
|
||||
result.append(TypoCandidate(inputElements: key.map {ComposingText.InputElement(character: $0, inputStyle: .direct)}, weight: 0))
|
||||
return result
|
||||
}
|
||||
}
|
||||
if (elements.allSatisfy {$0.inputStyle == .roman2kana}) {
|
||||
let dictionary: [String: [String]] = frozen ? [:] : Self.roman2KanaPossibleTypo
|
||||
let dictionary: [String: [TypoCandidate]] = frozen ? [:] : Self.roman2KanaPossibleTypo
|
||||
if key.count > 1 {
|
||||
return dictionary[key, default: []].map {
|
||||
TypoCandidate(
|
||||
inputElements: $0.map {ComposingText.InputElement(character: $0, inputStyle: .roman2kana)},
|
||||
weight: 3.5
|
||||
)
|
||||
}
|
||||
return dictionary[key, default: []]
|
||||
} else if key.count == 1 {
|
||||
var result = dictionary[key, default: []].map {
|
||||
TypoCandidate(
|
||||
inputElements: $0.map {ComposingText.InputElement(character: $0, inputStyle: .roman2kana)},
|
||||
weight: 3.5
|
||||
)
|
||||
}
|
||||
var result = dictionary[key, default: []]
|
||||
// そのまま
|
||||
result.append(
|
||||
TypoCandidate(inputElements: key.map {ComposingText.InputElement(character: $0, inputStyle: .roman2kana)}, weight: 0)
|
||||
@ -353,13 +173,13 @@ enum TypoCorrection {
|
||||
}
|
||||
}
|
||||
|
||||
struct TypoCandidate: Equatable {
|
||||
struct TypoCandidate: Sendable, Equatable {
|
||||
var inputElements: [ComposingText.InputElement]
|
||||
var weight: PValue
|
||||
}
|
||||
|
||||
/// ダイレクト入力用
|
||||
private static let directPossibleTypo: [String: [TypoUnit]] = [
|
||||
private static let directPossibleTypo: [String: [TypoCandidate]] = [
|
||||
"カ": [TypoUnit("ガ", weight: 7.0)],
|
||||
"キ": [TypoUnit("ギ")],
|
||||
"ク": [TypoUnit("グ")],
|
||||
@ -388,9 +208,16 @@ enum TypoCorrection {
|
||||
"ヤ": [TypoUnit("ャ")],
|
||||
"ユ": [TypoUnit("ュ")],
|
||||
"ヨ": [TypoUnit("ョ")]
|
||||
]
|
||||
].mapValues {
|
||||
$0.map {
|
||||
TypoCandidate(
|
||||
inputElements: $0.value.map {ComposingText.InputElement(character: $0, inputStyle: .direct)},
|
||||
weight: $0.weight
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private static let roman2KanaPossibleTypo: [String: [String]] = [
|
||||
private static let roman2KanaPossibleTypo: [String: [TypoCandidate]] = [
|
||||
"bs": ["ba"],
|
||||
"no": ["bo"],
|
||||
"li": ["ki"],
|
||||
@ -401,5 +228,12 @@ enum TypoCorrection {
|
||||
"ts": ["ta"],
|
||||
"wi": ["wo"],
|
||||
"pu": ["ou"]
|
||||
]
|
||||
].mapValues {
|
||||
$0.map {
|
||||
TypoCandidate(
|
||||
inputElements: $0.map {ComposingText.InputElement(character: $0, inputStyle: .roman2kana)},
|
||||
weight: 3.5
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ extension Kana2Kanji {
|
||||
|
||||
// (1)
|
||||
let addedNodes: [[LatticeNode]] = (0...count).map {(i: Int) in
|
||||
self.dicdataStore.getLOUDSData(inputData: inputData, from: i, to: count, needTypoCorrection: needTypoCorrection)
|
||||
self.dicdataStore.getLOUDSDataInRange(inputData: inputData, from: i, toIndexRange: count ..< count+1, needTypoCorrection: needTypoCorrection)
|
||||
}
|
||||
|
||||
// ココが一番時間がかかっていた。
|
||||
|
@ -199,54 +199,6 @@ package struct LOUDS: Sendable {
|
||||
return self.prefixNodeIndices(nodeIndex: nodeIndex, maxDepth: maxDepth, maxCount: maxCount)
|
||||
}
|
||||
|
||||
/// Performs a partial prefix-match search.
///
/// For the input 「しかい」, not only 「しかい」 but also 「し」 and 「しか」 are searched.
/// The walk starts at node index 1 and stops at the first character that has no matching child.
/// - Parameter chars: The string converted to charIDs.
/// - Returns: The indices within the corresponding loudstxt3 file, beginning with the starting node index 1.
/// - Note: A more fitting name would be desirable.
@inlinable func byfixNodeIndices(chars: [UInt8]) -> [Int] {
    var currentNode = 1
    var visited = [currentNode]
    for charID in chars {
        guard let nextNode = searchCharNodeIndex(from: currentNode, char: charID) else {
            break
        }
        visited.append(nextNode)
        currentNode = nextNode
    }
    return visited
}
|
||||
|
||||
/// Lexicographic "less than" for charID sequences, used as a sort predicate.
///
/// The first differing element decides the order; when one sequence is a prefix of the other,
/// the shorter one comes first. Equal sequences are not less than each other.
/// - Parameters:
///   - lhs: Left-hand charID sequence.
///   - rhs: Right-hand charID sequence.
/// - Returns: `true` when `lhs` is ordered strictly before `rhs`.
private static func lexLessThan(_ lhs: [UInt8], _ rhs: [UInt8]) -> Bool {
    // The standard library already implements exactly this comparison.
    lhs.lexicographicallyPrecedes(rhs)
}
|
||||
|
||||
/// Performs a partial prefix-match search for several targets at once.
///
/// For the input 「しかい」, not only 「しかい」 but also 「し」 and 「しか」 are searched.
/// The targets are fed to a `MovingTowardPrefixSearchHelper` in lexicographic order.
/// - Parameters:
///   - targets: The strings to search, each converted to charIDs.
///   - depth: The depth range passed to the helper's `indicesInDepth(depth:)`.
/// - Returns: The indices within the corresponding loudstxt3 file.
/// - Note: A more fitting name would be desirable.
@inlinable func byfixNodeIndices(targets: [[UInt8]], depth: Range<Int>) -> [Int] {
    var searchHelper = MovingTowardPrefixSearchHelper(louds: self)
    // Visit the targets in lexicographic order.
    for charIDs in targets.sorted(by: Self.lexLessThan) {
        _ = searchHelper.update(target: charIDs)
    }
    return searchHelper.indicesInDepth(depth: depth)
}
|
||||
|
||||
struct MovingTowardPrefixSearchHelper {
|
||||
init(louds: LOUDS) {
|
||||
self.louds = louds
|
||||
|
@ -129,7 +129,7 @@ final class DicdataStoreTests: XCTestCase {
|
||||
for (key, word) in mustWords {
|
||||
var c = ComposingText()
|
||||
c.insertAtCursorPosition(key, inputStyle: .direct)
|
||||
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
|
||||
let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
|
||||
// 冗長な書き方だが、こうすることで「どの項目でエラーが発生したのか」がはっきりするため、こう書いている。
|
||||
XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
|
||||
}
|
||||
@ -150,7 +150,7 @@ final class DicdataStoreTests: XCTestCase {
|
||||
for (key, word) in mustWords {
|
||||
var c = ComposingText()
|
||||
c.insertAtCursorPosition(key, inputStyle: .direct)
|
||||
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
|
||||
let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
|
||||
XCTAssertNil(result.first(where: {$0.data.word == word && $0.data.ruby == key}))
|
||||
}
|
||||
}
|
||||
@ -170,7 +170,7 @@ final class DicdataStoreTests: XCTestCase {
|
||||
for (key, word) in mustWords {
|
||||
var c = ComposingText()
|
||||
c.insertAtCursorPosition(key, inputStyle: .direct)
|
||||
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: true)
|
||||
let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: true)
|
||||
XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user