diff --git a/Sources/KanaKanjiConverterModule/DictionaryManagement/TypoCorrection.swift b/Sources/KanaKanjiConverterModule/DictionaryManagement/TypoCorrection.swift index 40abbad..1a60070 100644 --- a/Sources/KanaKanjiConverterModule/DictionaryManagement/TypoCorrection.swift +++ b/Sources/KanaKanjiConverterModule/DictionaryManagement/TypoCorrection.swift @@ -18,18 +18,27 @@ struct TypoCorrectionGenerator: Sendable { } } // 深さ優先で列挙する + var leftConvertTargetElements: [ComposingText.ConvertTargetElement] = [] + for element in inputs[0 ..< range.leftIndex] { + ComposingText.updateConvertTargetElements(currentElements: &leftConvertTargetElements, newElement: element) + } + let actualLeftConvertTarget = leftConvertTargetElements.reduce(into: "") { $0 += $1.string} + self.stack = nodes[0].compactMap { typoCandidate in - guard let firstElement = typoCandidate.inputElements.first else { + var convertTargetElements = [ComposingText.ConvertTargetElement]() + var fullConvertTargetElements = leftConvertTargetElements + for element in typoCandidate.inputElements { + ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element) + ComposingText.updateConvertTargetElements(currentElements: &fullConvertTargetElements, newElement: element) + } + let fullConvertTarget = fullConvertTargetElements.reduce(into: "") { $0 += $1.string} + let convertTarget = convertTargetElements.reduce(into: "") { $0 += $1.string} + + if fullConvertTarget == actualLeftConvertTarget + convertTarget { + return (convertTargetElements, typoCandidate.inputElements.count, typoCandidate.weight) + } else { return nil } - if ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: range.leftIndex) { - var convertTargetElements = [ComposingText.ConvertTargetElement]() - for element in typoCandidate.inputElements { - ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element) - } - return (convertTargetElements, typoCandidate.inputElements.last!, typoCandidate.inputElements.count, typoCandidate.weight) - } - return nil } } @@ -44,11 +53,45 @@ struct TypoCorrectionGenerator: Sendable { var rightIndexRange: Range } - var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)] + var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], count: Int, penalty: PValue)] + + private static func check( + _ leftConvertTargetElements: [ComposingText.ConvertTargetElement], + isPrefixOf rightConvertTargetElements: [ComposingText.ConvertTargetElement] + ) -> Bool { + if leftConvertTargetElements.count > rightConvertTargetElements.count { + // 常に不成立 + return false + } else if leftConvertTargetElements.count == rightConvertTargetElements.count { + let lastIndex = leftConvertTargetElements.count - 1 + if lastIndex == -1 { + // この場合、両者emptyの配列なのでtrueを返す。 + return true + } + // 最後の1つのエレメントがprefixの関係にあれば成立 + for (lhs, rhs) in zip(leftConvertTargetElements[0 ..< lastIndex], rightConvertTargetElements[0 ..< lastIndex]) { + if lhs != rhs { + return false + } + } + if leftConvertTargetElements[lastIndex].inputStyle != rightConvertTargetElements[lastIndex].inputStyle { + return false + } + return rightConvertTargetElements[lastIndex].string.hasPrefix(leftConvertTargetElements[lastIndex].string) + } else { + // leftConvertTargetElementsのインデックスの範囲ですべて一致していればprefixが成立 + for (lhs, rhs) in zip(leftConvertTargetElements, rightConvertTargetElements[0 ..< leftConvertTargetElements.endIndex]) { + if lhs != rhs { + return false + } + } + return true + } + } /// `target`で始まる場合は到達不可能であることを知らせる mutating func setUnreachablePath(target: some Collection) { - self.stack = self.stack.filter { (convertTargetElements, lastElement, count, penalty) in + self.stack = self.stack.filter { (convertTargetElements, count, penalty) in var stablePrefix: [Character] = [] loop: for item in convertTargetElements { switch item.inputStyle { @@ -79,11 +122,18 @@ struct TypoCorrectionGenerator: Sendable { } mutating func next() -> ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? { - while let (convertTargetElements, lastElement, count, penalty) = self.stack.popLast() { + while let (convertTargetElements, count, penalty) = self.stack.popLast() { var result: ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? = nil if self.range.rightIndexRange.contains(count + self.range.leftIndex - 1) { - if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: count + self.range.leftIndex, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) { - result = (convertTarget, (.input(count + self.range.leftIndex - 1), penalty)) + let originalConvertTarget = convertTargetElements.reduce(into: []) { $0 += $1.string.map { $0.toKatakana() } } + if self.range.leftIndex + count < self.inputs.endIndex { + var newConvertTargetElements = convertTargetElements + ComposingText.updateConvertTargetElements(currentElements: &newConvertTargetElements, newElement: inputs[self.range.leftIndex + count]) + if Self.check(convertTargetElements, isPrefixOf: newConvertTargetElements) { + result = (originalConvertTarget, (.input(count + self.range.leftIndex - 1), penalty)) + } + } else { + result = (originalConvertTarget, (.input(count + self.range.leftIndex - 1), penalty)) } } // エスケープ @@ -108,7 +158,7 @@ struct TypoCorrectionGenerator: Sendable { for element in correct { ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element) } - stack.append((convertTargetElements, correct.last!, count + correct.count, penalty)) + stack.append((convertTargetElements, count + correct.count, penalty)) } else { stack.append(contentsOf: self.nodes[count].compactMap { if count + $0.inputElements.count > self.nodes.endIndex { @@ -120,7 +170,6 @@ struct TypoCorrectionGenerator: Sendable { } return ( convertTargetElements: convertTargetElements, - lastElement: $0.inputElements.last!, count: count + $0.inputElements.count, penalty: penalty + $0.weight ) diff --git a/Sources/KanaKanjiConverterModule/InputManagement/ComposingText.swift b/Sources/KanaKanjiConverterModule/InputManagement/ComposingText.swift index 9e90b16..32fe406 100644 --- a/Sources/KanaKanjiConverterModule/InputManagement/ComposingText.swift +++ b/Sources/KanaKanjiConverterModule/InputManagement/ComposingText.swift @@ -412,154 +412,6 @@ extension ComposingText { return convertTargetElements.reduce(into: "") {$0 += $1.string} } - static func shouldEscapeOtherValidation(convertTargetElement: [ConvertTargetElement], of originalElements: [InputElement]) -> Bool { - let string = convertTargetElement.reduce(into: "") {$0 += $1.string} - // 句読点や矢印のエスケープ - if !string.containsRomanAlphabet { - return true - } - if ["→", "↓", "↑", "←"].contains(string) { - return true - } - return false - } - - static func isLeftSideValid(first firstElement: InputElement, of originalElements: [InputElement], from leftIndex: Int) -> Bool { - // leftIndexの位置にある`el`のチェック - // 許されるパターンは以下の通り - // * leftIndex == startIndex - // * el:direct - // * (_:direct) -> el - // * (a|i|u|e|o:roman2kana) -> el // aka、のような場合、ka部分を正当とみなす - // * (e-1:roman2kana and not n) && e-1 == es // tta、のような場合、ta部分を正当とみなすが、nnaはだめ。 - // * (n:roman2kana) -> el && el not a|i|u|e|o|y|n // nka、のような場合、ka部分を正当とみなすが、nnaはだめ。 - - if leftIndex < originalElements.startIndex { - return false - } - // 左端か、directなElementである場合 - guard leftIndex != originalElements.startIndex && firstElement.inputStyle == .roman2kana else { - return true - } - - let prevLastElement = originalElements[leftIndex - 1] - if prevLastElement.inputStyle != .roman2kana || !CharacterUtils.isRomanLetter(prevLastElement.character) { - return true - } - - if ["a", "i", "u", "e", "o"].contains(prevLastElement.character) { - return true - } - if prevLastElement.character != "n" && prevLastElement.character == firstElement.character { - return true - } - let last_2 = originalElements[0 ..< leftIndex].suffix(2) - if ["zl", "zk", "zj", "zh", "xn"].contains(last_2.reduce(into: "") {$0.append($1.character)}) { - return true - } - let n_suffix = originalElements[0 ..< leftIndex].suffix(while: {$0.inputStyle == .roman2kana && $0.character == "n"}) - // 末尾のnが偶数個で右側にnがなければvalid - if n_suffix.count % 2 == 0 && !n_suffix.isEmpty { - return true - } - // 末尾のnが奇数個で、なお直後の文字が母音・ny-・nnではない場合はvalid - if n_suffix.count % 2 == 1 && !["a", "i", "u", "e", "o", "y", "n"].contains(firstElement.character) { - return true - } - // 末尾のnが奇数個で、なおかつその1つ前の文字がxであればvalid (xn→ん、への対応) - if n_suffix.count % 2 == 1 && originalElements.dropLast(n_suffix.count).last == .init(character: "x", inputStyle: .roman2kana) { - return true - } - return false - } - - /// 右側がvalidか調べる - /// - Parameters: - /// - lastElement: 領域の最後の要素 - /// - convertTargetElements: 領域内まで読んで作成した`convertTarget` - /// - originalElements: 領域を取り出した元の`input` - /// - rightIndex: 領域の右隣の要素のインデックス - /// - Returns: 正当か否か - static func isRightSideValid(lastElement: InputElement, convertTargetElements: [ConvertTargetElement], of originalElements: [InputElement], to rightIndex: Int) -> Bool { - // rightIndexの位置にあるerのチェック - // 許されるパターンは以下の通り - // * rightIndex == endIndex - // * er:direct - // * er -> (_:direct) - // * er == a|i|u|e|o // aka、のような場合、a部分を正当とみなす - // * er != n && er -> er == e+1 // kka、のような場合、k部分を正当とみなす - // * er == n && er -> (e+1:roman2kana and not a|i|u|e|o|n|y) // (nn)*nka、のような場合、(nn)*n部分を正当とみなす - // * er == n && er -> (e+1:roman2kana) // (nn)*a、のような場合、nn部分を正当とみなす - // 左端か、directなElementである場合 - guard rightIndex != originalElements.endIndex && lastElement.inputStyle == .roman2kana else { - return true - } - if lastElement.inputStyle != .roman2kana { - return true - } - let nextFirstElement = originalElements[rightIndex] - if nextFirstElement.inputStyle != .roman2kana || !CharacterUtils.isRomanLetter(nextFirstElement.character) { - return true - } - if ["a", "i", "u", "e", "o"].contains(lastElement.character) { - return true - } - if lastElement.character != "n" && lastElement.character == nextFirstElement.character { - return true - } - guard let lastConvertTargetElements = convertTargetElements.last else { - return false - } - // nnが偶数個なら許す - if lastElement.character == "n" && lastConvertTargetElements.string.last != "n" { - return true - } - // nが最後に1つ余っていて、characterが条件を満たせば許す - if lastElement.character == "n" && lastConvertTargetElements.inputStyle == .roman2kana && lastConvertTargetElements.string.last == "n" && !["a", "i", "u", "e", "o", "y", "n"].contains(nextFirstElement.character) { - return true - } - return false - } - - /// 「正当な」部分領域を返す関数 - /// - Parameters: - /// - lastElement: 領域の最後の要素 - /// - originalElements: 領域を取り出した元の`input` - /// - rightIndex: 領域の右隣の要素のインデックス - /// - convertTargetElements: 領域内まで読んで作成した`convertTarget` - /// - Returns: 領域がvalidであれば`convertTarget`を返し、invalidなら`nil`を返す。 - /// - Note: `elements = [r(k, a, n, s, h, a)]`のとき、`k,a,n,s,h,a`や`k, a`は正当だが`a, n`や`s, h`は正当ではない。`k, a, n`は特に正当であるとみなす。 - static func getConvertTargetIfRightSideIsValid(lastElement: InputElement, of originalElements: [InputElement], to rightIndex: Int, convertTargetElements: [ConvertTargetElement]) -> [Character]? { - debug(#function, lastElement, rightIndex) - if originalElements.endIndex < rightIndex { - return nil - } - // 正当性のチェックを行う - // 基本的に、convertTargetと正しく対応する部分のみを取り出したい。 - let shouldEscapeValidation = Self.shouldEscapeOtherValidation(convertTargetElement: convertTargetElements, of: originalElements) - if !shouldEscapeValidation && !Self.isRightSideValid(lastElement: lastElement, convertTargetElements: convertTargetElements, of: originalElements, to: rightIndex) { - return nil - } - // ここまで来たらvalid - var convertTargetElements = convertTargetElements - if let lastElement = convertTargetElements.last, lastElement.inputStyle == .roman2kana, rightIndex < originalElements.endIndex { - let nextFirstElement = originalElements[rightIndex] - - if !lastElement.string.hasSuffix("n") && lastElement.string.last == nextFirstElement.character && CharacterUtils.isRomanLetter(nextFirstElement.character) { - // 書き換える - convertTargetElements[convertTargetElements.endIndex - 1].string.removeLast() - convertTargetElements.append(ConvertTargetElement(string: ["っ"], inputStyle: .direct)) - } - - if lastElement.string.hasSuffix("n") && !["a", "i", "u", "e", "o", "y", "n"].contains(nextFirstElement.character) { - // 書き換える - convertTargetElements[convertTargetElements.endIndex - 1].string.removeLast() - convertTargetElements.append(ConvertTargetElement(string: ["ん"], inputStyle: .direct)) - } - } - return convertTargetElements.reduce(into: []) {$0 += $1.string} - } - // inputStyleが同一であるような文字列を集積したもの // k, o, r, e, h, aまでをローマ字入力し、p, e, nをダイレクト入力、d, e, s, uをローマ字入力した場合、 // originalInputに対して[ElementComposition(これは, roman2kana), ElementComposition(pen, direct), ElementComposition(です, roman2kana)]、のようになる。 diff --git a/Tests/KanaKanjiConverterModuleTests/ComposingTextTests.swift b/Tests/KanaKanjiConverterModuleTests/ComposingTextTests.swift index 40197ba..e992de1 100644 --- a/Tests/KanaKanjiConverterModuleTests/ComposingTextTests.swift +++ b/Tests/KanaKanjiConverterModuleTests/ComposingTextTests.swift @@ -148,52 +148,6 @@ final class ComposingTextTests: XCTestCase { } - func testIsRightSideValid() throws { - do { - var c = ComposingText() - c.insertAtCursorPosition("akafatta", inputStyle: .roman2kana) // あかふぁった| - XCTAssertTrue(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "a", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ"], inputStyle: .roman2kana)], of: c.input, to: 1)) - XCTAssertFalse(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "k", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "k"], inputStyle: .roman2kana)], of: c.input, to: 2)) - XCTAssertTrue(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "a", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "か"], inputStyle: .roman2kana)], of: c.input, to: 3)) - XCTAssertFalse(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "f", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "か", "f"], inputStyle: .roman2kana)], of: c.input, to: 4)) - XCTAssertTrue(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "a", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "か", "ふ", "ぁ"], inputStyle: .roman2kana)], of: c.input, to: 5)) - // これはtrueにしている - XCTAssertTrue(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "t", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "か", "ふ", "ぁ", "t"], inputStyle: .roman2kana)], of: c.input, to: 6)) - // これはfalse - XCTAssertFalse(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "t", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "か", "ふ", "ぁ", "t", "t"], inputStyle: .roman2kana)], of: c.input, to: 7)) - XCTAssertTrue(ComposingText.isRightSideValid(lastElement: ComposingText.InputElement(character: "a", inputStyle: .roman2kana), convertTargetElements: [ComposingText.ConvertTargetElement(string: ["あ", "か", "ふ", "ぁ", "っ", "た"], inputStyle: .roman2kana)], of: c.input, to: 8)) - } - } - - func testGetConvertTargetIfRightSideIsValid() throws { - do { - var c = ComposingText() - c.insertAtCursorPosition("akafatta", inputStyle: .roman2kana) // あかふぁった| - XCTAssertEqual( - ComposingText.getConvertTargetIfRightSideIsValid( - lastElement: ComposingText.InputElement(character: "t", inputStyle: .roman2kana), - of: c.input, - to: 6, - convertTargetElements: [ComposingText.ConvertTargetElement(string: Array("あかふぁt"), inputStyle: .roman2kana)] - ), - Array("あかふぁっ") - ) - } - do { - var c = ComposingText() - c.insertAtCursorPosition("kintarou", inputStyle: .roman2kana) // きんたろう| - XCTAssertEqual( - ComposingText.getConvertTargetIfRightSideIsValid( - lastElement: ComposingText.InputElement(character: "n", inputStyle: .roman2kana), - of: c.input, - to: 3, - convertTargetElements: [ComposingText.ConvertTargetElement(string: Array("きn"), inputStyle: .roman2kana)] - ), - Array("きん") - ) - } - } - func testDifferenceSuffix() throws { do { var c1 = ComposingText() diff --git a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift index 31a219d..185fea5 100644 --- a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift +++ b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift @@ -175,6 +175,21 @@ final class DicdataStoreTests: XCTestCase { } } + /// 入力誤りを確実に修正できてほしい語群 + func testMustCorrectTypoRoman2Kana() throws { + let dicdataStore = DicdataStore(convertRequestOptions: requestOptions()) + let mustWords = [ + ("tskamatsu", "高松"), // ts -> タ + ("kitsmura", "北村"), // ts -> タ + ] + for (key, word) in mustWords { + var c = ComposingText() + c.insertAtCursorPosition(key, inputStyle: .roman2kana) + let result = dicdataStore.lookupDicdata(composingText: c, inputRange: (0, c.input.endIndex - 1 ..< c.input.endIndex), needTypoCorrection: true) + XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word) + } + } + func testLookupDicdata() throws { let dicdataStore = DicdataStore(convertRequestOptions: requestOptions()) do { @@ -209,6 +224,12 @@ final class DicdataStoreTests: XCTestCase { var c = ComposingText() sequentialInput(&c, sequence: "tukatt", inputStyle: .roman2kana) let result = dicdataStore.lookupDicdata(composingText: c, inputRange: (0, 4..<6)) + XCTAssertFalse(result.contains(where: {$0.data.word == "使っ"})) + } + do { + var c = ComposingText() + sequentialInput(&c, sequence: "tukatt", inputStyle: .roman2kana) + let result = dicdataStore.lookupDicdata(composingText: c, surfaceRange: (0, nil)) XCTAssertTrue(result.contains(where: {$0.data.word == "使っ"})) } } @@ -288,9 +309,14 @@ final class DicdataStoreTests: XCTestCase { do { var c = ComposingText() sequentialInput(&c, sequence: "tesutowaーdo", inputStyle: .roman2kana) - let result = dicdataStore.lookupDicdata(composingText: c, inputRange: (0, c.input.endIndex - 1 ..< c.input.endIndex), needTypoCorrection: false) + let result = dicdataStore.lookupDicdata( + composingText: c, + inputRange: (0, c.input.endIndex - 1 ..< c.input.endIndex), + surfaceRange: (0, c.convertTarget.count - 1 ..< c.convertTarget.count), + needTypoCorrection: false + ) XCTAssertTrue(result.contains(where: {$0.data.word == "テストワード"})) - XCTAssertEqual(result.first(where: {$0.data.word == "テストワード"})?.range, .input(from: 0, to: 11)) + XCTAssertEqual(result.first(where: {$0.data.word == "テストワード"})?.range, .surface(from: 0, to: 6)) } // 動的ユーザ辞書の単語が通常の辞書よりも優先されることのテスト