From f5037e393cc833b69c853620fb8871f7a48dcfcb Mon Sep 17 00:00:00 2001 From: Miwa <63481257+ensan-hcl@users.noreply.github.com> Date: Fri, 27 Jun 2025 22:32:46 +0900 Subject: [PATCH] =?UTF-8?q?perf:=20=E5=90=8C=E3=81=98louds=E3=81=AB?= =?UTF-8?q?=E5=AF=BE=E3=81=99=E3=82=8B=E6=A4=9C=E7=B4=A2=E3=82=92=E3=83=90?= =?UTF-8?q?=E3=83=AB=E3=82=AF=E5=87=A6=E7=90=86=E3=81=99=E3=82=8B=E3=81=93?= =?UTF-8?q?=E3=81=A8=E3=81=AB=E3=82=88=E3=81=A3=E3=81=A6=E3=80=81=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=81=AE=E5=8A=B9=E7=8E=87=E5=8C=96=E3=82=92=E5=AE=9F?= =?UTF-8?q?=E7=8F=BE=20(#208)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現 * fix: bug * test: add typo correction test * chore: finalize imp; --- .../DicdataStore/DicdataStore.swift | 34 +++++++---- .../LOUDS/LOUDS.swift | 56 +++++++++++++++++++ .../DicdataStoreTests/DicdataStoreTests.swift | 20 +++++++ 3 files changed, 99 insertions(+), 11 deletions(-) diff --git a/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift b/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift index 0095226..c3ec2fd 100644 --- a/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift +++ b/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift @@ -243,6 +243,23 @@ public final class DicdataStore { return Array(result[min(depth.lowerBound + 1, result.endIndex) ..< min(depth.upperBound + 1, result.endIndex)]) } + /// 辞書検索用関数 + /// - Parameters: + /// - group: ファイルのプレフィックスとなる文字列(通常、最初の1文字)と、その文字列で始まる文字IDのプレフィックスの集合 + /// - depth: 検索対象となる深さ。`2..<4`の場合は2文字・3文字の候補のみ取りだす + /// - Returns: 発見されたすべてのインデックス + private func throughMatchLOUDS(group: [String: [([Character], [UInt8])]], depth: Range) -> [(key: String, indices: Set)] { + let indices: [(String, Set)] = group.map {dic in + guard let louds = self.loadLOUDS(query: dic.key) else { + return (dic.key, []) + } + // バルク処理用の実装を呼び出す + let result = louds.byfixNodeIndices(targets: dic.value.map { $0.1 }, depth: depth) + return (dic.key, Set(result)) + } + return indices + } + private func prefixMatchLOUDS(query: String, charIDs: [UInt8], depth: Int = .max, maxCount: Int = .max) -> [Int] { guard let louds = self.loadLOUDS(query: query) else { return [] @@ -292,20 +309,15 @@ public final class DicdataStore { // MARK: 誤り訂正の対象を列挙する。非常に重い処理。 var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight) // MARK: 検索対象を列挙していく。 - let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))} + let stringSet: [([Character], [UInt8])] = stringToInfo.keys.map {($0, $0.map(self.character2charId))} let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1) - // 先頭の文字: そこで検索したい文字列の集合 - let group = [Character: [([Character], [UInt8])]].init(grouping: stringSet, by: {$0.0.first!}) - let depth = minCharIDsCount - 1 ..< maxCharIDsCount - var indices: [(String, Set)] = group.map {dic in - let key = String(dic.key) - let set = dic.value.flatMapSet {(_, charIDs) in self.throughMatchLOUDS(query: key, charIDs: charIDs, depth: depth)} - return (key, set) - } - indices.append(("user", stringSet.flatMapSet {self.throughMatchLOUDS(query: "user", charIDs: $0.1, depth: depth)})) + let group = [String: [([Character], [UInt8])]].init(grouping: stringSet, by: {String($0.0.first!)}) + var indices = self.throughMatchLOUDS(group: group, depth: depth) if learningManager.enabled { - indices.append(("memory", stringSet.flatMapSet {self.throughMatchLOUDS(query: "memory", charIDs: $0.1, depth: depth)})) + indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet, "memory": stringSet], depth: depth)) + } else { + indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet], depth: depth)) } // MARK: 検索によって得たindicesから辞書データを実際に取り出していく var dicdata: [DicdataElement] = [] diff --git a/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift b/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift index 0513d55..07d87ac 100644 --- a/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift +++ b/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift @@ -216,4 +216,60 @@ package struct LOUDS: Sendable { } return indices } + + /// 辞書順ソート + private static func lexLessThan(_ lhs: [UInt8], _ rhs: [UInt8]) -> Bool { + let minCount = Swift.min(lhs.count, rhs.count) + for i in 0..) -> [Int] { + // 辞書順でソートする +// let targets = targets.sorted(by: Self.lexLessThan) + var targets = targets + targets.sort(by: Self.lexLessThan) + // 最終出力となる + var indices: [Int] = [] + // 現在の探索結果を保存しておく + var stack: [(nodeIndex: Int, char: UInt8)] = [] + for chars in targets { + // iがupperBoundを超えない範囲で検索を行う + for (i, char) in chars.enumerated() where i < depth.upperBound { + if i < stack.count, stack[i].char == char { + // すでに探索済み + continue + } else if i < stack.count, stack[i].char != char { + // 異なる文字が見つかったら、その時点でそこから先のstackを破棄 + stack = Array(stack[..= stack.count, "stack[\(i)] must not exist for logical reason.") + // このケースでは、探索を行う + // 直前のstackを取り出し、そのnodeIndexから次のcharを探索する + if let nodeIndex = self.searchCharNodeIndex(from: stack.last?.nodeIndex ?? 1, char: char) { + if depth.contains(i) { + indices.append(nodeIndex) + } + stack.append((nodeIndex, char)) + } else { + // 見つからなかった場合、打ち切る + break + } + } + } + return indices + } } diff --git a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift index 8c16c9c..d738518 100644 --- a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift +++ b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift @@ -155,6 +155,26 @@ final class DicdataStoreTests: XCTestCase { } } + /// 入力誤りを確実に修正できてほしい語群 + func testMustCorrectTypo() throws { + let dicdataStore = DicdataStore(convertRequestOptions: requestOptions()) + let mustWords = [ + ("タイカクセイ", "大学生"), + ("シヨック", "ショック"), + ("キヨクイン", "局員"), + ("シヨーク", "ジョーク"), + ("サリカニ", "ザリガニ"), + ("ノクチヒテヨ", "野口英世"), + ("オタノフナカ", "織田信長"), + ] + for (key, word) in mustWords { + var c = ComposingText() + c.insertAtCursorPosition(key, inputStyle: .direct) + let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: true) + XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word) + } + } + func testGetLOUDSDataInRange() throws { let dicdataStore = DicdataStore(convertRequestOptions: requestOptions()) do {