perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現 (#208)

* perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現 * fix: bug * test: add typo correction test * chore: finalize imp;
2025-08-22 15:05:26 +00:00 · 2025-06-27 22:32:46 +09:00
parent f60a9a8738
commit f5037e393c
3 changed files with 99 additions and 11 deletions
--- a/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
+++ b/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
@ -243,6 +243,23 @@ public final class DicdataStore {
        return Array(result[min(depth.lowerBound + 1, result.endIndex) ..< min(depth.upperBound + 1, result.endIndex)])
    }

+    /// Looks up LOUDS node indices for several dictionary files in one call.
+    ///
+    /// Every key of `group` is handed to `loadLOUDS(query:)`, and all char-ID sequences grouped
+    /// under that key are searched together through `byfixNodeIndices(targets:depth:)`.
+    /// - Parameters:
+    ///   - group: Maps the string identifying a LOUDS file (normally the first character of the
+    ///     search strings) to the search strings for that file, each paired with its char-ID form.
+    ///   - depth: Depth range to collect; forwarded unchanged to the bulk search.
+    /// - Returns: One `(key, indices)` entry per key of `group`. `indices` is empty when no LOUDS
+    ///   could be loaded for that key.
+    private func throughMatchLOUDS(group: [String: [([Character], [UInt8])]], depth: Range<Int>) -> [(key: String, indices: Set<Int>)] {
+        var matches: [(key: String, indices: Set<Int>)] = []
+        matches.reserveCapacity(group.count)
+        for (fileKey, entries) in group {
+            guard let louds = self.loadLOUDS(query: fileKey) else {
+                // No LOUDS for this key: still report the key, with nothing found.
+                matches.append((key: fileKey, indices: []))
+                continue
+            }
+            // Only the char-ID half of each pair is needed by the bulk search.
+            let charIDLists = entries.map { $0.1 }
+            let nodeIndices = louds.byfixNodeIndices(targets: charIDLists, depth: depth)
+            matches.append((key: fileKey, indices: Set(nodeIndices)))
+        }
+        return matches
+    }
+
    private func prefixMatchLOUDS(query: String, charIDs: [UInt8], depth: Int = .max, maxCount: Int = .max) -> [Int] {
        guard let louds = self.loadLOUDS(query: query) else {
            return []
@ -292,20 +309,15 @@ public final class DicdataStore {
        // MARK: 誤り訂正の対象を列挙する。非常に重い処理。
        var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
        // MARK: 検索対象を列挙していく。
-        let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
+        let stringSet: [([Character], [UInt8])] = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
        let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
-        // 先頭の文字: そこで検索したい文字列の集合
-        let group = [Character: [([Character], [UInt8])]].init(grouping: stringSet, by: {$0.0.first!})
-
        let depth = minCharIDsCount - 1 ..< maxCharIDsCount
-        var indices: [(String, Set<Int>)] = group.map {dic in
-            let key = String(dic.key)
-            let set = dic.value.flatMapSet {(_, charIDs) in self.throughMatchLOUDS(query: key, charIDs: charIDs, depth: depth)}
-            return (key, set)
-        }
-        indices.append(("user", stringSet.flatMapSet {self.throughMatchLOUDS(query: "user", charIDs: $0.1, depth: depth)}))
+        let group = [String: [([Character], [UInt8])]].init(grouping: stringSet, by: {String($0.0.first!)})
+        var indices = self.throughMatchLOUDS(group: group, depth: depth)
        if learningManager.enabled {
-            indices.append(("memory", stringSet.flatMapSet {self.throughMatchLOUDS(query: "memory", charIDs: $0.1, depth: depth)}))
+            indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet, "memory": stringSet], depth: depth))
+        } else {
+            indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet], depth: depth))
        }
        // MARK: 検索によって得たindicesから辞書データを実際に取り出していく
        var dicdata: [DicdataElement] = []
--- a/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift
+++ b/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift
@ -216,4 +216,60 @@ package struct LOUDS: Sendable {
        }
        return indices
    }
+
+    /// Lexicographic "less than" predicate for char-ID sequences.
+    ///
+    /// Elements are compared pairwise from the front; the first differing pair decides the result.
+    /// When one sequence is a prefix of the other, the shorter one is ordered first, and equal
+    /// sequences are not less than each other.
+    /// - Parameters:
+    ///   - lhs: Left-hand char-ID sequence.
+    ///   - rhs: Right-hand char-ID sequence.
+    /// - Returns: `true` when `lhs` sorts strictly before `rhs`.
+    private static func lexLessThan(_ lhs: [UInt8], _ rhs: [UInt8]) -> Bool {
+        // The standard library already implements exactly this ordering.
+        lhs.lexicographicallyPrecedes(rhs)
+    }
+
+    /// Runs the partial prefix-match search for a batch of targets in a single pass.
+    ///
+    /// For a target such as "しかい", the nodes of the shorter prefixes "し" and "しか" are
+    /// looked up as well as the node of "しかい" itself.
+    /// The targets are sorted lexicographically first, so targets sharing a prefix become
+    /// neighbours; the path walked for the previous target is kept in a stack and reused,
+    /// which means a shared prefix is searched (and reported) only once.
+    /// - Parameters:
+    ///   - targets: Search strings already converted to char IDs. Any order is accepted.
+    ///   - depth: 0-based character positions whose nodes are reported. With `2..<4`, only the
+    ///     nodes reached by the third and fourth characters are returned. Characters at or
+    ///     beyond `depth.upperBound` are never searched.
+    /// - Returns: The node indices found for every target prefix that falls inside `depth`
+    ///   (per the original note, these index into the corresponding loudstxt3 file).
+    /// - Note: A better name for this method is still wanted.
+    @inlinable func byfixNodeIndices(targets: [[UInt8]], depth: Range<Int>) -> [Int] {
+        // Sorting makes targets with a common prefix adjacent, which is what lets the stack be reused.
+        let sortedTargets = targets.sorted(by: Self.lexLessThan)
+        // Final output.
+        var indices: [Int] = []
+        // Path of the most recent walk: stack[i] is the node reached by matching `char` at position i.
+        var stack: [(nodeIndex: Int, char: UInt8)] = []
+        for chars in sortedTargets {
+            for (i, char) in chars.enumerated() {
+                // Positions only grow, so nothing past the upper bound can ever be needed.
+                guard i < depth.upperBound else {
+                    break
+                }
+                if i < stack.count {
+                    if stack[i].char == char {
+                        // Already explored while handling an earlier target.
+                        continue
+                    }
+                    // The path diverges here: drop everything from this position on, in place.
+                    stack.removeSubrange(i...)
+                }
+                // Reaching this point means stack[i] does not exist.
+                assert(i >= stack.count, "stack[\(i)] must not exist for logical reason.")
+                // Continue the search from the last node on the stack (node 1 when the stack is empty).
+                if let nodeIndex = self.searchCharNodeIndex(from: stack.last?.nodeIndex ?? 1, char: char) {
+                    if depth.contains(i) {
+                        indices.append(nodeIndex)
+                    }
+                    stack.append((nodeIndex, char))
+                } else {
+                    // No such child: longer prefixes of this target cannot exist either.
+                    break
+                }
+            }
+        }
+        return indices
+    }
 }
--- a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
+++ b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
@ -155,6 +155,26 @@ final class DicdataStoreTests: XCTestCase {
        }
    }

+    /// Words that typo correction is required to recover from the given mistyped input.
+    func testMustCorrectTypo() throws {
+        let dicdataStore = DicdataStore(convertRequestOptions: requestOptions())
+        // Each case pairs a mistyped input with the word the lookup must still contain.
+        let typoCases: [(input: String, expected: String)] = [
+            ("タイカクセイ", "大学生"),
+            ("シヨック", "ショック"),
+            ("キヨクイン", "局員"),
+            ("シヨーク", "ジョーク"),
+            ("サリカニ", "ザリガニ"),
+            ("ノクチヒテヨ", "野口英世"),
+            ("オタノフナカ", "織田信長"),
+        ]
+        for typoCase in typoCases {
+            var composingText = ComposingText()
+            composingText.insertAtCursorPosition(typoCase.input, inputStyle: .direct)
+            // Search across the whole input, from the first to the last element.
+            let lastIndex = composingText.input.endIndex - 1
+            let candidates = dicdataStore.getLOUDSData(inputData: composingText, from: 0, to: lastIndex, needTypoCorrection: true)
+            let matchedWord = candidates.first(where: { $0.data.word == typoCase.expected })?.data.word
+            XCTAssertEqual(matchedWord, typoCase.expected)
+        }
+    }
+
    func testGetLOUDSDataInRange() throws {
        let dicdataStore = DicdataStore(convertRequestOptions: requestOptions())
        do {