perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現 (#208)

* perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現
* fix: bug
* test: add typo correction test
* chore: finalize impl
2025-08-22 15:05:26 +00:00 · 2025-06-27 22:32:46 +09:00
parent f60a9a8738
commit f5037e393c
3 changed files with 99 additions and 11 deletions
--- a/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
+++ b/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
@ -243,6 +243,23 @@ public final class DicdataStore {
        return Array(result[min(depth.lowerBound + 1, result.endIndex) ..< min(depth.upperBound + 1, result.endIndex)])
    }
    /// Bulk dictionary lookup: searches every LOUDS named in `group` once, handing all of
    /// that LOUDS's targets to a single `byfixNodeIndices(targets:depth:)` call.
    /// - Parameters:
    ///   - group: Maps a LOUDS identifier (the file prefix, usually the first character) to
    ///     the search targets for that LOUDS. Only the char-ID half of each pair is used here.
    ///   - depth: Depth range to search, forwarded unchanged to `byfixNodeIndices(targets:depth:)`.
    /// - Returns: One entry per key of `group`, pairing the key with the set of indices found.
    ///   The set is empty when the LOUDS for that key could not be loaded.
    private func throughMatchLOUDS(group: [String: [([Character], [UInt8])]], depth: Range<Int>) -> [(key: String, indices: Set<Int>)] {
        var found: [(key: String, indices: Set<Int>)] = []
        found.reserveCapacity(group.count)
        for (identifier, entries) in group {
            if let louds = self.loadLOUDS(query: identifier) {
                // Hand all char-ID sequences for this LOUDS to the bulk implementation at once.
                let charIDLists = entries.map { $0.1 }
                let nodeIndices = louds.byfixNodeIndices(targets: charIDLists, depth: depth)
                found.append((key: identifier, indices: Set(nodeIndices)))
            } else {
                found.append((key: identifier, indices: []))
            }
        }
        return found
    }
    private func prefixMatchLOUDS(query: String, charIDs: [UInt8], depth: Int = .max, maxCount: Int = .max) -> [Int] {
        guard let louds = self.loadLOUDS(query: query) else {
            return []
@ -292,20 +309,15 @@ public final class DicdataStore {
        // MARK: 誤り訂正の対象を列挙する。非常に重い処理。
        var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
        // MARK: 検索対象を列挙していく。
-        let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
+        let stringSet: [([Character], [UInt8])] = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
        let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
        // 先頭の文字: そこで検索したい文字列の集合
        let group = [Character: [([Character], [UInt8])]].init(grouping: stringSet, by: {$0.0.first!})
        let depth = minCharIDsCount - 1 ..< maxCharIDsCount
-        var indices: [(String, Set<Int>)] = group.map {dic in
+        let group = [String: [([Character], [UInt8])]].init(grouping: stringSet, by: {String($0.0.first!)})
-            let key = String(dic.key)
+        var indices = self.throughMatchLOUDS(group: group, depth: depth)
            let set = dic.value.flatMapSet {(_, charIDs) in self.throughMatchLOUDS(query: key, charIDs: charIDs, depth: depth)}
            return (key, set)
        }
        indices.append(("user", stringSet.flatMapSet {self.throughMatchLOUDS(query: "user", charIDs: $0.1, depth: depth)}))
        if learningManager.enabled {
-            indices.append(("memory", stringSet.flatMapSet {self.throughMatchLOUDS(query: "memory", charIDs: $0.1, depth: depth)}))
+            indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet, "memory": stringSet], depth: depth))
        } else {
            indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet], depth: depth))
        }
        // MARK: 検索によって得たindicesから辞書データを実際に取り出していく
        var dicdata: [DicdataElement] = []
--- a/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift
+++ b/Sources/KanaKanjiConverterModule/LOUDS/LOUDS.swift
@ -216,4 +216,60 @@ package struct LOUDS: Sendable {
        }
        return indices
    }
    /// Lexicographic "less than" for char-ID sequences, used as a sort predicate.
    ///
    /// The first differing element decides the order; when one sequence is a prefix
    /// of the other, the shorter one comes first. Equal sequences are not "less than".
    private static func lexLessThan(_ lhs: [UInt8], _ rhs: [UInt8]) -> Bool {
        lhs.lexicographicallyPrecedes(rhs)
    }
    /// Runs a partial prefix-match search for several keys in a single pass.
    ///
    /// For a key such as "しかい", the nodes for "し" and "しか" are looked up as well as "しかい".
    /// Targets are processed in lexicographic order so that the nodes of a prefix shared with
    /// the previously processed target are reused instead of being searched again.
    /// - Parameters:
    ///   - targets: Keys to search for, each already converted to char IDs.
    ///   - depth: Range of 0-based character positions whose nodes are reported. Characters at
    ///     positions at or beyond `depth.upperBound` are never searched.
    /// - Returns: List of indices in the corresponding loudstxt3 file, in the order the nodes
    ///   were discovered. A node shared by several targets is reported once.
    /// - Note: A more fitting name is wanted.
    @inlinable func byfixNodeIndices(targets: [[UInt8]], depth: Range<Int>) -> [Int] {
        // Lexicographic order places targets that share a prefix next to each other.
        let sortedTargets = targets.sorted(by: Self.lexLessThan)
        // Final output.
        var indices: [Int] = []
        // Path of nodes matched so far; entry `i` belongs to character position `i`.
        var stack: [(nodeIndex: Int, char: UInt8)] = []
        for chars in sortedTargets {
            for (i, char) in chars.enumerated() {
                // Nothing at or beyond the upper bound is searched, so stop walking this target.
                guard i < depth.upperBound else {
                    break
                }
                if i < stack.count {
                    if stack[i].char == char {
                        // Already explored while handling an earlier target.
                        continue
                    }
                    // Diverged from the cached path: drop this position and everything deeper,
                    // in place, so the stack keeps its storage.
                    stack.removeSubrange(i...)
                }
                // Reaching this point means stack[i] does not exist.
                assert(i >= stack.count, "stack[\(i)] must not exist for logical reason.")
                // Continue from the deepest cached node; 1 is used as the starting node when
                // nothing is cached yet.
                guard let nodeIndex = self.searchCharNodeIndex(from: stack.last?.nodeIndex ?? 1, char: char) else {
                    // No such child, so no longer prefix of this target can exist either.
                    break
                }
                if depth.contains(i) {
                    indices.append(nodeIndex)
                }
                stack.append((nodeIndex, char))
            }
        }
        return indices
    }
 }
--- a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
+++ b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
@ -155,6 +155,26 @@ final class DicdataStoreTests: XCTestCase {
        }
    }
    /// Words that typo correction must always be able to recover.
    func testMustCorrectTypo() throws {
        let store = DicdataStore(convertRequestOptions: requestOptions())
        // Each pair is (typed input, word that has to appear among the lookup results).
        let expectations: [(input: String, expected: String)] = [
            (input: "タイカクセイ", expected: "大学生"),
            (input: "シヨック", expected: "ショック"),
            (input: "キヨクイン", expected: "局員"),
            (input: "シヨーク", expected: "ジョーク"),
            (input: "サリカニ", expected: "ザリガニ"),
            (input: "ノクチヒテヨ", expected: "野口英世"),
            (input: "オタノフナカ", expected: "織田信長"),
        ]
        for expectation in expectations {
            var composingText = ComposingText()
            composingText.insertAtCursorPosition(expectation.input, inputStyle: .direct)
            // Look up the whole input, from its first element through its last.
            let lastIndex = composingText.input.endIndex - 1
            let nodes = store.getLOUDSData(inputData: composingText, from: 0, to: lastIndex, needTypoCorrection: true)
            let matched = nodes.first { $0.data.word == expectation.expected }
            XCTAssertEqual(matched?.data.word, expectation.expected)
        }
    }
    func testGetLOUDSDataInRange() throws {
        let dicdataStore = DicdataStore(convertRequestOptions: requestOptions())
        do {