feat: getLOUDSData関数をgetLOUDSDataInRange関数に統合。最適化を行っていない実装にもかかわらず、従来実装の1.2倍程度の高速化効果が得られる

2025-08-22 15:05:26 +00:00 · 2025-06-29 19:52:27 +09:00
parent 14fa82bee9
commit fe2c1ec4ae
4 changed files with 10 additions and 190 deletions
--- a/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
+++ b/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
@@ -430,102 +430,6 @@ public final class DicdataStore {
        }
    }

-    /// kana2latticeから参照する。louds版。
-    /// - Parameters:
-    ///   - inputData: 入力データ
-    ///   - from: 始点
-    ///   - to: 終点
-    public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int, needTypoCorrection: Bool) -> [LatticeNode] {
-        if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
-            return []
-        }
-        let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()
-
-        // TODO: 最適化の余地あり
-        let string2penalty = TypoCorrection.getRangeWithTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndex: toIndex).filter {
-            needTypoCorrection || $0.value == 0.0
-        }
-
-        // MARK: 検索によって得たindicesから辞書データを実際に取り出していく
-        // 先頭の文字: そこで検索したい文字列の集合
-        let strings = string2penalty.keys.map {
-            (key: $0, charIDs: $0.map(self.character2charId))
-        }
-        let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})
-
-        var indices: [(String, Set<Int>)] = group.map {dic in
-            let head = String(dic.key)
-            let set = dic.value.flatMapSet { (_, charIDs) in
-                self.perfectMatchingSearch(query: head, charIDs: charIDs)
-            }
-            return (head, set)
-        }
-        do {
-            let set = strings.flatMapSet { (_, charIDs) in
-                self.perfectMatchingSearch(query: "user", charIDs: charIDs)
-            }
-            indices.append(("user", set))
-        }
-        if learningManager.enabled {
-            let set = strings.flatMapSet { (_, charIDs) in
-                self.perfectMatchingSearch(query: "memory", charIDs: charIDs)
-            }
-            indices.append(("memory", set))
-        }
-        var dicdata: [DicdataElement] = []
-        for (identifier, value) in indices {
-            let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
-                let rubyArray = Array(data.ruby)
-                let penalty = string2penalty[rubyArray, default: .zero]
-                if penalty.isZero {
-                    return data
-                }
-                let ratio = Self.penaltyRatio[data.lcid]
-                let pUnit: PValue = Self.getPenalty(data: data) / 2   // 負の値
-                let adjust = pUnit * penalty * ratio
-                if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
-                    return nil
-                }
-                return data.adjustedData(adjust)
-            }
-            dicdata.append(contentsOf: result)
-        }
-        // temporalな学習結果にpenaltyを加えて追加する
-        for (characters, charIds) in consume strings {
-            for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
-                // perfect matchなので、Array(data.ruby)はcharactersに等しい
-                let penalty = string2penalty[characters, default: .zero]
-                if penalty.isZero {
-                    dicdata.append(data)
-                }
-                let ratio = Self.penaltyRatio[data.lcid]
-                let pUnit: PValue = Self.getPenalty(data: data) / 2   // 負の値
-                let adjust = pUnit * penalty * ratio
-                if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
-                    continue
-                }
-                dicdata.append(data.adjustedData(adjust))
-            }
-        }
-
-        dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: fromIndex ..< toIndex + 1))
-        for segment in string2penalty.keys {
-            dicdata.append(contentsOf: self.getMatchDynamicUserDict(String(segment)))
-        }
-
-        if fromIndex == .zero {
-            let result: [LatticeNode] = dicdata.map {
-                let node = LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)
-                node.prevs.append(RegisteredNode.BOSNode())
-                return node
-            }
-            return result
-        } else {
-            let result: [LatticeNode] = dicdata.map {LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)}
-            return result
-        }
-    }
-
    func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
        do {
            let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)
--- a/Sources/KanaKanjiConverterModule/DicdataStore/TypoCorrection.swift
+++ b/Sources/KanaKanjiConverterModule/DicdataStore/TypoCorrection.swift
@@ -1,6 +1,6 @@
 import SwiftUtils

-struct TypoCorrectionGenerator {
+struct TypoCorrectionGenerator: Sendable {
    init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) {
        self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0
        self.inputs = inputs
@@ -10,12 +10,12 @@ struct TypoCorrectionGenerator {
        let count = rightIndexRange.endIndex - left
        self.count = count
        self.nodes = (0..<count).map {(i: Int) in
-            TypoCorrection.lengths.flatMap {(k: Int) -> [TypoCorrection.TypoCandidate] in
+            Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
                let j = i + k
                if count <= j {
                    return []
                }
-                return TypoCorrection.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
+                return Self.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
            }
        }
        // 深さ優先で列挙する
@@ -38,7 +38,7 @@ struct TypoCorrectionGenerator {
    let inputs: [ComposingText.InputElement]
    let left: Int
    let rightIndexRange: Range<Int>
-    let nodes: [[TypoCorrection.TypoCandidate]]
+    let nodes: [[TypoCandidate]]
    let count: Int

    var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
@@ -116,7 +116,7 @@ struct TypoCorrectionGenerator {
                    for element in $0.inputElements {
                        ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
                    }
-                    if TypoCorrection.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
+                    if Self.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
                        return nil
                    }
                    return (
@@ -134,10 +134,7 @@ struct TypoCorrectionGenerator {
        }
        return nil
    }
-}

-// MARK: 誤り訂正用のAPI
-enum TypoCorrection {
    fileprivate static func shouldBeRemovedForDicdataStore(components: [ComposingText.ConvertTargetElement]) -> Bool {
        // 判定に使うのは最初の1エレメントの最初の文字で十分
        guard let first = components.first?.string.first?.toKatakana() else {
@@ -146,87 +143,6 @@ enum TypoCorrection {
        return !CharacterUtils.isRomanLetter(first) && !DicdataStore.existLOUDS(for: first)
    }

-    static func getRangeWithTypos(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndex right: Int) -> [[Character]: PValue] {
-        // 各iから始まる候補を列挙する
-        // 例えばinput = [d(あ), r(s), r(i), r(t), r(s), d(は), d(は), d(れ)]の場合
-        // nodes =      [[d(あ)], [r(s)], [r(i)], [r(t), [r(t), r(a)]], [r(s)], [d(は), d(ば), d(ぱ)], [d(れ)]]
-        // となる
-        let count = right - left + 1
-        let nodes = (0..<count).map {(i: Int) in
-            Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
-                let j = i + k
-                if count <= j {
-                    return []
-                }
-                return Self.getTypo(inputs[left + i ... left + j])
-            }
-        }
-
-        let maxPenalty: PValue = 3.5 * 3
-
-        // 深さ優先で列挙する
-        var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)] = nodes[0].compactMap { typoCandidate in
-            guard let firstElement = typoCandidate.inputElements.first else {
-                return nil
-            }
-            if ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: left) {
-                var convertTargetElements = [ComposingText.ConvertTargetElement]()
-                for element in typoCandidate.inputElements {
-                    ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
-                }
-                return (convertTargetElements, typoCandidate.inputElements.last!, typoCandidate.inputElements.count, typoCandidate.weight)
-            }
-            return nil
-        }
-
-        var stringToPenalty: [([Character], PValue)] = []
-
-        while let (convertTargetElements, lastElement, count, penalty) = stack.popLast() {
-            if count + left - 1 == right {
-                if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: count + left, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
-                    stringToPenalty.append((convertTarget, penalty))
-                }
-                continue
-            }
-            // エスケープ
-            if nodes.endIndex <= count {
-                continue
-            }
-            // 訂正数上限(3個)
-            if penalty >= maxPenalty {
-                var convertTargetElements = convertTargetElements
-                let correct = [inputs[left + count]].map {ComposingText.InputElement(character: $0.character.toKatakana(), inputStyle: $0.inputStyle)}
-                if count + correct.count > nodes.endIndex {
-                    continue
-                }
-                for element in correct {
-                    ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
-                }
-                stack.append((convertTargetElements, correct.last!, count + correct.count, penalty))
-            } else {
-                stack.append(contentsOf: nodes[count].compactMap {
-                    if count + $0.inputElements.count > nodes.endIndex {
-                        return nil
-                    }
-                    var convertTargetElements = convertTargetElements
-                    for element in $0.inputElements {
-                        ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
-                    }
-                    if Self.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
-                        return nil
-                    }
-                    return (
-                        convertTargetElements: convertTargetElements,
-                        lastElement: $0.inputElements.last!,
-                        count: count + $0.inputElements.count,
-                        penalty: penalty + $0.weight
-                    )
-                })
-            }
-        }
-        return Dictionary(stringToPenalty, uniquingKeysWith: max)
-    }
-
    fileprivate static func getTypo(_ elements: some Collection<ComposingText.InputElement>, frozen: Bool = false) -> [TypoCandidate] {
        let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana()

@@ -289,7 +205,7 @@ enum TypoCorrection {
        }
    }

-    struct TypoCandidate: Equatable {
+    struct TypoCandidate: Sendable, Equatable {
        var inputElements: [ComposingText.InputElement]
        var weight: PValue
    }
--- a/Sources/KanaKanjiConverterModule/Kana2Kanji/added_last_1_character.swift
+++ b/Sources/KanaKanjiConverterModule/Kana2Kanji/added_last_1_character.swift
@@ -35,7 +35,7 @@ extension Kana2Kanji {

        // (1)
        let addedNodes: [[LatticeNode]] = (0...count).map {(i: Int) in
-            self.dicdataStore.getLOUDSData(inputData: inputData, from: i, to: count, needTypoCorrection: needTypoCorrection)
+            self.dicdataStore.getLOUDSDataInRange(inputData: inputData, from: i, toIndexRange: count ..< count+1, needTypoCorrection: needTypoCorrection)
        }

        // ココが一番時間がかかっていた。
--- a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
+++ b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
@@ -129,7 +129,7 @@ final class DicdataStoreTests: XCTestCase {
        for (key, word) in mustWords {
            var c = ComposingText()
            c.insertAtCursorPosition(key, inputStyle: .direct)
-            let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
+            let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
            // 冗長な書き方だが、こうすることで「どの項目でエラーが発生したのか」がはっきりするため、こう書いている。
            XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
        }
@@ -150,7 +150,7 @@ final class DicdataStoreTests: XCTestCase {
        for (key, word) in mustWords {
            var c = ComposingText()
            c.insertAtCursorPosition(key, inputStyle: .direct)
-            let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
+            let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
            XCTAssertNil(result.first(where: {$0.data.word == word && $0.data.ruby == key}))
        }
    }
@@ -170,7 +170,7 @@ final class DicdataStoreTests: XCTestCase {
        for (key, word) in mustWords {
            var c = ComposingText()
            c.insertAtCursorPosition(key, inputStyle: .direct)
-            let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: true)
+            let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: true)
            XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
        }
    }