feat: getLOUDSData関数をgetLOUDSDataInRange関数に統合。最適化を行っていない実装にもかかわらず、従来実装の1.2倍程度の高速化効果が得られる

2025-08-22 15:05:26 +00:00 · 2025-06-29 19:52:27 +09:00
parent 14fa82bee9
commit fe2c1ec4ae
4 changed files with 10 additions and 190 deletions
--- a/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
+++ b/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
@ -430,102 +430,6 @@ public final class DicdataStore {
        }
    }
    /// Looks up the dictionary entries whose reading exactly matches the input span
    /// `fromIndex...toIndex` and wraps them in lattice nodes. LOUDS-backed; referenced from kana2lattice.
    /// - Parameters:
    ///   - inputData: The composing text to look up.
    ///   - fromIndex: Index of the first input element of the span (inclusive).
    ///   - toIndex: Index of the last input element of the span (inclusive).
    ///   - needTypoCorrection: When `false`, only readings whose typo penalty is `0.0` are searched.
    /// - Returns: One `LatticeNode` per collected entry, each covering `fromIndex ..< toIndex + 1`.
    ///   Returns an empty array when `fromIndex > toIndex` or when `toIndex - fromIndex` exceeds `maxlength`.
    public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int, needTypoCorrection: Bool) -> [LatticeNode] {
        if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
            return []
        }
        let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()
        // TODO: there is room for optimization here
        let string2penalty = TypoCorrection.getRangeWithTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndex: toIndex).filter {
            needTypoCorrection || $0.value == 0.0
        }
        // MARK: Resolve the search results (indices) into actual dictionary data
        // Group the strings to search for by their first character.
        let strings = string2penalty.keys.map {
            (key: $0, charIDs: $0.map(self.character2charId))
        }
        let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})
        var indices: [(String, Set<Int>)] = group.map {dic in
            let head = String(dic.key)
            let set = dic.value.flatMapSet { (_, charIDs) in
                self.perfectMatchingSearch(query: head, charIDs: charIDs)
            }
            return (head, set)
        }
        // The "user" identifier is always searched with every candidate string.
        do {
            let set = strings.flatMapSet { (_, charIDs) in
                self.perfectMatchingSearch(query: "user", charIDs: charIDs)
            }
            indices.append(("user", set))
        }
        // The "memory" identifier is searched only while learning is enabled.
        if learningManager.enabled {
            let set = strings.flatMapSet { (_, charIDs) in
                self.perfectMatchingSearch(query: "memory", charIDs: charIDs)
            }
            indices.append(("memory", set))
        }
        var dicdata: [DicdataElement] = []
        for (identifier, value) in indices {
            let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
                let rubyArray = Array(data.ruby)
                let penalty = string2penalty[rubyArray, default: .zero]
                // Entries without a typo penalty are used as they are.
                if penalty.isZero {
                    return data
                }
                let ratio = Self.penaltyRatio[data.lcid]
                let pUnit: PValue = Self.getPenalty(data: data) / 2   // negative value
                let adjust = pUnit * penalty * ratio
                if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                    return nil
                }
                return data.adjustedData(adjust)
            }
            dicdata.append(contentsOf: result)
        }
        // Add the temporary learning results, applying the typo penalty to each.
        for (characters, charIds) in consume strings {
            for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
                // This is a perfect match, so Array(data.ruby) equals `characters`.
                let penalty = string2penalty[characters, default: .zero]
                if penalty.isZero {
                    dicdata.append(data)
                    // Without this `continue` the entry would fall through and be appended a
                    // second time with a zero adjustment; the dictionary loop above returns early too.
                    continue
                }
                let ratio = Self.penaltyRatio[data.lcid]
                let pUnit: PValue = Self.getPenalty(data: data) / 2   // negative value
                let adjust = pUnit * penalty * ratio
                if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
                    continue
                }
                dicdata.append(data.adjustedData(adjust))
            }
        }
        dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: fromIndex ..< toIndex + 1))
        // Every candidate reading is also matched against the dynamic user dictionary.
        for segment in string2penalty.keys {
            dicdata.append(contentsOf: self.getMatchDynamicUserDict(String(segment)))
        }
        if fromIndex == .zero {
            // Nodes that start at index 0 get the BOS node registered as their predecessor.
            let result: [LatticeNode] = dicdata.map {
                let node = LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)
                node.prevs.append(RegisteredNode.BOSNode())
                return node
            }
            return result
        } else {
            let result: [LatticeNode] = dicdata.map {LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)}
            return result
        }
    }
    func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
        do {
            let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)
--- a/Sources/KanaKanjiConverterModule/DicdataStore/TypoCorrection.swift
+++ b/Sources/KanaKanjiConverterModule/DicdataStore/TypoCorrection.swift
@ -1,6 +1,6 @@
 import SwiftUtils
-struct TypoCorrectionGenerator {
+struct TypoCorrectionGenerator: Sendable {
    init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) {
        self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0
        self.inputs = inputs
@ -10,12 +10,12 @@ struct TypoCorrectionGenerator {
        let count = rightIndexRange.endIndex - left
        self.count = count
        self.nodes = (0..<count).map {(i: Int) in
-            TypoCorrection.lengths.flatMap {(k: Int) -> [TypoCorrection.TypoCandidate] in
+            Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
                let j = i + k
                if count <= j {
                    return []
                }
-                return TypoCorrection.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
+                return Self.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
            }
        }
        // 深さ優先で列挙する
@ -38,7 +38,7 @@ struct TypoCorrectionGenerator {
    let inputs: [ComposingText.InputElement]
    let left: Int
    let rightIndexRange: Range<Int>
-    let nodes: [[TypoCorrection.TypoCandidate]]
+    let nodes: [[TypoCandidate]]
    let count: Int
    var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
@ -116,7 +116,7 @@ struct TypoCorrectionGenerator {
                    for element in $0.inputElements {
                        ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
                    }
-                    if TypoCorrection.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
+                    if Self.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
                        return nil
                    }
                    return (
@ -134,10 +134,7 @@ struct TypoCorrectionGenerator {
        }
        return nil
    }
 }
 // MARK: 誤り訂正用のAPI
 enum TypoCorrection {
    fileprivate static func shouldBeRemovedForDicdataStore(components: [ComposingText.ConvertTargetElement]) -> Bool {
        // 判定に使うのは最初の1エレメントの最初の文字で十分
        guard let first = components.first?.string.first?.toKatakana() else {
@ -146,87 +143,6 @@ enum TypoCorrection {
        return !CharacterUtils.isRomanLetter(first) && !DicdataStore.existLOUDS(for: first)
    }
    /// Enumerates the katakana readings obtainable from `inputs[left...right]`, including typo-corrected
    /// variants, together with the accumulated typo weight of each reading.
    /// - Parameters:
    ///   - inputs: All input elements of the composing text.
    ///   - left: Index of the first element of the span (inclusive).
    ///   - right: Index of the last element of the span (inclusive).
    /// - Returns: A dictionary from reading to penalty. When the same reading is produced by
    ///   several paths, the largest penalty is kept.
    static func getRangeWithTypos(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndex right: Int) -> [[Character]: PValue] {
        typealias SearchState = (convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)

        // List the candidates that start at each offset i.
        // For example, with input = [d(あ), r(s), r(i), r(t), r(s), d(は), d(は), d(れ)]
        // nodes =      [[d(あ)], [r(s)], [r(i)], [r(t), [r(t), r(a)]], [r(s)], [d(は), d(ば), d(ぱ)], [d(れ)]]
        let length = right - left + 1
        let nodes: [[TypoCandidate]] = (0..<length).map { (offset: Int) -> [TypoCandidate] in
            var candidates: [TypoCandidate] = []
            for extra in Self.lengths where offset + extra < length {
                candidates.append(contentsOf: Self.getTypo(inputs[left + offset ... left + offset + extra]))
            }
            return candidates
        }
        let maxPenalty: PValue = 3.5 * 3

        // Depth-first enumeration, seeded with the candidates that start at the left edge.
        var stack: [SearchState] = []
        for candidate in nodes[0] {
            guard let firstElement = candidate.inputElements.first else {
                continue
            }
            guard ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: left) else {
                continue
            }
            var elements = [ComposingText.ConvertTargetElement]()
            for element in candidate.inputElements {
                ComposingText.updateConvertTargetElements(currentElements: &elements, newElement: element)
            }
            stack.append((
                convertTargetElements: elements,
                lastElement: candidate.inputElements.last!,
                count: candidate.inputElements.count,
                penalty: candidate.weight
            ))
        }

        var found: [([Character], PValue)] = []
        while let state = stack.popLast() {
            let consumed = state.count
            // The path has consumed the span up to `right`: record it if the right side is valid.
            if consumed + left - 1 == right {
                if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: state.lastElement, of: inputs, to: consumed + left, convertTargetElements: state.convertTargetElements)?.map({$0.toKatakana()}) {
                    found.append((convertTarget, state.penalty))
                }
                continue
            }
            // Escape: nothing left to expand.
            guard consumed < nodes.endIndex else {
                continue
            }
            if state.penalty >= maxPenalty {
                // Correction limit reached (3 corrections): only extend with the input element itself.
                let source = inputs[left + consumed]
                let correct = [ComposingText.InputElement(character: source.character.toKatakana(), inputStyle: source.inputStyle)]
                if consumed + correct.count > nodes.endIndex {
                    continue
                }
                var elements = state.convertTargetElements
                for element in correct {
                    ComposingText.updateConvertTargetElements(currentElements: &elements, newElement: element)
                }
                stack.append((
                    convertTargetElements: elements,
                    lastElement: correct.last!,
                    count: consumed + correct.count,
                    penalty: state.penalty
                ))
            } else {
                for candidate in nodes[consumed] {
                    // Skip candidates that would run past the end of the span.
                    if consumed + candidate.inputElements.count > nodes.endIndex {
                        continue
                    }
                    var elements = state.convertTargetElements
                    for element in candidate.inputElements {
                        ComposingText.updateConvertTargetElements(currentElements: &elements, newElement: element)
                    }
                    if Self.shouldBeRemovedForDicdataStore(components: elements) {
                        continue
                    }
                    stack.append((
                        convertTargetElements: elements,
                        lastElement: candidate.inputElements.last!,
                        count: consumed + candidate.inputElements.count,
                        penalty: state.penalty + candidate.weight
                    ))
                }
            }
        }
        return Dictionary(found, uniquingKeysWith: max)
    }
    fileprivate static func getTypo(_ elements: some Collection<ComposingText.InputElement>, frozen: Bool = false) -> [TypoCandidate] {
        let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana()
@ -289,7 +205,7 @@ enum TypoCorrection {
        }
    }
-    struct TypoCandidate: Equatable {
+    struct TypoCandidate: Sendable, Equatable {
        var inputElements: [ComposingText.InputElement]
        var weight: PValue
    }
--- a/Sources/KanaKanjiConverterModule/Kana2Kanji/added_last_1_character.swift
+++ b/Sources/KanaKanjiConverterModule/Kana2Kanji/added_last_1_character.swift
@ -35,7 +35,7 @@ extension Kana2Kanji {
        // (1)
        let addedNodes: [[LatticeNode]] = (0...count).map {(i: Int) in
-            self.dicdataStore.getLOUDSData(inputData: inputData, from: i, to: count, needTypoCorrection: needTypoCorrection)
+            self.dicdataStore.getLOUDSDataInRange(inputData: inputData, from: i, toIndexRange: count ..< count+1, needTypoCorrection: needTypoCorrection)
        }
        // ココが一番時間がかかっていた。
--- a/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
+++ b/Tests/KanaKanjiConverterModuleWithDefaultDictionaryTests/DicdataStoreTests/DicdataStoreTests.swift
@ -129,7 +129,7 @@ final class DicdataStoreTests: XCTestCase {
        for (key, word) in mustWords {
            var c = ComposingText()
            c.insertAtCursorPosition(key, inputStyle: .direct)
-            let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
+            let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
            // 冗長な書き方だが、こうすることで「どの項目でエラーが発生したのか」がはっきりするため、こう書いている。
            XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
        }
@ -150,7 +150,7 @@ final class DicdataStoreTests: XCTestCase {
        for (key, word) in mustWords {
            var c = ComposingText()
            c.insertAtCursorPosition(key, inputStyle: .direct)
-            let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
+            let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
            XCTAssertNil(result.first(where: {$0.data.word == word && $0.data.ruby == key}))
        }
    }
@ -170,7 +170,7 @@ final class DicdataStoreTests: XCTestCase {
        for (key, word) in mustWords {
            var c = ComposingText()
            c.insertAtCursorPosition(key, inputStyle: .direct)
-            let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: true)
+            let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: true)
            XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
        }
    }