feat: getLOUDSData関数をgetLOUDSDataInRange関数に統合。最適化を行っていない実装にもかかわらず、従来実装の1.2倍程度の高速化効果が得られる

This commit is contained in:
Miwa / Ensan
2025-06-29 19:52:27 +09:00
parent 14fa82bee9
commit fe2c1ec4ae
4 changed files with 10 additions and 190 deletions

View File

@ -430,102 +430,6 @@ public final class DicdataStore {
} }
} }
/// Looks up dictionary entries whose reading matches the input span `fromIndex...toIndex`
/// (or a typo-corrected variant of it) and wraps every entry in a `LatticeNode`.
/// - Parameters:
///   - inputData: The composing text whose `input` elements are searched.
///   - fromIndex: Index of the first input element of the span (inclusive).
///   - toIndex: Index of the last input element of the span (inclusive).
///   - needTypoCorrection: When `false`, only readings with a zero typo penalty are used.
/// - Returns: One node per matching entry, each covering `fromIndex ..< toIndex + 1`.
///   Nodes that start at index 0 get a BOS node as predecessor. Returns `[]` when
///   `fromIndex > toIndex` or when `toIndex - fromIndex` exceeds `maxlength`.
public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int, needTypoCorrection: Bool) -> [LatticeNode] {
    if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
        return []
    }
    // The span as typed, converted to katakana; only used for the "wise" dicdata lookup below.
    let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()
    // TODO: when typo correction is disabled, the penalized candidates are still generated
    // by `getRangeWithTypos` and only discarded by this filter.
    let string2penalty = TypoCorrection.getRangeWithTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndex: toIndex).filter {
        needTypoCorrection || $0.value == 0.0
    }

    // MARK: Collect LOUDS indices for every candidate reading
    // Each candidate reading is paired with its character-ID sequence.
    let strings = string2penalty.keys.map {
        (key: $0, charIDs: $0.map(self.character2charId))
    }
    // Candidates are grouped by their first character, which is used as the search identifier.
    // NOTE(review): `first!` assumes no candidate reading is empty — confirm against getRangeWithTypos.
    let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})
    var indices: [(String, Set<Int>)] = group.map {dic in
        let head = String(dic.key)
        let set = dic.value.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: head, charIDs: charIDs)
        }
        return (head, set)
    }
    // The "user" identifier is always searched with every candidate.
    do {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: "user", charIDs: charIDs)
        }
        indices.append(("user", set))
    }
    // The "memory" identifier is searched only while learning is enabled.
    if learningManager.enabled {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: "memory", charIDs: charIDs)
        }
        indices.append(("memory", set))
    }

    // MARK: Load the entries and apply the typo penalty
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = string2penalty[rubyArray, default: .zero]
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            // Half of the entry's own penalty serves as the unit for the adjustment.
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }

    // Temporary learning results get the same penalty treatment.
    for (characters, charIds) in consume strings {
        // Because this is a perfect match, the entry's ruby is taken to equal `characters`,
        // so the penalty is looked up once per candidate instead of once per entry.
        let penalty = string2penalty[characters, default: .zero]
        for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
            if penalty.isZero {
                dicdata.append(data)
                // Without this `continue` the entry would fall through and be appended a
                // second time as `adjustedData(0)`.
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }

    let inputRange = fromIndex ..< toIndex + 1
    dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: inputRange))
    for reading in string2penalty.keys {
        dicdata.append(contentsOf: self.getMatchDynamicUserDict(String(reading)))
    }

    // Nodes that start at the beginning of the input are linked to the BOS node.
    let startsAtBeginning = fromIndex == .zero
    return dicdata.map {
        let node = LatticeNode(data: $0, inputRange: inputRange)
        if startsAtBeginning {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
}
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] { func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
do { do {
let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8) let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)

View File

@ -1,6 +1,6 @@
import SwiftUtils import SwiftUtils
struct TypoCorrectionGenerator { struct TypoCorrectionGenerator: Sendable {
init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) { init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) {
self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0 self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0
self.inputs = inputs self.inputs = inputs
@ -10,12 +10,12 @@ struct TypoCorrectionGenerator {
let count = rightIndexRange.endIndex - left let count = rightIndexRange.endIndex - left
self.count = count self.count = count
self.nodes = (0..<count).map {(i: Int) in self.nodes = (0..<count).map {(i: Int) in
TypoCorrection.lengths.flatMap {(k: Int) -> [TypoCorrection.TypoCandidate] in Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
let j = i + k let j = i + k
if count <= j { if count <= j {
return [] return []
} }
return TypoCorrection.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection) return Self.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
} }
} }
// //
@ -38,7 +38,7 @@ struct TypoCorrectionGenerator {
let inputs: [ComposingText.InputElement] let inputs: [ComposingText.InputElement]
let left: Int let left: Int
let rightIndexRange: Range<Int> let rightIndexRange: Range<Int>
let nodes: [[TypoCorrection.TypoCandidate]] let nodes: [[TypoCandidate]]
let count: Int let count: Int
var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)] var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
@ -116,7 +116,7 @@ struct TypoCorrectionGenerator {
for element in $0.inputElements { for element in $0.inputElements {
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element) ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
} }
if TypoCorrection.shouldBeRemovedForDicdataStore(components: convertTargetElements) { if Self.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
return nil return nil
} }
return ( return (
@ -134,10 +134,7 @@ struct TypoCorrectionGenerator {
} }
return nil return nil
} }
}
// MARK: API
enum TypoCorrection {
fileprivate static func shouldBeRemovedForDicdataStore(components: [ComposingText.ConvertTargetElement]) -> Bool { fileprivate static func shouldBeRemovedForDicdataStore(components: [ComposingText.ConvertTargetElement]) -> Bool {
// 使1 // 使1
guard let first = components.first?.string.first?.toKatakana() else { guard let first = components.first?.string.first?.toKatakana() else {
@ -146,87 +143,6 @@ enum TypoCorrection {
return !CharacterUtils.isRomanLetter(first) && !DicdataStore.existLOUDS(for: first) return !CharacterUtils.isRomanLetter(first) && !DicdataStore.existLOUDS(for: first)
} }
/// Enumerates the readings obtainable from `inputs[left...right]` when typo candidates are
/// substituted, together with the accumulated penalty of each reading.
/// - Parameters:
///   - inputs: The full sequence of input elements.
///   - left: Index of the first element of the span (inclusive).
///   - right: Index of the last element of the span (inclusive).
/// - Returns: A dictionary from katakana reading to penalty. When the same reading is
///   produced more than once, the larger penalty is kept. Returns an empty dictionary
///   when `right < left`.
static func getRangeWithTypos(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndex right: Int) -> [[Character]: PValue] {
    // An empty or inverted span has no readings; bail out before `0..<count` or `nodes[0]` can trap.
    guard left <= right else {
        return [:]
    }
    // nodes[i] holds every typo candidate that starts at offset i of the span, for each
    // candidate length offset listed in `Self.lengths`.
    let count = right - left + 1
    let nodes = (0..<count).map {(i: Int) in
        Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
            let j = i + k
            if count <= j {
                return []
            }
            return Self.getTypo(inputs[left + i ... left + j])
        }
    }
    let maxPenalty: PValue = 3.5 * 3
    // Depth-first search; seeded with the candidates that start at the left edge of the span.
    var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)] = nodes[0].compactMap { typoCandidate in
        guard let firstElement = typoCandidate.inputElements.first,
              let lastElement = typoCandidate.inputElements.last else {
            return nil
        }
        guard ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: left) else {
            return nil
        }
        var convertTargetElements = [ComposingText.ConvertTargetElement]()
        for element in typoCandidate.inputElements {
            ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
        }
        return (convertTargetElements, lastElement, typoCandidate.inputElements.count, typoCandidate.weight)
    }
    var stringToPenalty: [([Character], PValue)] = []
    while let (convertTargetElements, lastElement, consumed, penalty) = stack.popLast() {
        // The whole span has been consumed: record the reading if its right edge is valid.
        if consumed + left - 1 == right {
            if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: consumed + left, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
                stringToPenalty.append((convertTarget, penalty))
            }
            continue
        }
        // Nothing left to extend with.
        if nodes.endIndex <= consumed {
            continue
        }
        if penalty >= maxPenalty {
            // Penalty budget exhausted: extend only with the element as typed (converted to
            // katakana), adding no further penalty. `consumed < nodes.endIndex` holds here,
            // so one more element always fits.
            let typed = inputs[left + consumed]
            let correct = ComposingText.InputElement(character: typed.character.toKatakana(), inputStyle: typed.inputStyle)
            var extended = convertTargetElements
            ComposingText.updateConvertTargetElements(currentElements: &extended, newElement: correct)
            stack.append((extended, correct, consumed + 1, penalty))
        } else {
            stack.append(contentsOf: nodes[consumed].compactMap {
                // Skip candidates that are empty or would run past the end of the span.
                guard let last = $0.inputElements.last,
                      consumed + $0.inputElements.count <= nodes.endIndex else {
                    return nil
                }
                var extended = convertTargetElements
                for element in $0.inputElements {
                    ComposingText.updateConvertTargetElements(currentElements: &extended, newElement: element)
                }
                // Prune prefixes that `shouldBeRemovedForDicdataStore` rejects.
                if Self.shouldBeRemovedForDicdataStore(components: extended) {
                    return nil
                }
                return (
                    convertTargetElements: extended,
                    lastElement: last,
                    count: consumed + $0.inputElements.count,
                    penalty: penalty + $0.weight
                )
            })
        }
    }
    return Dictionary(stringToPenalty, uniquingKeysWith: max)
}
fileprivate static func getTypo(_ elements: some Collection<ComposingText.InputElement>, frozen: Bool = false) -> [TypoCandidate] { fileprivate static func getTypo(_ elements: some Collection<ComposingText.InputElement>, frozen: Bool = false) -> [TypoCandidate] {
let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana() let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana()
@ -289,7 +205,7 @@ enum TypoCorrection {
} }
} }
struct TypoCandidate: Equatable { struct TypoCandidate: Sendable, Equatable {
var inputElements: [ComposingText.InputElement] var inputElements: [ComposingText.InputElement]
var weight: PValue var weight: PValue
} }

View File

@ -35,7 +35,7 @@ extension Kana2Kanji {
// (1) // (1)
let addedNodes: [[LatticeNode]] = (0...count).map {(i: Int) in let addedNodes: [[LatticeNode]] = (0...count).map {(i: Int) in
self.dicdataStore.getLOUDSData(inputData: inputData, from: i, to: count, needTypoCorrection: needTypoCorrection) self.dicdataStore.getLOUDSDataInRange(inputData: inputData, from: i, toIndexRange: count ..< count+1, needTypoCorrection: needTypoCorrection)
} }
// //

View File

@ -129,7 +129,7 @@ final class DicdataStoreTests: XCTestCase {
for (key, word) in mustWords { for (key, word) in mustWords {
var c = ComposingText() var c = ComposingText()
c.insertAtCursorPosition(key, inputStyle: .direct) c.insertAtCursorPosition(key, inputStyle: .direct)
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false) let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
// //
XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word) XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
} }
@ -150,7 +150,7 @@ final class DicdataStoreTests: XCTestCase {
for (key, word) in mustWords { for (key, word) in mustWords {
var c = ComposingText() var c = ComposingText()
c.insertAtCursorPosition(key, inputStyle: .direct) c.insertAtCursorPosition(key, inputStyle: .direct)
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false) let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
XCTAssertNil(result.first(where: {$0.data.word == word && $0.data.ruby == key})) XCTAssertNil(result.first(where: {$0.data.word == word && $0.data.ruby == key}))
} }
} }
@ -170,7 +170,7 @@ final class DicdataStoreTests: XCTestCase {
for (key, word) in mustWords { for (key, word) in mustWords {
var c = ComposingText() var c = ComposingText()
c.insertAtCursorPosition(key, inputStyle: .direct) c.insertAtCursorPosition(key, inputStyle: .direct)
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: true) let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: true)
XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word) XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
} }
} }