feat: TypoCorrectionとMovingTowardPrefixSearchを同時実行する関数を追加

This commit is contained in:
Miwa / Ensan
2025-06-29 01:51:08 +09:00
parent 04d7edffdd
commit d636926e83
3 changed files with 97 additions and 45 deletions

View File

@@ -261,6 +261,66 @@ public final class DicdataStore {
return indices
}
/// Runs typo-correction candidate generation and the incremental LOUDS prefix search in one pass.
///
/// Each candidate reading produced by `TypoCorrectionGenerator` is converted to char IDs and fed to
/// one `LOUDS.MovingTowardPrefixSearchHelper` per dictionary key. The same char IDs are also matched
/// against the learning manager's temporary memory.
///
/// - Parameters:
///   - inputs: Input elements handed to `TypoCorrectionGenerator`.
///   - leftIndex: Left index handed to `TypoCorrectionGenerator`.
///   - rightIndexRange: Range of right indices handed to `TypoCorrectionGenerator`.
///   - useMemory: When `true`, the `"memory"` LOUDS is searched in addition to the
///     first-character LOUDS and the `"user"` LOUDS.
/// - Returns: A tuple of
///   `stringToInfo` (info for every candidate that added at least one node index to some LOUDS helper),
///   `indices` (the node indices accumulated per LOUDS key), and
///   `temporaryMemoryDicdata` (temporary-memory entries; unadjusted for zero-penalty candidates,
///   penalty-adjusted otherwise).
func movingTowardPrefixSearch(
    inputs: [ComposingText.InputElement],
    leftIndex: Int,
    rightIndexRange: Range<Int>,
    useMemory: Bool
) -> (
    stringToInfo: [[Character]: (endIndex: Int, penalty: PValue)],
    indices: [(key: String, indices: [Int])],
    temporaryMemoryDicdata: [DicdataElement]
) {
    var generator = TypoCorrectionGenerator(inputs: inputs, leftIndex: leftIndex, rightIndexRange: rightIndexRange)
    var targetLOUDS: [String: LOUDS.MovingTowardPrefixSearchHelper] = [:]
    var stringToInfo: [([Character], (endIndex: Int, penalty: PValue))] = []
    var temporaryMemoryDicdata: [DicdataElement] = []
    // Consume candidates one at a time so the search can proceed while they are being generated.
    while let (characters, info) = generator.next() {
        guard let firstCharacter = characters.first else {
            continue
        }
        let charIDs = characters.map(self.character2charId(_:))
        let keys: [String] = if useMemory {
            [String(firstCharacter), "user", "memory"]
        } else {
            [String(firstCharacter), "user"]
        }
        var updated = false
        for key in keys {
            withMutableValue(&targetLOUDS[key]) { helper in
                // Create the helper for this key the first time a LOUDS can be loaded for it.
                if helper == nil, let louds = self.loadLOUDS(query: key) {
                    helper = LOUDS.MovingTowardPrefixSearchHelper(louds: louds, depth: 0 ..< .max)
                }
                // A key without a loadable LOUDS contributes no update.
                let hasUpdate = helper?.update(target: charIDs) ?? false
                updated = updated || hasUpdate
            }
        }
        // Collect matching entries from temporary memory for the same candidate.
        for data in self.learningManager.temporaryThroughMatch(charIDs: consume charIDs, depth: 0 ..< .max) {
            if info.penalty.isZero {
                // No typo penalty: keep the entry as is. Falling through would append the same
                // entry a second time via `adjustedData(0)`.
                temporaryMemoryDicdata.append(data)
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            // Penalty unit for this entry: half of `getPenalty(data:)`.
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * info.penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: data.ruby.count) {
                continue
            }
            temporaryMemoryDicdata.append(data.adjustedData(adjust))
        }
        // Only candidates that produced new LOUDS node indices are reported back.
        if updated {
            stringToInfo.append((characters, info))
        }
    }
    return (
        // On duplicate keys this keeps the entry with the larger `penalty` (the first one on ties).
        // NOTE(review): confirm that preferring the larger penalty is intended.
        Dictionary(stringToInfo, uniquingKeysWith: {$0.penalty < $1.penalty ? $1 : $0}),
        targetLOUDS.map { ($0.key, $0.value.indices)},
        temporaryMemoryDicdata
    )
}
/// prefixprefix matchLOUDS
/// - Parameters:
/// - query: 1"user"
@@ -318,20 +378,8 @@ public final class DicdataStore {
segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
}
// MARK:
var stringToInfo = TypoCorrection.getRangesWithTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
// MARK:
let stringSet: [([Character], [UInt8])] = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
let depth = minCharIDsCount - 1 ..< maxCharIDsCount
let group = [String: [([Character], [UInt8])]].init(grouping: stringSet, by: {String($0.0.first!)})
var indices = self.movingTowardPrefixSearch(group: group, depth: depth)
if learningManager.enabled {
indices.append(contentsOf: self.movingTowardPrefixSearch(group: ["user": stringSet, "memory": stringSet], depth: depth))
} else {
indices.append(contentsOf: self.movingTowardPrefixSearch(group: ["user": stringSet], depth: depth))
}
var (stringToInfo, indices, dicdata) = self.movingTowardPrefixSearch(inputs: inputData.input, leftIndex: fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight, useMemory: self.learningManager.enabled)
// MARK: Load dictionary entries for the collected indices
var dicdata: [DicdataElement] = []
for (identifier, value) in indices {
let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
let rubyArray = Array(data.ruby)
@@ -349,23 +397,6 @@ public final class DicdataStore {
}
dicdata.append(contentsOf: result)
}
// Also add entries from temporary memory, applying the typo penalty to them
for (_, charIds) in consume stringSet {
for data in self.learningManager.temporaryThroughMatch(charIDs: consume charIds, depth: depth) {
let rubyArray = Array(data.ruby)
let penalty = stringToInfo[rubyArray, default: (0, .zero)].penalty
if penalty.isZero {
dicdata.append(data)
}
let ratio = Self.penaltyRatio[data.lcid]
let pUnit: PValue = Self.getPenalty(data: data) / 2 //
let adjust = pUnit * penalty * ratio
if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
continue
}
dicdata.append(data.adjustedData(adjust))
}
}
for i in toIndexLeft ..< toIndexRight {
do {
@@ -1087,4 +1118,4 @@ public final class DicdataStore {
"w": ["", "ウィ", "ウェ", ""],
"wy": ["", ""]
]
}
}

View File

@@ -42,7 +42,7 @@ struct TypoCorrectionGenerator {
var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
mutating func generate(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>) -> ([Character], (endIndex: Int, penalty: PValue))? {
mutating func next() -> ([Character], (endIndex: Int, penalty: PValue))? {
while let (convertTargetElements, lastElement, count, penalty) = self.stack.popLast() {
var result: ([Character], (endIndex: Int, penalty: PValue))? = nil
if rightIndexRange.contains(count + left - 1) {
@@ -52,14 +52,18 @@ struct TypoCorrectionGenerator {
}
//
if self.nodes.endIndex <= count {
continue
if let result {
return result
}
}
// (3)
if penalty >= maxPenalty {
var convertTargetElements = convertTargetElements
let correct = [inputs[left + count]].map {ComposingText.InputElement(character: $0.character.toKatakana(), inputStyle: $0.inputStyle)}
if count + correct.count > self.nodes.endIndex {
continue
if let result {
return result
}
}
for element in correct {
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)

View File

@@ -240,35 +240,52 @@ package struct LOUDS: Sendable {
//
var targets = targets
targets.sort(by: Self.lexLessThan)
var helper = MovingTowardPrefixSearchHelper(louds: self, depth: depth)
for target in targets {
helper.update(target: target)
}
return helper.indices
}
// Incremental state for a prefix search over a single LOUDS.
// Each call to `update(target:)` walks one target and appends the node indices it newly reaches to `indices`.
struct MovingTowardPrefixSearchHelper {
init(louds: LOUDS, depth: Range<Int>) {
self.louds = louds
self.depth = depth
}
// The LOUDS that is searched.
let louds: LOUDS
// Positions (0-based) within a target whose node indices are recorded.
// Positions at or beyond `depth.upperBound` are never visited.
let depth: Range<Int>
// Node indices collected so far; this is the search output.
var indices: [Int] = []
// Path matched by the most recent target: `stack[i]` is the node reached for the char ID at position i.
var stack: [(nodeIndex: Int, char: UInt8)] = []
for chars in targets {
// Feeds one target (a sequence of char IDs) into the search.
// Returns true when at least one node index was appended to `indices` during this call.
@inlinable mutating func update(target: [UInt8]) -> Bool {
var updated = false
// Walk the target position by position, stopping before `depth.upperBound`.
for (i, char) in chars.enumerated() where i < depth.upperBound {
if i < stack.count, stack[i].char == char {
for (i, char) in target.enumerated() where i < self.depth.upperBound {
if i < self.stack.count, self.stack[i].char == char {
// Position i is already on the stack with the same char ID, so the earlier result is reused.
continue
} else if i < stack.count, stack[i].char != char {
} else if i < self.stack.count, self.stack[i].char != char {
// The stack diverges from the target at position i: drop position i and everything after it.
stack = Array(stack[..<i])
self.stack = Array(self.stack[..<i])
}
// From here on the stack holds no entry for position i.
assert(i >= stack.count, "stack[\(i)] must not exist for logical reason.")
assert(i >= self.stack.count, "stack[\(i)] must not exist for logical reason.")
// Look up the child for `char`, starting from the last node on the stack
// (or from node index 1 when the stack is empty; presumably the root, confirm against LOUDS).
// On success, push the (nodeIndex, char) pair onto the stack.
if let nodeIndex = self.searchCharNodeIndex(from: stack.last?.nodeIndex ?? 1, char: char) {
if depth.contains(i) {
indices.append(nodeIndex)
if let nodeIndex = self.louds.searchCharNodeIndex(from: self.stack.last?.nodeIndex ?? 1, char: char) {
if self.depth.contains(i) {
self.indices.append(nodeIndex)
updated = true
}
stack.append((nodeIndex, char))
self.stack.append((nodeIndex, char))
} else {
// No node for this char ID, so the rest of the target cannot match either.
break
}
}
return updated
}
return indices
}
}