feat: typo correctionの枝刈りを実装し、direct入力のケースでTestFullConversionの実行速度が1.6倍に向上

This commit is contained in:
Miwa / Ensan
2025-06-29 12:30:34 +09:00
parent b44fc3f3bd
commit eb4e669b2b
5 changed files with 55 additions and 14 deletions

View File

@ -288,17 +288,25 @@ public final class DicdataStore {
[String(firstCharacter), "user"]
}
var updated = false
var availableMaxIndex = 0
for key in keys {
withMutableValue(&targetLOUDS[key]) { helper in
if helper == nil, let louds = self.loadLOUDS(query: key) {
helper = LOUDS.MovingTowardPrefixSearchHelper(louds: louds, depth: 0 ..< .max)
}
let hasUpdate = helper?.update(target: charIDs) ?? false
updated = updated || hasUpdate
guard helper != nil else {
return
}
let result = helper!.update(target: charIDs)
updated = updated || result.updated
availableMaxIndex = max(availableMaxIndex, result.availableMaxIndex)
}
}
//
for data in self.learningManager.movingTowardPrefixSearchOnTemporaryMemory(charIDs: consume charIDs, depth: 0 ..< .max) {
let result = self.learningManager.movingTowardPrefixSearchOnTemporaryMemory(charIDs: consume charIDs, depth: 0 ..< .max)
updated = updated || !(result.dicdata.isEmpty)
availableMaxIndex = max(availableMaxIndex, result.availableMaxIndex)
for data in result.dicdata {
if info.penalty.isZero {
temporaryMemoryDicdata.append(data)
}
@ -310,6 +318,10 @@ public final class DicdataStore {
}
temporaryMemoryDicdata.append(data.adjustedData(adjust))
}
if availableMaxIndex < characters.endIndex - 1 {
//
generator.setUnreachablePath(target: characters[...(availableMaxIndex + 1)])
}
if updated {
stringToInfo.append((characters, info))
}
@ -478,7 +490,7 @@ public final class DicdataStore {
}
if learningManager.enabled {
// temporalpenalty
dicdata.append(contentsOf: self.learningManager.movingTowardPrefixSearchOnTemporaryMemory(charIDs: consume maxIDs, depth: depth))
dicdata.append(contentsOf: self.learningManager.movingTowardPrefixSearchOnTemporaryMemory(charIDs: consume maxIDs, depth: depth).dicdata)
}
for (key, value) in stringToEndIndex {
let convertTarget = String(key)

View File

@ -584,20 +584,22 @@ struct TemporalLearningMemoryTrie {
return nodes[index].dataIndices.map {self.dicdata[$0]}
}
/// Walks the trie from the root along `chars` and collects the entries stored at each visited
/// node whose offset lies within `depth`.
///
/// The walk stops at the first character for which the current node has no child.
/// - Parameters:
///   - chars: The character IDs to follow, starting at the root node (index `0`).
///   - depth: The range of offsets into `chars` at which node data is collected.
/// - Returns: `dicdata` holds the collected entries in visiting order. `availableMaxIndex` is the
///   largest offset into `chars` that was matched by a trie edge (`0` when nothing matched).
func movingTowardPrefixSearch(chars: [UInt8], depth: Range<Int>) -> (dicdata: [DicdataElement], availableMaxIndex: Int) {
    var nodeIndex = 0
    var availableMaxIndex = 0
    var indices: [Int] = []
    for (offset, char) in chars.enumerated() {
        guard let nextIndex = nodes[nodeIndex].children[char] else {
            // No edge for this character: nothing deeper can match.
            break
        }
        // Record the character offset, not the trie node index. The caller merges this value
        // with the LOUDS helper's result (which stores the offset) and compares it against
        // positions in the input characters.
        availableMaxIndex = offset
        nodeIndex = nextIndex
        if depth.contains(offset) {
            indices.append(contentsOf: nodes[nodeIndex].dataIndices)
        }
    }
    return (indices.map { self.dicdata[$0] }, availableMaxIndex)
}
func prefixMatch(chars: [UInt8]) -> [DicdataElement] {
@ -718,9 +720,9 @@ final class LearningManager {
return self.temporaryMemory.perfectMatch(chars: charIDs)
}
/// Runs `movingTowardPrefixSearch` against the temporary memory.
///
/// When `options` is unset, or its learning type does not use memory, an empty result
/// (`[]` with `availableMaxIndex` of `0`) is returned without touching the temporary memory.
/// - Parameters:
///   - charIDs: The character IDs forwarded to the temporary memory as `chars`.
///   - depth: The offset range forwarded to the temporary memory.
/// - Returns: The tuple produced by the temporary memory's `movingTowardPrefixSearch`.
func movingTowardPrefixSearchOnTemporaryMemory(charIDs: [UInt8], depth: Range<Int>) -> (dicdata: [DicdataElement], availableMaxIndex: Int) {
    if let options, options.learningType.needUsingMemory {
        return self.temporaryMemory.movingTowardPrefixSearch(chars: charIDs, depth: depth)
    }
    return ([], 0)
}

View File

@ -42,6 +42,27 @@ struct TypoCorrectionGenerator {
var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
/// Removes from `stack` every pending candidate whose leading direct-input characters
/// begin with `target`.
///
/// Only the leading run of `.direct` elements is inspected. Inspection of a candidate stops
/// at its first `.roman2kana` element, and the candidate is kept unless a match was already
/// found before that point.
/// - Parameter target: The character prefix used to prune candidates.
mutating func setUnreachablePath(target: some Collection<Character>) {
    self.stack.removeAll { candidate in
        var directCharacters: [Character] = []
        for element in candidate.convertTargetElements {
            switch element.inputStyle {
            case .direct:
                directCharacters.append(contentsOf: element.string)
                // Re-check after each appended element so the candidate is dropped
                // as soon as its accumulated characters start with `target`.
                if directCharacters.hasPrefix(target) {
                    return true
                }
            case .roman2kana:
                // TODO: impl
                return false
            }
        }
        return false
    }
}
mutating func next() -> ([Character], (endIndex: Int, penalty: PValue))? {
while let (convertTargetElements, lastElement, count, penalty) = self.stack.popLast() {
var result: ([Character], (endIndex: Int, penalty: PValue))? = nil

View File

@ -242,7 +242,7 @@ package struct LOUDS: Sendable {
targets.sort(by: Self.lexLessThan)
var helper = MovingTowardPrefixSearchHelper(louds: self, depth: depth)
for target in targets {
helper.update(target: target)
_ = helper.update(target: target)
}
return helper.indices
}
@ -258,13 +258,18 @@ package struct LOUDS: Sendable {
var indices: [Int] = []
//
var stack: [(nodeIndex: Int, char: UInt8)] = []
@inlinable mutating func update(target: [UInt8]) -> Bool {
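/// Advances the search to `target`, skipping the leading characters that are already on `stack`.
/// - Parameter target: The sequence of `CharID`s to search for.
/// - Returns: `updated` is `true` when this call appended to `indices`; `availableMaxIndex` is the largest offset in `target` recorded as matched during this call.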
@inlinable mutating func update(target: [UInt8]) -> (updated: Bool, availableMaxIndex: Int) {
var updated = false
var availableMaxIndex = 0
// Only offsets `i` below `depth.upperBound` are examined.
for (i, char) in target.enumerated() where i < self.depth.upperBound {
if i < self.stack.count, self.stack[i].char == char {
//
availableMaxIndex = i
continue
} else if i < self.stack.count, self.stack[i].char != char {
// stack
@ -278,6 +283,7 @@ package struct LOUDS: Sendable {
if self.depth.contains(i) {
self.indices.append(nodeIndex)
updated = true
availableMaxIndex = i
}
self.stack.append((nodeIndex, char))
} else {
@ -285,7 +291,7 @@ package struct LOUDS: Sendable {
break
}
}
return updated
return (updated, availableMaxIndex)
}
}
}

View File

@ -27,7 +27,7 @@ final class TemporalLearningMemoryTrieTests: XCTestCase {
XCTAssertEqual(result1.first?.word, element1.word)
XCTAssertTrue(result1.first?.metadata.contains(.isLearned) ?? false)
let result2 = trie.movingTowardPrefixSearch(chars: chars(for: element2.ruby), depth: (element2.ruby.count - 1)..<element2.ruby.count)
let result2 = trie.movingTowardPrefixSearch(chars: chars(for: element2.ruby), depth: (element2.ruby.count - 1)..<element2.ruby.count).dicdata
XCTAssertEqual(result2.map { $0.word }, [element2.word])
let prefixResult = trie.prefixMatch(chars: chars(for: "テス"))