perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現 (#208)

* perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現

* fix: bug

* test: add typo correction test

* chore: finalize impl
This commit is contained in:
Miwa
2025-06-27 22:32:46 +09:00
committed by GitHub
parent f60a9a8738
commit f5037e393c
3 changed files with 99 additions and 11 deletions

View File

@ -243,6 +243,23 @@ public final class DicdataStore {
return Array(result[min(depth.lowerBound + 1, result.endIndex) ..< min(depth.upperBound + 1, result.endIndex)])
}
/// Runs a bulk through-match lookup against every LOUDS named in `group`.
///
/// - Parameters:
///   - group: Maps a LOUDS identifier (the `query` handed to `loadLOUDS`) to the `(characters, charIDs)` pairs to look up in that LOUDS. Only the charID half of each pair is used here.
///   - depth: Range of 0-based character positions, forwarded unchanged to `byfixNodeIndices(targets:depth:)`.
/// - Returns: One `(key, indices)` entry per key of `group`. `indices` is empty when the LOUDS for that key cannot be loaded.
private func throughMatchLOUDS(group: [String: [([Character], [UInt8])]], depth: Range<Int>) -> [(key: String, indices: Set<Int>)] {
    var matches: [(key: String, indices: Set<Int>)] = []
    matches.reserveCapacity(group.count)
    for (identifier, entries) in group {
        guard let louds = self.loadLOUDS(query: identifier) else {
            // The dictionary is unavailable: still report the key, with no hits.
            matches.append((key: identifier, indices: []))
            continue
        }
        // Hand every charID sequence for this LOUDS over in one call.
        let charIDLists = entries.map { $0.1 }
        let nodeIndices = louds.byfixNodeIndices(targets: charIDLists, depth: depth)
        matches.append((key: identifier, indices: Set(nodeIndices)))
    }
    return matches
}
private func prefixMatchLOUDS(query: String, charIDs: [UInt8], depth: Int = .max, maxCount: Int = .max) -> [Int] {
guard let louds = self.loadLOUDS(query: query) else {
return []
@ -292,20 +309,15 @@ public final class DicdataStore {
// MARK:
var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
// MARK:
let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
let stringSet: [([Character], [UInt8])] = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
// :
let group = [Character: [([Character], [UInt8])]].init(grouping: stringSet, by: {$0.0.first!})
let depth = minCharIDsCount - 1 ..< maxCharIDsCount
var indices: [(String, Set<Int>)] = group.map {dic in
let key = String(dic.key)
let set = dic.value.flatMapSet {(_, charIDs) in self.throughMatchLOUDS(query: key, charIDs: charIDs, depth: depth)}
return (key, set)
}
indices.append(("user", stringSet.flatMapSet {self.throughMatchLOUDS(query: "user", charIDs: $0.1, depth: depth)}))
let group = [String: [([Character], [UInt8])]].init(grouping: stringSet, by: {String($0.0.first!)})
var indices = self.throughMatchLOUDS(group: group, depth: depth)
if learningManager.enabled {
indices.append(("memory", stringSet.flatMapSet {self.throughMatchLOUDS(query: "memory", charIDs: $0.1, depth: depth)}))
indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet, "memory": stringSet], depth: depth))
} else {
indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet], depth: depth))
}
// MARK: indices
var dicdata: [DicdataElement] = []

View File

@ -216,4 +216,60 @@ package struct LOUDS: Sendable {
}
return indices
}
/// Returns `true` when `lhs` sorts strictly before `rhs` in lexicographic order.
///
/// Elements are compared pairwise from the front. When one array is a proper
/// prefix of the other, the shorter array sorts first. Equal arrays yield `false`.
private static func lexLessThan(_ lhs: [UInt8], _ rhs: [UInt8]) -> Bool {
    return lhs.lexicographicallyPrecedes(rhs)
}
/// Collects node indices for many charID sequences in one pass.
///
/// The targets are sorted lexicographically so that targets sharing a prefix
/// are adjacent. Nodes already resolved for the shared prefix stay on a stack
/// and are reused instead of being searched again.
///
/// - Parameters:
///   - targets: The charID sequences to look up.
///   - depth: Range of 0-based character positions whose node indices are collected. Positions at or beyond `depth.upperBound` are never searched.
/// - Returns: The node index of every newly resolved position that lies inside `depth`. A prefix reused from the previous target is not reported again.
/// - Note: When a character cannot be found, the remainder of that target is abandoned.
@inlinable func byfixNodeIndices(targets: [[UInt8]], depth: Range<Int>) -> [Int] {
    // Sort so that consecutive targets share the longest possible prefix.
    var sortedTargets = targets
    sortedTargets.sort(by: Self.lexLessThan)
    // Clamp at 0 because `prefix(_:)` requires a non-negative length.
    let maxLength = Swift.max(depth.upperBound, 0)
    // Node indices gathered so far.
    var indices: [Int] = []
    // Nodes resolved for the current prefix; `stack[i]` belongs to position `i`.
    var stack: [(nodeIndex: Int, char: UInt8)] = []
    for chars in sortedTargets {
        for (i, char) in chars.prefix(maxLength).enumerated() {
            if i < stack.count {
                if stack[i].char == char {
                    // Same prefix as the previous target: reuse the resolved node.
                    continue
                }
                // The prefix diverges here: drop position `i` and everything deeper, in place.
                stack.removeSubrange(i...)
            }
            assert(i >= stack.count, "stack[\(i)] must not exist for logical reason.")
            // Search below the deepest resolved node, or from node 1 when the
            // stack is empty (presumably the root — TODO confirm).
            guard let nodeIndex = self.searchCharNodeIndex(from: stack.last?.nodeIndex ?? 1, char: char) else {
                // No such child: nothing deeper can match for this target.
                break
            }
            if depth.contains(i) {
                indices.append(nodeIndex)
            }
            stack.append((nodeIndex, char))
        }
    }
    return indices
}
}

View File

@ -155,6 +155,26 @@ final class DicdataStoreTests: XCTestCase {
}
}
/// Checks that a lookup with typo correction enabled returns the expected word
/// for each reading below (e.g. readings typed without voiced marks or with
/// full-size kana in place of small kana).
func testMustCorrectTypo() throws {
    let dicdataStore = DicdataStore(convertRequestOptions: requestOptions())
    // (typed reading, word that must appear among the results)
    let mustWords = [
        ("タイカクセイ", "大学生"),
        ("シヨック", "ショック"),
        ("キヨクイン", "局員"),
        ("シヨーク", "ジョーク"),
        ("サリカニ", "ザリガニ"),
        ("ノクチヒテヨ", "野口英世"),
        ("オタノフナカ", "織田信長"),
    ]
    for (key, word) in mustWords {
        var composingText = ComposingText()
        composingText.insertAtCursorPosition(key, inputStyle: .direct)
        let result = dicdataStore.getLOUDSData(inputData: composingText, from: 0, to: composingText.input.endIndex - 1, needTypoCorrection: true)
        // Name the failing pair so a failure inside the loop is identifiable.
        XCTAssertEqual(
            result.first(where: {$0.data.word == word})?.data.word,
            word,
            "\(key) should be corrected to \(word)"
        )
    }
}
func testGetLOUDSDataInRange() throws {
let dicdataStore = DicdataStore(convertRequestOptions: requestOptions())
do {