mirror of
https://github.com/mii443/AzooKeyKanaKanjiConverter.git
synced 2025-08-22 15:05:26 +00:00
feat: getLOUDSData関数をgetLOUDSDataInRange関数に統合。最適化を行っていない実装にもかかわらず、従来実装の1.2倍程度の高速化効果が得られる
This commit is contained in:
@ -430,102 +430,6 @@ public final class DicdataStore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Looks up dictionary entries for the input span `fromIndex...toIndex` through the LOUDS search
/// and wraps every hit in a `LatticeNode`. Referenced from kana2lattice (LOUDS version).
/// - Parameters:
///   - inputData: The composing text whose `input` elements are searched.
///   - fromIndex: Start index (inclusive) into `inputData.input`.
///   - toIndex: End index (inclusive) into `inputData.input`.
///   - needTypoCorrection: When `false`, only readings whose typo penalty is exactly `0.0` are searched.
/// - Returns: One node per dictionary entry found, each with `inputRange` set to `fromIndex ..< toIndex + 1`.
///   Returns an empty array when `fromIndex > toIndex` or when `toIndex - fromIndex` exceeds `maxlength`.
public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int, needTypoCorrection: Bool) -> [LatticeNode] {
    if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
        return []
    }
    let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()

    // TODO: there is room for optimization here
    let string2penalty = TypoCorrection.getRangeWithTypos(inputs: inputData.input, leftIndex: fromIndex, rightIndex: toIndex).filter {
        needTypoCorrection || $0.value == 0.0
    }

    // MARK: Fetch the actual dictionary data for the indices obtained by the search
    // Group the candidate readings by their first character: head character -> readings to search there.
    let strings = string2penalty.keys.map {
        (key: $0, charIDs: $0.map(self.character2charId))
    }
    // NOTE(review): `first!` assumes no candidate reading is empty — confirm against getRangeWithTypos.
    let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})

    var indices: [(String, Set<Int>)] = group.map {dic in
        let head = String(dic.key)
        let set = dic.value.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: head, charIDs: charIDs)
        }
        return (head, set)
    }
    // The "user" identifier is always searched with every candidate reading.
    do {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: "user", charIDs: charIDs)
        }
        indices.append(("user", set))
    }
    // The "memory" identifier is searched only while learning is enabled.
    if learningManager.enabled {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchingSearch(query: "memory", charIDs: charIDs)
        }
        indices.append(("memory", set))
    }
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = string2penalty[rubyArray, default: .zero]
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2 // negative value
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }
    // Add the temporary learning results, applying the typo penalty to each.
    for (characters, charIds) in consume strings {
        for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
            // Perfect match, so Array(data.ruby) equals `characters`.
            let penalty = string2penalty[characters, default: .zero]
            if penalty.isZero {
                dicdata.append(data)
                // Skip the penalty path: without this the same entry would be appended
                // a second time below with a zero adjustment. The LOUDS loop above
                // returns early in the same situation.
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2 // negative value
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }

    dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: fromIndex ..< toIndex + 1))
    for segment in string2penalty.keys {
        dicdata.append(contentsOf: self.getMatchDynamicUserDict(String(segment)))
    }

    if fromIndex == .zero {
        // Nodes that start at the beginning of the input get the BOS node as predecessor.
        let result: [LatticeNode] = dicdata.map {
            let node = LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)
            node.prevs.append(RegisteredNode.BOSNode())
            return node
        }
        return result
    } else {
        let result: [LatticeNode] = dicdata.map {LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)}
        return result
    }
}
|
||||
|
||||
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
|
||||
do {
|
||||
let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)
|
||||
|
@ -1,6 +1,6 @@
|
||||
import SwiftUtils
|
||||
|
||||
struct TypoCorrectionGenerator {
|
||||
struct TypoCorrectionGenerator: Sendable {
|
||||
init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) {
|
||||
self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0
|
||||
self.inputs = inputs
|
||||
@ -10,12 +10,12 @@ struct TypoCorrectionGenerator {
|
||||
let count = rightIndexRange.endIndex - left
|
||||
self.count = count
|
||||
self.nodes = (0..<count).map {(i: Int) in
|
||||
TypoCorrection.lengths.flatMap {(k: Int) -> [TypoCorrection.TypoCandidate] in
|
||||
Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
|
||||
let j = i + k
|
||||
if count <= j {
|
||||
return []
|
||||
}
|
||||
return TypoCorrection.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
|
||||
return Self.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
|
||||
}
|
||||
}
|
||||
// 深さ優先で列挙する
|
||||
@ -38,7 +38,7 @@ struct TypoCorrectionGenerator {
|
||||
let inputs: [ComposingText.InputElement]
|
||||
let left: Int
|
||||
let rightIndexRange: Range<Int>
|
||||
let nodes: [[TypoCorrection.TypoCandidate]]
|
||||
let nodes: [[TypoCandidate]]
|
||||
let count: Int
|
||||
|
||||
var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
|
||||
@ -116,7 +116,7 @@ struct TypoCorrectionGenerator {
|
||||
for element in $0.inputElements {
|
||||
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
|
||||
}
|
||||
if TypoCorrection.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
|
||||
if Self.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
|
||||
return nil
|
||||
}
|
||||
return (
|
||||
@ -134,10 +134,7 @@ struct TypoCorrectionGenerator {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: 誤り訂正用のAPI
|
||||
enum TypoCorrection {
|
||||
fileprivate static func shouldBeRemovedForDicdataStore(components: [ComposingText.ConvertTargetElement]) -> Bool {
|
||||
// 判定に使うのは最初の1エレメントの最初の文字で十分
|
||||
guard let first = components.first?.string.first?.toKatakana() else {
|
||||
@ -146,87 +143,6 @@ enum TypoCorrection {
|
||||
return !CharacterUtils.isRomanLetter(first) && !DicdataStore.existLOUDS(for: first)
|
||||
}
|
||||
|
||||
/// Enumerates the readings that `inputs[left...right]` can produce, including typo-corrected
/// variants, together with the penalty accumulated for each reading.
///
/// Candidates starting at each offset `i` are listed first. For example, with
/// input = [d(あ), r(s), r(i), r(t), r(s), d(は), d(は), d(れ)] this gives
/// nodes = [[d(あ)], [r(s)], [r(i)], [r(t), [r(t), r(a)]], [r(s)], [d(は), d(ば), d(ぱ)], [d(れ)]].
/// The candidates are then combined by a depth-first enumeration.
/// - Parameters:
///   - inputs: The full input element sequence. `left` and `right` are assumed to be valid indices into it.
///   - left: Start index (inclusive).
///   - right: End index (inclusive).
/// - Returns: A map from katakana reading to penalty. When the same reading is produced more than once,
///   the larger penalty is kept. Returns an empty dictionary when `right < left`.
static func getRangeWithTypos(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndex right: Int) -> [[Character]: PValue] {
    // An inverted range has no candidates; previously this trapped on `nodes[0]` or on the range construction.
    guard left <= right else {
        return [:]
    }
    let count = right - left + 1
    let nodes = (0..<count).map {(i: Int) in
        Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
            let j = i + k
            if count <= j {
                return []
            }
            return Self.getTypo(inputs[left + i ... left + j])
        }
    }

    let maxPenalty: PValue = 3.5 * 3

    // Enumerate depth-first, seeding the stack with the candidates that start at `left`.
    var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)] = nodes[0].compactMap { typoCandidate in
        guard let firstElement = typoCandidate.inputElements.first,
              let lastElement = typoCandidate.inputElements.last else {
            return nil
        }
        if ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: left) {
            var convertTargetElements = [ComposingText.ConvertTargetElement]()
            for element in typoCandidate.inputElements {
                ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
            }
            return (convertTargetElements, lastElement, typoCandidate.inputElements.count, typoCandidate.weight)
        }
        return nil
    }

    var stringToPenalty: [([Character], PValue)] = []

    while let (convertTargetElements, lastElement, count, penalty) = stack.popLast() {
        // The path has consumed exactly up to `right`: record it if the right side is valid.
        if count + left - 1 == right {
            if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: count + left, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
                stringToPenalty.append((convertTarget, penalty))
            }
            continue
        }
        // Escape: nothing left to consume.
        if nodes.endIndex <= count {
            continue
        }
        // Correction limit reached (3 corrections): continue with the uncorrected input only.
        if penalty >= maxPenalty {
            var convertTargetElements = convertTargetElements
            let correct = [inputs[left + count]].map {ComposingText.InputElement(character: $0.character.toKatakana(), inputStyle: $0.inputStyle)}
            if count + correct.count > nodes.endIndex {
                continue
            }
            for element in correct {
                ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
            }
            stack.append((convertTargetElements, correct.last!, count + correct.count, penalty))
        } else {
            stack.append(contentsOf: nodes[count].compactMap {
                if count + $0.inputElements.count > nodes.endIndex {
                    return nil
                }
                // A candidate without elements cannot advance the path; skip it instead of trapping on `last!`.
                guard let lastInputElement = $0.inputElements.last else {
                    return nil
                }
                var convertTargetElements = convertTargetElements
                for element in $0.inputElements {
                    ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
                }
                if Self.shouldBeRemovedForDicdataStore(components: convertTargetElements) {
                    return nil
                }
                return (
                    convertTargetElements: convertTargetElements,
                    lastElement: lastInputElement,
                    count: count + $0.inputElements.count,
                    penalty: penalty + $0.weight
                )
            })
        }
    }
    return Dictionary(stringToPenalty, uniquingKeysWith: max)
}
|
||||
|
||||
fileprivate static func getTypo(_ elements: some Collection<ComposingText.InputElement>, frozen: Bool = false) -> [TypoCandidate] {
|
||||
let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana()
|
||||
|
||||
@ -289,7 +205,7 @@ enum TypoCorrection {
|
||||
}
|
||||
}
|
||||
|
||||
struct TypoCandidate: Equatable {
|
||||
struct TypoCandidate: Sendable, Equatable {
|
||||
var inputElements: [ComposingText.InputElement]
|
||||
var weight: PValue
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ extension Kana2Kanji {
|
||||
|
||||
// (1)
|
||||
let addedNodes: [[LatticeNode]] = (0...count).map {(i: Int) in
|
||||
self.dicdataStore.getLOUDSData(inputData: inputData, from: i, to: count, needTypoCorrection: needTypoCorrection)
|
||||
self.dicdataStore.getLOUDSDataInRange(inputData: inputData, from: i, toIndexRange: count ..< count+1, needTypoCorrection: needTypoCorrection)
|
||||
}
|
||||
|
||||
// ココが一番時間がかかっていた。
|
||||
|
@ -129,7 +129,7 @@ final class DicdataStoreTests: XCTestCase {
|
||||
for (key, word) in mustWords {
|
||||
var c = ComposingText()
|
||||
c.insertAtCursorPosition(key, inputStyle: .direct)
|
||||
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
|
||||
let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
|
||||
// 冗長な書き方だが、こうすることで「どの項目でエラーが発生したのか」がはっきりするため、こう書いている。
|
||||
XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
|
||||
}
|
||||
@ -150,7 +150,7 @@ final class DicdataStoreTests: XCTestCase {
|
||||
for (key, word) in mustWords {
|
||||
var c = ComposingText()
|
||||
c.insertAtCursorPosition(key, inputStyle: .direct)
|
||||
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: false)
|
||||
let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: false)
|
||||
XCTAssertNil(result.first(where: {$0.data.word == word && $0.data.ruby == key}))
|
||||
}
|
||||
}
|
||||
@ -170,7 +170,7 @@ final class DicdataStoreTests: XCTestCase {
|
||||
for (key, word) in mustWords {
|
||||
var c = ComposingText()
|
||||
c.insertAtCursorPosition(key, inputStyle: .direct)
|
||||
let result = dicdataStore.getLOUDSData(inputData: c, from: 0, to: c.input.endIndex - 1, needTypoCorrection: true)
|
||||
let result = dicdataStore.getLOUDSDataInRange(inputData: c, from: 0, toIndexRange: c.input.endIndex - 1 ..< c.input.endIndex, needTypoCorrection: true)
|
||||
XCTAssertEqual(result.first(where: {$0.data.word == word})?.data.word, word)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user