Uniquify candidates

This commit is contained in:
ensan-hcl
2023-09-19 22:30:48 +09:00
parent f5cfe1cf11
commit 5ec51132ff
4 changed files with 157 additions and 121 deletions

View File

@@ -116,6 +116,27 @@ import SwiftUtils
}
return result
}
/// Removes duplicate prediction candidates so that each `text` appears at most once.
///
/// Candidates whose `text` is empty, or is already contained in `seenCandidates`, are dropped.
/// When several candidates share the same `text`, the one with the strictly greatest `value`
/// is kept, at the position where that `text` first appeared.
/// - Parameters:
///   - candidates: The candidates to make unique.
///   - seenCandidates: Texts that must not appear in the result. Defaults to an empty set.
/// - Returns: The surviving elements of `candidates`, in first-appearance order of their `text`.
private func getUniquePredictionCandidate(_ candidates: some Sequence<PredictionCandidate>, seenCandidates: Set<String> = []) -> [PredictionCandidate] {
    var result = [PredictionCandidate]()
    // Maps each accepted text to its index in `result`, so a duplicate is located
    // with a hash lookup instead of rescanning the whole array for every candidate.
    var indexByText = [String: Int]()
    for candidate in candidates where !candidate.text.isEmpty && !seenCandidates.contains(candidate.text) {
        if let index = indexByText[candidate.text] {
            // Same text seen before: replace only when the new candidate scores strictly higher.
            if result[index].value < candidate.value {
                result[index] = candidate
            }
        } else {
            indexByText[candidate.text] = result.count
            result.append(candidate)
        }
    }
    return result
}
///
/// - Parameters:
/// - inputData:
@@ -596,10 +617,14 @@ import SwiftUtils
///
public func requestPredictionCandidates(leftSideCandidate: Candidate, options: ConvertRequestOptions) -> [PredictionCandidate] {
var seenCandidates: Set<String> = []
//
let zeroHintResults = self.converter.getZeroHintPredictionCandidates(preparts: [leftSideCandidate], N_best: 10)
let zeroHintResults = self.getUniquePredictionCandidate(self.converter.getZeroHintPredictionCandidates(preparts: [leftSideCandidate], N_best: 15))
seenCandidates.formUnion(zeroHintResults.map{$0.text})
//
let predictionResults = self.converter.getPredictionCandidates(prepart: leftSideCandidate, N_best: 10)
let predictionResults = self.getUniquePredictionCandidate(self.converter.getPredictionCandidates(prepart: leftSideCandidate, N_best: 15), seenCandidates: seenCandidates)
seenCandidates.formUnion(predictionResults.map{$0.text})
//
// TODO: implement
//

View File

@@ -43,120 +43,4 @@ struct Kana2Kanji {
data: data.data
)
}
/// Joins `left` and `right` into a single candidate.
///
/// The texts and data arrays are concatenated, the corresponding counts are summed, and
/// `lastMid` is taken from `right`. When both sides have data, the merged value also includes
/// the CC value across the junction and, if `DicdataStore.includeMMValueCalculation` accepts
/// the first element of `right`, the MM value between `left.lastMid` and that element's `mid`.
/// - Parameters:
///   - left: The candidate placed first.
///   - right: The candidate appended after `left`.
/// - Returns: The merged candidate.
public func mergeCandidates(_ left: Candidate, _ right: Candidate) -> Candidate {
    guard let leftLast = left.data.last, let rightFirst = right.data.first else {
        // Without data on one side there is no junction to score; only the two values are added.
        return Candidate(
            text: left.text + right.text,
            value: left.value + right.value,
            correspondingCount: left.correspondingCount + right.correspondingCount,
            lastMid: right.lastMid,
            data: left.data + right.data
        )
    }
    // The junction is scored with the preceding element's `rcid` and the following element's
    // `lcid`, the same (rcid, lcid) pairing used by the other `getCCValue` callers in this file.
    let ccValue = self.dicdataStore.getCCValue(leftLast.rcid, rightFirst.lcid)
    let includeMMValueCalculation = DicdataStore.includeMMValueCalculation(rightFirst)
    let mmValue = includeMMValueCalculation ? self.dicdataStore.getMMValue(left.lastMid, rightFirst.mid) : .zero
    let newValue = left.value + mmValue + ccValue + right.value
    return Candidate(
        text: left.text + right.text,
        value: newValue,
        correspondingCount: left.correspondingCount + right.correspondingCount,
        lastMid: right.lastMid,
        data: left.data + right.data
    )
}
/// Builds prediction candidates that replace the trailing elements of `prepart` with a longer dictionary word.
///
/// Elements are popped from the end of `prepart.data` one per iteration, at most three times
/// (fewer when `prepart.data` is shorter). After each pop, the popped elements' rubies are joined
/// into a lookup key for `getPredictionLOUDSDicdata`, and only entries whose word starts with the
/// joined popped words are scored and offered as `.replacement` candidates.
/// - Parameters:
///   - prepart: The candidate whose trailing elements are to be extended.
///   - N_best: The maximum number of candidates kept in the result.
/// - Returns: At most `N_best` candidates, ordered from highest to lowest `value`.
func getPredictionCandidates(prepart: Candidate, N_best: Int) -> [PredictionCandidate] {
var result: [PredictionCandidate] = []
// 1-based iteration counter; limits how many trailing elements are popped.
var count = 1
// Working copy of `prepart` that shrinks as elements are popped from its end.
var prefixCandidate = prepart
prefixCandidate.actions = []
var prefixCandidateData = prepart.data
// Words, rubies and elements popped so far, kept in their original left-to-right order.
var totalWord = ""
var totalRuby = ""
var totalData: [DicdataElement] = []
while count <= min(prepart.data.count, 3), let element = prefixCandidateData.popLast() {
defer {
count += 1
}
// Update prefixCandidate so that it no longer accounts for the popped element.
do {
// Subtract the element's own value and its CC value against the new last element (BOS when none remains).
prefixCandidate.value -= element.value()
prefixCandidate.value -= self.dicdataStore.getCCValue(prefixCandidateData.last?.rcid ?? CIDData.BOS.cid, element.lcid)
if DicdataStore.includeMMValueCalculation(element) {
// The element took part in the MM calculation: restore lastMid to the previous
// participating element (BOS when none) and subtract the MM value between the two.
let previousMid = prefixCandidateData.last(where: DicdataStore.includeMMValueCalculation)?.mid ?? MIDData.BOS.mid
prefixCandidate.lastMid = previousMid
prefixCandidate.value -= self.dicdataStore.getMMValue(previousMid, element.mid)
}
// Re-derive data, text and corresponding count from the remaining elements.
prefixCandidate.data = prefixCandidateData
prefixCandidate.text = prefixCandidateData.reduce(into: "") { $0 += $1.word }
prefixCandidate.correspondingCount = prefixCandidateData.reduce(into: 0) { $0 += $1.ruby.count }
}
// Elements are popped from the back, so each one is prepended to keep the original order.
totalWord.insert(contentsOf: element.word, at: totalWord.startIndex)
totalRuby.insert(contentsOf: element.ruby, at: totalRuby.startIndex)
totalData.insert(element, at: 0)
// Look up entries by the joined ruby and keep only those whose word starts with the joined word.
let dicdata = self.dicdataStore.getPredictionLOUDSDicdata(key: totalRuby).filter {$0.word.hasPrefix(totalWord)}
for data in dicdata {
// Score the entry as if it followed the shortened prefix candidate.
let ccValue = self.dicdataStore.getCCValue(prefixCandidateData.last?.rcid ?? CIDData.BOS.cid, data.lcid)
let includeMMValueCalculation = DicdataStore.includeMMValueCalculation(data)
let mmValue = includeMMValueCalculation ? self.dicdataStore.getMMValue(prefixCandidate.lastMid, data.mid):.zero
let wValue = data.value()
let newValue = prefixCandidate.value + mmValue + ccValue + wValue
// Insertion index: just after the last stored candidate whose value is >= newValue (0 when there is none).
let lastindex: Int = (result.lastIndex(where: {$0.value >= newValue}) ?? -1) + 1
// The entry would land past the N_best-th slot, so it is not kept.
if lastindex == N_best {
continue
}
// The list is already full: drop the lowest-valued candidate to make room.
if result.count >= N_best {
result.removeLast()
}
// The candidate text is the part of the dictionary word that follows the already-present `totalWord`.
let text = String(data.word.dropFirst(totalWord.count))
result.insert(.replacement(.init(text: text, targetData: totalData, replacementData: [data], value: newValue)), at: lastindex)
}
}
return result
}
/// Collects prediction candidates that may follow the given candidates when no further input is available.
///
/// For every element of `preparts` that has data, the dictionary entries returned by
/// `getZeroHintPredictionDicdata` for the `rcid` of its last element are scored, and the
/// best-scoring ones are kept as `.additional` candidates. Entries coming from different
/// elements of `preparts` share the same result list.
/// - parameters:
///   - preparts: The candidates that the predictions should follow.
///   - N_best: The maximum number of candidates kept in the result.
/// - returns:
///   At most `N_best` candidates, ordered from highest to lowest `value`.
func getZeroHintPredictionCandidates(preparts: some Collection<Candidate>, N_best: Int) -> [PredictionCandidate] {
    var ranked: [PredictionCandidate] = []
    for prepart in preparts {
        // A candidate without data has no last element to connect from.
        guard let tail = prepart.data.last else {
            continue
        }
        for entry in self.dicdataStore.getZeroHintPredictionDicdata(lastRcid: tail.rcid) {
            let connectionValue = self.dicdataStore.getCCValue(tail.rcid, entry.lcid)
            let usesMMValue = DicdataStore.includeMMValueCalculation(entry)
            let mmValue = usesMMValue ? self.dicdataStore.getMMValue(prepart.lastMid, entry.mid) : .zero
            let wordValue = entry.value()
            let score = prepart.value + mmValue + connectionValue + wordValue
            // Slot just after the last stored candidate whose value is >= score (0 when there is none).
            let slot = ranked.lastIndex(where: { $0.value >= score }).map { $0 + 1 } ?? 0
            // The entry would land past the N_best-th slot, so it is not kept.
            guard slot != N_best else {
                continue
            }
            // The list is already full: drop the lowest-valued candidate to make room.
            if ranked.count >= N_best {
                ranked.removeLast()
            }
            let addition = PredictionCandidate.additional(.init(text: entry.word, data: [entry], value: score))
            ranked.insert(addition, at: slot)
        }
    }
    return ranked
}
}

View File

@@ -0,0 +1,127 @@
//
// prediction.swift
//
//
// Created by miwa on 2023/09/19.
//
import Foundation
extension Kana2Kanji {
/// Joins `left` and `right` into a single candidate.
///
/// The texts and data arrays are concatenated, the corresponding counts are summed, and
/// `lastMid` is taken from `right`. When both sides have data, the merged value also includes
/// the CC value across the junction and, if `DicdataStore.includeMMValueCalculation` accepts
/// the first element of `right`, the MM value between `left.lastMid` and that element's `mid`.
/// - Parameters:
///   - left: The candidate placed first.
///   - right: The candidate appended after `left`.
/// - Returns: The merged candidate.
func mergeCandidates(_ left: Candidate, _ right: Candidate) -> Candidate {
    guard let leftLast = left.data.last, let rightFirst = right.data.first else {
        // Without data on one side there is no junction to score; only the two values are added.
        return Candidate(
            text: left.text + right.text,
            value: left.value + right.value,
            correspondingCount: left.correspondingCount + right.correspondingCount,
            lastMid: right.lastMid,
            data: left.data + right.data
        )
    }
    // The junction is scored with the preceding element's `rcid` and the following element's
    // `lcid`, the same (rcid, lcid) pairing used by the other `getCCValue` callers in this file.
    let ccValue = self.dicdataStore.getCCValue(leftLast.rcid, rightFirst.lcid)
    let includeMMValueCalculation = DicdataStore.includeMMValueCalculation(rightFirst)
    let mmValue = includeMMValueCalculation ? self.dicdataStore.getMMValue(left.lastMid, rightFirst.mid) : .zero
    let newValue = left.value + mmValue + ccValue + right.value
    return Candidate(
        text: left.text + right.text,
        value: newValue,
        correspondingCount: left.correspondingCount + right.correspondingCount,
        lastMid: right.lastMid,
        data: left.data + right.data
    )
}
/// Builds prediction candidates that replace the trailing elements of `prepart` with a longer dictionary word.
///
/// Elements are popped from the end of `prepart.data` one per iteration, at most three times
/// (fewer when `prepart.data` is shorter). After each pop, the popped elements' rubies are joined
/// into a lookup key for `getPredictionLOUDSDicdata`, and only entries whose word starts with the
/// joined popped words are scored and offered as `.replacement` candidates.
/// - Parameters:
///   - prepart: The candidate whose trailing elements are to be extended.
///   - N_best: The maximum number of candidates kept in the result.
/// - Returns: At most `N_best` candidates, ordered from highest to lowest `value`.
func getPredictionCandidates(prepart: Candidate, N_best: Int) -> [PredictionCandidate] {
var result: [PredictionCandidate] = []
// 1-based iteration counter; limits how many trailing elements are popped.
var count = 1
// Working copy of `prepart` that shrinks as elements are popped from its end.
var prefixCandidate = prepart
prefixCandidate.actions = []
var prefixCandidateData = prepart.data
// Words, rubies and elements popped so far, kept in their original left-to-right order.
var totalWord = ""
var totalRuby = ""
var totalData: [DicdataElement] = []
while count <= min(prepart.data.count, 3), let element = prefixCandidateData.popLast() {
defer {
count += 1
}
// Update prefixCandidate so that it no longer accounts for the popped element.
do {
// Subtract the element's own value and its CC value against the new last element (BOS when none remains).
prefixCandidate.value -= element.value()
prefixCandidate.value -= self.dicdataStore.getCCValue(prefixCandidateData.last?.rcid ?? CIDData.BOS.cid, element.lcid)
if DicdataStore.includeMMValueCalculation(element) {
// The element took part in the MM calculation: restore lastMid to the previous
// participating element (BOS when none) and subtract the MM value between the two.
let previousMid = prefixCandidateData.last(where: DicdataStore.includeMMValueCalculation)?.mid ?? MIDData.BOS.mid
prefixCandidate.lastMid = previousMid
prefixCandidate.value -= self.dicdataStore.getMMValue(previousMid, element.mid)
}
// Re-derive data, text and corresponding count from the remaining elements.
prefixCandidate.data = prefixCandidateData
prefixCandidate.text = prefixCandidateData.reduce(into: "") { $0 += $1.word }
prefixCandidate.correspondingCount = prefixCandidateData.reduce(into: 0) { $0 += $1.ruby.count }
}
// Elements are popped from the back, so each one is prepended to keep the original order.
totalWord.insert(contentsOf: element.word, at: totalWord.startIndex)
totalRuby.insert(contentsOf: element.ruby, at: totalRuby.startIndex)
totalData.insert(element, at: 0)
// Look up entries by the joined ruby and keep only those whose word starts with the joined word.
let dicdata = self.dicdataStore.getPredictionLOUDSDicdata(key: totalRuby).filter {$0.word.hasPrefix(totalWord)}
for data in dicdata {
// Score the entry as if it followed the shortened prefix candidate.
let ccValue = self.dicdataStore.getCCValue(prefixCandidateData.last?.rcid ?? CIDData.BOS.cid, data.lcid)
let includeMMValueCalculation = DicdataStore.includeMMValueCalculation(data)
let mmValue = includeMMValueCalculation ? self.dicdataStore.getMMValue(prefixCandidate.lastMid, data.mid):.zero
let wValue = data.value()
let newValue = prefixCandidate.value + mmValue + ccValue + wValue
// Insertion index: just after the last stored candidate whose value is >= newValue (0 when there is none).
let lastindex: Int = (result.lastIndex(where: {$0.value >= newValue}) ?? -1) + 1
// The entry would land past the N_best-th slot, so it is not kept.
if lastindex == N_best {
continue
}
// The list is already full: drop the lowest-valued candidate to make room.
if result.count >= N_best {
result.removeLast()
}
// The candidate text is the part of the dictionary word that follows the already-present `totalWord`.
let text = String(data.word.dropFirst(totalWord.count))
result.insert(.replacement(.init(text: text, targetData: totalData, replacementData: [data], value: newValue)), at: lastindex)
}
}
return result
}
/// Collects prediction candidates that may follow the given candidates when no further input is available.
///
/// For every element of `preparts` that has data, the dictionary entries returned by
/// `getZeroHintPredictionDicdata` for the `rcid` of its last element are scored, and the
/// best-scoring ones are kept as `.additional` candidates. Entries coming from different
/// elements of `preparts` share the same result list.
/// - parameters:
///   - preparts: The candidates that the predictions should follow.
///   - N_best: The maximum number of candidates kept in the result.
/// - returns:
///   At most `N_best` candidates, ordered from highest to lowest `value`.
func getZeroHintPredictionCandidates(preparts: some Collection<Candidate>, N_best: Int) -> [PredictionCandidate] {
    var ranked: [PredictionCandidate] = []
    for prepart in preparts {
        // A candidate without data has no last element to connect from.
        guard let tail = prepart.data.last else {
            continue
        }
        for entry in self.dicdataStore.getZeroHintPredictionDicdata(lastRcid: tail.rcid) {
            let connectionValue = self.dicdataStore.getCCValue(tail.rcid, entry.lcid)
            let usesMMValue = DicdataStore.includeMMValueCalculation(entry)
            let mmValue = usesMMValue ? self.dicdataStore.getMMValue(prepart.lastMid, entry.mid) : .zero
            let wordValue = entry.value()
            let score = prepart.value + mmValue + connectionValue + wordValue
            // Slot just after the last stored candidate whose value is >= score (0 when there is none).
            let slot = ranked.lastIndex(where: { $0.value >= score }).map { $0 + 1 } ?? 0
            // The entry would land past the N_best-th slot, so it is not kept.
            guard slot != N_best else {
                continue
            }
            // The list is already full: drop the lowest-valued candidate to make room.
            if ranked.count >= N_best {
                ranked.removeLast()
            }
            let addition = PredictionCandidate.additional(.init(text: entry.word, data: [entry], value: score))
            ranked.insert(addition, at: slot)
        }
    }
    return ranked
}
}

View File

@@ -7,16 +7,16 @@
import Foundation
public enum PredictionCandidate: Sendable {
public enum PredictionCandidate: Sendable, Hashable {
case additional(AdditionalPredictionCandidate)
case replacement(ReplacementPredictionCandidate)
public struct AdditionalPredictionCandidate: Sendable {
public struct AdditionalPredictionCandidate: Sendable, Hashable {
public var text: String
public var data: [DicdataElement]
public var value: PValue
}
public struct ReplacementPredictionCandidate: Sendable {
public struct ReplacementPredictionCandidate: Sendable, Hashable {
///
public var text: String
///