Files
AzooKeyKanaKanjiConverter/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
Miwa f5037e393c perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現 (#208)
* perf: 同じloudsに対する検索をバルク処理することによって、処理の効率化を実現

* fix: bug

* test: add typo correction test

* chore: finalize imp;
2025-06-27 22:32:46 +09:00

1077 lines
52 KiB
Swift
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// DicdataStore.swift
// Keyboard
//
// Created by ensan on 2020/09/17.
// Copyright © 2020 ensan. All rights reserved.
//
import Algorithms
import Foundation
import SwiftUtils
public final class DicdataStore {
/// Creates a store for the given conversion options and immediately loads its base resources via `setup()`.
/// - Parameter convertRequestOptions: Options that locate the dictionary resources and control preloading.
public init(convertRequestOptions: ConvertRequestOptions) {
    requestOptions = convertRequestOptions
    setup()
}
/// Module-internal initializer; identical to the public one except that it has a default argument
/// and emits a debug message before loading the base resources.
/// - Parameter requestOptions: Options that locate the dictionary resources (defaults to `.default`).
init(requestOptions: ConvertRequestOptions = .default) {
    self.requestOptions = requestOptions
    debug("DicdataStoreが初期化されました")
    setup()
}
/// `ccParsed[former]` becomes `true` once the connection-cost row for `former` has been loaded into `ccLines` (see `getCCValue`).
private var ccParsed: [Bool] = .init(repeating: false, count: 1319)
/// Lazily filled connection costs, read as `ccLines[former][latter]`; key `-1` holds the row's fallback value (see `getCCValue`).
private var ccLines: [[Int: PValue]] = []
/// Flat matrix loaded from `mm.binary`, indexed as `former * midCount + latter` (see `getMMValue`).
private var mmValue: [PValue] = []
/// Loaded LOUDS tries, keyed by query (`"user"`, `"memory"`, or the string passed to `loadLOUDS(query:)`).
private var loudses: [String: LOUDS] = [:]
/// Raw contents of preloaded `*.loudstxt3` files, keyed by file name without extension (see `preloadDictionary`).
private var loudstxts: [String: Data] = [:]
/// Queries for which `loadLOUDS(query:)` has already attempted a load, whether or not it succeeded.
private var importedLoudses: Set<String> = []
/// Character-to-ID table built from `louds/charID.chid`; the ID is the character's offset in that file.
private var charsID: [Character: UInt8] = [:]
/// Manages learned data (the `memory` dictionary and temporary learning results).
private var learningManager = LearningManager()
/// Entries supplied at run time through `Notification.importDynamicUserDict`.
private var dynamicUserDict: [DicdataElement] = []
/// Maximum number of input elements a lookup starting at one index may span (see `getLOUDSDataInRange`).
/// - TODO: make this value as an option
public let maxlength: Int = 20
/// Entries whose value falls below this threshold are discarded by `shouldBeRemoved`.
/// - TODO: make this value as an option
public let threshold: PValue = -17
/// Row width of the `mmValue` matrix.
private let midCount = 502
/// Not referenced in the visible part of this file. NOTE(review): presumably the number of class IDs - confirm.
private let cidCount = 1319
/// Options currently in effect; replaced through `Notification.setRequestOptions`.
private var requestOptions: ConvertRequestOptions = .default
/// Formatter configured in `setup()` (spell-out style, `ja-JP` locale) and used by `getWiseDicdata` to spell out numbers.
private let numberFormatter = NumberFormatter()
/// Loads (or reloads) every resource that depends on `requestOptions`.
///
/// This runs from both initializers and again from `sendToDicdataStore(_:)` when the dictionary
/// resource URL changes. Because of the second case, all caches that were filled from the previous
/// dictionary are dropped first; otherwise `getCCValue` would keep skipping rows it believes are
/// already parsed (while `ccLines` has just been emptied) and LOUDS lookups would keep using tries
/// that belong to the old dictionary.
private func setup() {
    numberFormatter.numberStyle = .spellOut
    numberFormatter.locale = .init(identifier: "ja-JP")

    // Reset lazily filled caches. `ccParsed` must be reset together with `ccLines`.
    self.ccLines = [[Int: PValue]].init(repeating: [:], count: CIDData.totalCount)
    self.ccParsed = [Bool].init(repeating: false, count: self.ccParsed.count)
    self.loudses.removeAll()
    self.loudstxts.removeAll()
    self.importedLoudses.removeAll()

    // Character ID table: the ID of a character is its offset in `louds/charID.chid`.
    do {
        let string = try String(contentsOf: self.requestOptions.dictionaryResourceURL.appendingPathComponent("louds/charID.chid", isDirectory: false), encoding: String.Encoding.utf8)
        charsID = [Character: UInt8].init(uniqueKeysWithValues: string.enumerated().map {($0.element, UInt8($0.offset))})
    } catch {
        debug("Error: louds/charID.chidが存在しません。このエラーは深刻ですが、テスト時には無視できる場合があります。Description: \(error)")
        // Do not keep a table that belongs to a previously configured dictionary.
        charsID = [:]
    }

    // MID-to-MID matrix; falls back to an all-zero matrix when the file is missing.
    let mmURL = requestOptions.dictionaryResourceURL.appendingPathComponent("mm.binary", isDirectory: false)
    do {
        let binaryData = try Data(contentsOf: mmURL, options: [.uncached])
        self.mmValue = binaryData.toArray(of: Float.self).map {PValue($0)}
    } catch {
        debug("Error: mm.binaryが存在しません。このエラーは深刻ですが、テスト時には無視できる場合があります。Description: \(error)")
        self.mmValue = [PValue].init(repeating: .zero, count: self.midCount * self.midCount)
    }

    // Load the user dictionary and the learning memory right away.
    self.reloadUser()
    _ = self.loadLOUDS(query: "user")
    self.reloadMemory()
    _ = self.loadLOUDS(query: "memory")

    if requestOptions.preloadDictionary {
        self.preloadDictionary()
    }
}
/// Eagerly reads every dictionary file found in the `louds` directory of the dictionary resource (performs file I/O).
///
/// `*.louds` files other than `user` and `memory` are stored in `loudses`, and the raw bytes of
/// `*.loudstxt3` files in `loudstxts`, both keyed by the file name without its extension.
/// Returns silently when the directory cannot be listed.
/// - Note: Nothing is added to `importedLoudses` here, so `loadLOUDS(query:)` still performs its own
///   load the first time a query is requested.
private func preloadDictionary() {
    let loudsDirectory = self.requestOptions.dictionaryResourceURL.appendingPathComponent("louds", isDirectory: true)
    guard let fileURLs = try? FileManager.default.contentsOfDirectory(
        at: loudsDirectory,
        includingPropertiesForKeys: nil
    ) else {
        return
    }
    for url in fileURLs {
        let identifier = url.deletingPathExtension().lastPathComponent
        let pathExt = url.pathExtension
        if pathExt == "louds" {
            // `user` and `memory` are skipped here; `setup()` loads them through `loadLOUDS(query:)`.
            let isUserLocal = identifier == "user" || identifier == "memory"
            if !isUserLocal {
                loudses[identifier] = LOUDS.load(identifier, option: self.requestOptions)
            }
        } else if pathExt == "loudstxt3" {
            guard let data = try? Data(contentsOf: url) else {
                debug("Error: Could not load loudstxt3 file at \(url)")
                continue
            }
            loudstxts[identifier] = data
        }
    }
}
/// Messages that can be delivered to the store through `sendToDicdataStore(_:)`.
public enum Notification {
/// Replaces the dynamic user dictionary with the given entries. Use `importDynamicUserDict` for data that cannot be obtained statically.
/// - warning: Too many dynamic user dictionary entries will hurt conversion performance, because the dynamic user dictionary is looked up with inefficient algorithms. If your entries can be listed statically, use normal user dictionaries instead.
case importDynamicUserDict([DicdataElement])
/// Deprecated spelling of `importDynamicUserDict`; `sendToDicdataStore(_:)` handles both cases identically.
@available(*, deprecated, renamed: "importDynamicUserDict", message: "it will be removed in AzooKeyKanaKanjiConverter v1.0")
case importOSUserDict([DicdataElement])
/// Replaces the request options; the store re-runs its setup when the dictionary resource URL differs.
case setRequestOptions(ConvertRequestOptions)
/// Asks the learning manager to forget the data of the given candidate.
case forgetMemory(Candidate)
/// Saves the learning data and drops the cached `user` and `memory` dictionaries.
case closeKeyboard
}
/// Applies a `Notification` to the store.
/// - Parameter data: The message to handle; see the individual cases of `Notification`.
func sendToDicdataStore(_ data: Notification) {
    switch data {
    case .importOSUserDict(let dicdata), .importDynamicUserDict(let dicdata):
        // Every imported entry is tagged as coming from the user dictionary before it is stored.
        var tagged = dicdata
        tagged.mutatingForEach {
            $0.metadata = .isFromUserDictionary
        }
        self.dynamicUserDict = tagged
    case .setRequestOptions(let value):
        // A full setup is only needed when the dictionary resource location changed.
        let resourceMoved = value.dictionaryResourceURL != self.requestOptions.dictionaryResourceURL
        self.requestOptions = value
        if resourceMoved {
            self.setup()
        }
        if self.learningManager.setRequestOptions(value) {
            self.reloadMemory()
        }
    case .forgetMemory(let candidate):
        self.learningManager.forgetMemory(data: candidate.data)
        // Drop the cached `memory` LOUDS so that it is loaded again on the next lookup.
        self.reloadMemory()
    case .closeKeyboard:
        self.closeKeyboard()
    }
}
/// Maps a character to its ID in the `charsID` table.
/// - Returns: The ID, or `UInt8.max` when the character is not in the table.
func character2charId(_ character: Character) -> UInt8 {
    charsID[character] ?? .max
}
/// Forgets the cached `memory` LOUDS so that the next `loadLOUDS(query: "memory")` loads it again.
private func reloadMemory() {
    loudses["memory"] = nil
    importedLoudses.remove("memory")
}
/// Forgets the cached `user` LOUDS so that the next `loadLOUDS(query: "user")` loads it again.
private func reloadUser() {
    loudses["user"] = nil
    importedLoudses.remove("user")
}
/// Persists the learning data and invalidates the cached `user` and `memory` dictionaries.
private func closeKeyboard() {
    learningManager.save()
    // After saving, drop both cached tries so they are loaded again on the next lookup.
    reloadUser()
    reloadMemory()
}
/// Unit penalty for an entry: `-2.0` divided by the number of characters in its word.
@inlinable static func getPenalty(data: borrowing DicdataElement) -> PValue {
    let wordLength = data.word.count
    return -2.0 / PValue(wordLength)
}
/// Decides whether an entry with the given value should be dropped.
/// - Parameters:
///   - value: Value of the entry (after any adjustment).
///   - wordCount: Number of characters used to compute the length-dependent margin `-2.0 / wordCount`.
/// - Returns: `true` when `value` is below `threshold`, or when its distance `d` above the threshold
///   satisfies `-2.0 / wordCount < -d`.
private func shouldBeRemoved(value: PValue, wordCount: Int) -> Bool {
    let margin = value - self.threshold
    return margin < 0 || -2.0 / PValue(wordCount) < -margin
}
/// Decides whether `data` should be dropped, using its own value and `getPenalty(data:)`.
/// - Returns: `true` when the value is below `threshold`, or when its distance `d` above the
///   threshold satisfies `getPenalty(data:) < -d`.
@inlinable func shouldBeRemoved(data: borrowing DicdataElement) -> Bool {
    let margin = data.value() - self.threshold
    let penalty = Self.getPenalty(data: data)
    return margin < 0 || penalty < -margin
}
/// Returns the LOUDS trie for `query`, loading it on first use.
/// - Parameter query: Cache key of the dictionary: `"user"`, `"memory"`, or the string the callers
///   derive from the first character of a reading.
/// - Returns: The trie, or `nil` when loading failed. The attempt is remembered in `importedLoudses`,
///   so a failed load is not retried until the key is removed from that set.
func loadLOUDS(query: String) -> LOUDS? {
    guard !importedLoudses.contains(query) else {
        return self.loudses[query]
    }
    // Register the query before loading so that a failure is not repeated on every lookup.
    importedLoudses.insert(query)
    // Some ASCII characters are stored under a bracketed hexadecimal identifier instead of the character itself.
    let identifier = switch query {
    case "\\n": "[0A]"
    case " ": "[20]"
    case "\"": "[22]"
    case "\'": "[27]"
    case "*": "[2A]"
    case "+": "[2B]"
    case ".": "[2E]"
    case "/": "[2F]"
    case ":": "[3A]"
    case "<": "[3C]"
    case ">": "[3E]"
    case "\\": "[5C]"
    case "|": "[7C]"
    default: query
    }
    guard let louds = LOUDS.load(identifier, option: self.requestOptions) else {
        if identifier == "user" || identifier == "memory" {
            debug("Error: IDが「\(identifier) (query: \(query))」のloudsファイルの読み込みに失敗しましたが、このエラーは深刻ではありません。")
        } else {
            debug("Error: IDが「\(identifier) (query: \(query))」のloudsファイルの読み込みに失敗しました。IDに対する辞書データが存在しないことが想定される場合はこのエラーは深刻ではありませんが、そうでない場合は深刻なエラーの可能性があります。")
        }
        return nil
    }
    self.loudses[query] = louds
    return louds
}
/// Looks up the node that exactly matches `charIDs` in the LOUDS for `query`.
/// - Returns: A one-element array with the node index, or an empty array when the LOUDS could not
///   be loaded or no node matches.
func perfectMatchLOUDS(query: String, charIDs: [UInt8]) -> [Int] {
    guard let louds = loadLOUDS(query: query),
          let nodeIndex = louds.searchNodeIndex(chars: charIDs) else {
        return []
    }
    return [nodeIndex]
}
/// Collects the node indices that `byfixNodeIndices(chars:)` reports for `charIDs`, restricted to `depth`.
/// - Parameters:
///   - query: Key of the LOUDS to search.
///   - charIDs: Character IDs of the longest string to match.
///   - depth: Range selecting which part of the result to keep.
/// - Returns: The selected node indices, or an empty array when the LOUDS could not be loaded.
private func throughMatchLOUDS(query: String, charIDs: [UInt8], depth: Range<Int>) -> [Int] {
    guard let louds = loadLOUDS(query: query) else {
        return []
    }
    let nodeIndices = louds.byfixNodeIndices(chars: charIDs)
    // The slice is shifted by one relative to `depth` and clamped to the length of the result.
    // NOTE(review): presumably element 0 of the result is the root node - confirm against `LOUDS.byfixNodeIndices`.
    let lower = min(depth.lowerBound + 1, nodeIndices.endIndex)
    let upper = min(depth.upperBound + 1, nodeIndices.endIndex)
    return Array(nodeIndices[lower ..< upper])
}
/// Bulk variant of the through-match search: every LOUDS is searched once for all of its targets.
/// - Parameters:
///   - group: Search targets grouped by the LOUDS query they belong to; each target is a pair of
///     characters and the corresponding character IDs (only the IDs are used here).
///   - depth: Depth range handed to `byfixNodeIndices(targets:depth:)`.
/// - Returns: One entry per key of `group` with the set of matching node indices; the set is empty
///   when the LOUDS for that key could not be loaded.
private func throughMatchLOUDS(group: [String: [([Character], [UInt8])]], depth: Range<Int>) -> [(key: String, indices: Set<Int>)] {
    var matches: [(key: String, indices: Set<Int>)] = []
    matches.reserveCapacity(group.count)
    for (query, targets) in group {
        guard let louds = self.loadLOUDS(query: query) else {
            matches.append((key: query, indices: []))
            continue
        }
        // All targets of one LOUDS are searched in a single call.
        let charIDLists = targets.map { $0.1 }
        let found = louds.byfixNodeIndices(targets: charIDLists, depth: depth)
        matches.append((key: query, indices: Set(found)))
    }
    return matches
}
/// Collects the indices of nodes whose key has `charIDs` as a prefix.
/// - Parameters:
///   - query: Key of the LOUDS to search.
///   - charIDs: Character IDs of the prefix.
///   - depth: Passed to `prefixNodeIndices` as `maxDepth`.
///   - maxCount: Passed to `prefixNodeIndices` as `maxCount`.
/// - Returns: The node indices, or an empty array when the LOUDS could not be loaded.
private func prefixMatchLOUDS(query: String, charIDs: [UInt8], depth: Int = .max, maxCount: Int = .max) -> [Int] {
    if let louds = loadLOUDS(query: query) {
        return louds.prefixNodeIndices(chars: charIDs, maxDepth: depth, maxCount: maxCount)
    }
    return []
}
/// Reads the dictionary entries stored for the given LOUDS node indices.
/// - Parameters:
///   - identifier: Base identifier of the dictionary (`"user"`, `"memory"`, or a LOUDS query).
///   - indices: Node indices obtained from a LOUDS search.
/// - Returns: The entries of all nodes. Entries of `"memory"` are tagged `.isLearned`, entries of
///   `"user"` are tagged `.isFromUserDictionary`.
package func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>) -> [DicdataElement] {
    // The data is split into files of 2048 nodes: the bits above the lowest 11 select the file,
    // the lowest 11 bits the position inside that file.
    let indicesByFile = Dictionary(grouping: indices) { $0 >> 11 }
    var elements: [DicdataElement] = []
    for (fileNumber, nodeIndices) in indicesByFile {
        let fileIdentifier = identifier + "\(fileNumber)"
        let localIndices = nodeIndices.map { $0 & 2047 }
        let loaded = LOUDS.getDataForLoudstxt3(fileIdentifier, indices: localIndices, cache: self.loudstxts[fileIdentifier], option: self.requestOptions)
        elements.append(contentsOf: loaded)
    }
    switch identifier {
    case "memory":
        elements.mutatingForEach {
            $0.metadata = .isLearned
        }
    case "user":
        elements.mutatingForEach {
            $0.metadata = .isFromUserDictionary
        }
    default:
        break
    }
    return elements
}
/// Looks up dictionary entries for the substrings of the composing text that start at `fromIndex`
/// and wraps them in lattice nodes (used when building the kana-to-kanji lattice).
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: Index into `inputData.input` at which every returned node starts.
///   - toIndexRange: Indices of the last input element a node may cover. `nil` means
///     `fromIndex ..< inputData.input.count`. The upper bound is always capped at `fromIndex + maxlength`.
///   - needTypoCorrection: When `false`, the lookup is delegated to `getFrozenLOUDSDataInRange`.
/// - Returns: Nodes whose `inputRange` starts at `fromIndex`. When `fromIndex` is zero every node
///   gets a BOS node as predecessor. Empty when the index arguments are inconsistent.
public func getLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil, needTypoCorrection: Bool = true) -> [LatticeNode] {
    if !needTypoCorrection {
        return self.getFrozenLOUDSDataInRange(inputData: inputData, from: fromIndex, toIndexRange: toIndexRange)
    }
    let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
    let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
    if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
        debug(#function, "index is wrong")
        return []
    }
    // segments[k] is the katakana form of input[fromIndex ... fromIndex + k].
    let segments = (fromIndex ..< toIndexRight).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
        segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
    }
    // MARK: Candidate readings (including typo-corrected ones) with their end index and penalty
    var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)

    // MARK: Search the LOUDS tries, one bulk search per trie
    let stringSet: [([Character], [UInt8])] = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
    let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
    let depth = minCharIDsCount - 1 ..< maxCharIDsCount
    // Readings are grouped by their first character, which is the query of the trie that holds them.
    let group = [String: [([Character], [UInt8])]].init(grouping: stringSet, by: {String($0.0.first!)})
    var indices = self.throughMatchLOUDS(group: group, depth: depth)
    if learningManager.enabled {
        indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet, "memory": stringSet], depth: depth))
    } else {
        indices.append(contentsOf: self.throughMatchLOUDS(group: ["user": stringSet], depth: depth))
    }

    // MARK: Turn the node indices into entries, applying the typo penalty
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = stringToInfo[rubyArray, default: (0, .zero)].penalty
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }

    // Temporary learning results get the same penalty treatment.
    for (_, charIds) in consume stringSet {
        for data in self.learningManager.temporaryThroughMatch(charIDs: consume charIds, depth: depth) {
            let rubyArray = Array(data.ruby)
            let penalty = stringToInfo[rubyArray, default: (0, .zero)].penalty
            if penalty.isZero {
                // No typo correction involved: keep the entry unchanged. Without this `continue`
                // the entry would fall through and be appended a second time below.
                dicdata.append(data)
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }

    // Entries generated on the fly and entries of the dynamic user dictionary, per end index.
    // Their readings are registered in `stringToInfo` so that they resolve to an end index below.
    for i in toIndexLeft ..< toIndexRight {
        do {
            let result = self.getWiseDicdata(convertTarget: segments[i - fromIndex], inputData: inputData, inputRange: fromIndex ..< i + 1)
            for item in result {
                stringToInfo[Array(item.ruby)] = (i, 0)
            }
            dicdata.append(contentsOf: result)
        }
        do {
            let result = self.getMatchDynamicUserDict(segments[i - fromIndex])
            for item in result {
                stringToInfo[Array(item.ruby)] = (i, 0)
            }
            dicdata.append(contentsOf: result)
        }
    }

    // Entries whose reading has no known end index are dropped.
    let startsAtBOS = fromIndex == .zero
    let result: [LatticeNode] = dicdata.compactMap {
        guard let endIndex = stringToInfo[Array($0.ruby)]?.endIndex else {
            return nil
        }
        let node = LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
        if startsAtBOS {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
    return result
}
/// Variant of `getLOUDSDataInRange` without typo correction: only the readings returned by
/// `getRangesWithoutTypos` are looked up.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - from: Index into `inputData.input` at which every returned node starts.
///   - toIndexRange: Indices of the last input element a node may cover; `nil` means
///     `from ..< inputData.input.count`. The upper bound is capped at `from + maxlength`.
/// - Returns: The nodes found plus one fallback node for the single character at `from`;
///   empty when the index arguments are inconsistent.
private func getFrozenLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil) -> [LatticeNode] {
let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
debug(#function, fromIndex, toIndexRange?.description ?? "nil", toIndexLeft, toIndexRight)
if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
debug(#function, "index is wrong")
return []
}
// Fallback node: the katakana form of the first character on its own, with a fixed value of -10.
let character = String(inputData.input[fromIndex].character.toKatakana())
let characterNode = LatticeNode(data: DicdataElement(word: character, ruby: character, cid: CIDData..cid, mid: MIDData..mid, value: -10), inputRange: fromIndex ..< fromIndex + 1)
if fromIndex == .zero {
characterNode.prevs.append(.BOSNode())
}
// MARK: Candidate readings mapped to their end index (no typo correction)
let stringToEndIndex = inputData.getRangesWithoutTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
// MARK: Shortest and longest reading; without any reading only the fallback node is returned
guard let (minString, maxString) = stringToEndIndex.keys.minAndMax(by: {$0.count < $1.count}) else {
debug(#function, "minString/maxString is nil", stringToEndIndex)
return [characterNode]
}
let maxIDs = maxString.map(self.character2charId)
// The dictionary for the first character is taken from an arbitrary key.
// NOTE(review): presumably all keys share their first character - confirm against `getRangesWithoutTypos`.
var keys = [String(stringToEndIndex.keys.first!.first!), "user"]
if learningManager.enabled {
keys.append("memory")
}
// MARK: Search each LOUDS along the longest reading and turn the node indices into entries
var dicdata: [DicdataElement] = []
let depth = minString.count - 1 ..< maxString.count
for identifier in keys {
dicdata.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: identifier, indices: self.throughMatchLOUDS(query: identifier, charIDs: maxIDs, depth: depth)))
}
if learningManager.enabled {
// Temporary learning results are added as they are; no penalty applies without typo correction.
dicdata.append(contentsOf: self.learningManager.temporaryThroughMatch(charIDs: consume maxIDs, depth: depth))
}
// Entries generated on the fly and entries of the dynamic user dictionary, per reading.
for (key, value) in stringToEndIndex {
let convertTarget = String(key)
dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: convertTarget, inputData: inputData, inputRange: fromIndex ..< value + 1))
dicdata.append(contentsOf: self.getMatchDynamicUserDict(convertTarget))
}
// Entries whose reading is not a key of `stringToEndIndex` are dropped; at index zero every node gets a BOS predecessor.
if fromIndex == .zero {
return dicdata.compactMap {
guard let endIndex = stringToEndIndex[Array($0.ruby)] else {
return nil
}
let node = LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
node.prevs.append(RegisteredNode.BOSNode())
return node
} + [characterNode]
} else {
return dicdata.compactMap {
guard let endIndex = stringToEndIndex[Array($0.ruby)] else {
return nil
}
return LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
} + [characterNode]
}
}
/// Looks up dictionary entries that exactly match the input from `fromIndex` through `toIndex`
/// (both inclusive) and wraps them in lattice nodes.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: Index of the first input element.
///   - toIndex: Index of the last input element (inclusive).
///   - needTypoCorrection: When `false`, only readings with a typo penalty of zero are searched.
/// - Returns: Nodes covering `fromIndex ..< toIndex + 1`. When `fromIndex` is zero every node gets a
///   BOS node as predecessor. Empty when the range is inverted or longer than `maxlength`.
public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int, needTypoCorrection: Bool) -> [LatticeNode] {
    if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
        return []
    }
    let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()

    // Candidate readings with their typo penalty.
    let string2penalty = inputData.getRangeWithTypos(fromIndex, toIndex).filter {
        needTypoCorrection || $0.value == 0.0
    }

    // MARK: Search the LOUDS tries for exact matches
    let strings = string2penalty.keys.map {
        (key: $0, charIDs: $0.map(self.character2charId))
    }
    // Readings are grouped by their first character, which is the query of the trie that holds them.
    let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})
    var indices: [(String, Set<Int>)] = group.map {dic in
        let head = String(dic.key)
        let set = dic.value.flatMapSet { (_, charIDs) in
            self.perfectMatchLOUDS(query: head, charIDs: charIDs)
        }
        return (head, set)
    }
    do {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchLOUDS(query: "user", charIDs: charIDs)
        }
        indices.append(("user", set))
    }
    if learningManager.enabled {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchLOUDS(query: "memory", charIDs: charIDs)
        }
        indices.append(("memory", set))
    }

    // MARK: Turn the node indices into entries, applying the typo penalty
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = string2penalty[rubyArray, default: .zero]
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }

    // Temporary learning results get the same penalty treatment.
    for (characters, charIds) in consume strings {
        for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
            // This is a perfect match, so `characters` is used in place of `Array(data.ruby)`.
            let penalty = string2penalty[characters, default: .zero]
            if penalty.isZero {
                // No typo correction involved: keep the entry unchanged. Without this `continue`
                // the entry would fall through and be appended a second time below.
                dicdata.append(data)
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }

    // Entries generated on the fly and entries of the dynamic user dictionary.
    dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: fromIndex ..< toIndex + 1))
    for segment in string2penalty.keys {
        dicdata.append(contentsOf: self.getMatchDynamicUserDict(String(segment)))
    }

    let startsAtBOS = fromIndex == .zero
    let result: [LatticeNode] = dicdata.map {
        let node = LatticeNode(data: $0, inputRange: fromIndex ..< toIndex + 1)
        if startsAtBOS {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
    return result
}
/// Loads the zero-hint prediction entries stored for the given right class ID.
///
/// The data is read from `p/pc_<lastRcid>.csv` inside the dictionary resource; every line is one
/// comma-separated entry parsed by `parseLoudstxt2FormattedEntry(from:)`.
/// - Parameter lastRcid: Right class ID used to select the CSV file.
/// - Returns: The parsed entries; empty when the file cannot be read. Rows with fewer than six
///   columns are skipped, because the parser reads six columns unconditionally and would otherwise
///   trap with an out-of-range index.
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
    do {
        let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)
        let csvLines = csvString.split(separator: "\n")
        let dicdata: [DicdataElement] = csvLines.compactMap { line -> DicdataElement? in
            let fields = line.split(separator: ",", omittingEmptySubsequences: false)
            guard fields.count >= 6 else {
                debug(#function, "skipped a malformed row", line)
                return nil
            }
            return self.parseLoudstxt2FormattedEntry(from: fields)
        }
        return dicdata
    } catch {
        debug("Error: 右品詞ID\(lastRcid)のためのZero Hint Predictionのためのデータの読み込みに失敗しました。このエラーは深刻ですが、テスト時には無視できる場合があります。 Description: \(error.localizedDescription)")
        return []
    }
}
/// Collects entries whose reading starts with `key` (prefix search), for prediction.
/// - Parameter key: The reading prefix; an empty key yields no result.
/// - Returns: Matching entries from the dictionary of the first character (restricted to entries
///   whose `rcid` is marked usable in `predictionUsable`), from the user dictionary, and - when
///   learning is enabled - from the memory dictionary and the temporary learning results.
func getPredictionLOUDSDicdata(key: some StringProtocol) -> [DicdataElement] {
    guard let head = key.first else {
        return []
    }
    // At most 700 node indices are requested from each trie.
    let maxCount = 700
    let first = String(head)
    let charIDs = key.map(self.character2charId)
    // Keys of one or two characters are searched to a limited depth only.
    let depth: Int
    switch key.count {
    case 1:
        depth = 3
    case 2:
        depth = 5
    default:
        depth = Int.max
    }

    var result: [DicdataElement] = []
    let prefixIndices = self.prefixMatchLOUDS(query: first, charIDs: charIDs, depth: depth, maxCount: maxCount)
    let systemEntries = self.getDicdataFromLoudstxt3(identifier: first, indices: Set(prefixIndices))
    result.append(contentsOf: systemEntries.filter { Self.predictionUsable[$0.rcid] })

    let userDictIndices = self.prefixMatchLOUDS(query: "user", charIDs: charIDs, maxCount: maxCount)
    result.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: "user", indices: Set(consume userDictIndices)))

    guard learningManager.enabled else {
        return result
    }
    let memoryDictIndices = self.prefixMatchLOUDS(query: "memory", charIDs: charIDs, maxCount: maxCount)
    result.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: "memory", indices: Set(consume memoryDictIndices)))
    result.append(contentsOf: self.learningManager.temporaryPrefixMatch(charIDs: charIDs))
    return result
}
/// Builds an entry from one row of six columns: ruby, word, lcid, rcid, mid, value.
///
/// An empty word falls back to the ruby, an unparsable `lcid` or `mid` to zero, an unparsable
/// `rcid` to the `lcid`, and an unparsable value to `-30.0`.
/// - Precondition: `dataString` has at least six elements.
private func parseLoudstxt2FormattedEntry(from dataString: [some StringProtocol]) -> DicdataElement {
    let reading = String(dataString[0])
    let surface: String
    if dataString[1].isEmpty {
        surface = reading
    } else {
        surface = String(dataString[1])
    }
    let leftID = Int(dataString[2]) ?? .zero
    let rightID = Int(dataString[3]) ?? leftID
    let meaningID = Int(dataString[4]) ?? .zero
    let score: PValue = PValue(dataString[5]) ?? -30.0
    return DicdataElement(word: surface, ruby: reading, lcid: leftID, rcid: rightID, mid: meaningID, value: score)
}
/// Generates entries that are computed from the input itself instead of being read from a dictionary.
/// - parameters:
///   - convertTarget: The string to generate entries for.
///   - inputData: The composing text; used to inspect the neighbours of `inputRange` and the input style.
///   - inputRange: Range of `inputData.input` that `convertTarget` corresponds to.
/// - Returns: Number entries, Latin-alphabet pass-through entries, kana entries for a pending
///   roman-to-kana input, single-character kana entries, and symbol variants, as far as they apply.
func getWiseDicdata(convertTarget: String, inputData: ComposingText, inputRange: Range<Int>) -> [DicdataElement] {
var result: [DicdataElement] = []
result.append(contentsOf: self.getJapaneseNumberDicdata(head: convertTarget))
// Plain integers: only when the range is not directly preceded or followed by another digit.
if inputData.input[..<inputRange.startIndex].last?.character.isNumber != true && inputData.input[inputRange.endIndex...].first?.character.isNumber != true, let number = Int(convertTarget) {
result.append(DicdataElement(ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -14))
// Numbers within ±1E12 are additionally offered in the spelled-out form produced by `numberFormatter`.
if Double(number) <= 1E12 && -1E12 <= Double(number), let kansuji = self.numberFormatter.string(from: NSNumber(value: number)) {
result.append(DicdataElement(word: kansuji, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -16))
}
}
// With the en_US keyboard, a target made of Roman letters only is offered as it is.
if requestOptions.keyboardLanguage == .en_US && convertTarget.onlyRomanAlphabet {
result.append(DicdataElement(ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -14))
}
// Roman-to-kana input in other languages: offer the kana listed for the typed Roman string in `Roman2Kana`.
if requestOptions.keyboardLanguage != .en_US && inputData.input[inputRange].allSatisfy({$0.inputStyle == .roman2kana}) {
let roman = String(inputData.input[inputRange].map(\.character))
if let katakana = Roman2Kana.katakanaChanges[roman], let hiragana = Roman2Kana.hiraganaChanges[Array(roman)] {
result.append(DicdataElement(word: String(hiragana), ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -13))
result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
}
}
// A single character is offered as katakana and, when the target is not already katakana, as hiragana too.
if convertTarget.count == 1 {
let katakana = convertTarget.toKatakana()
let hiragana = convertTarget.toHiragana()
if convertTarget == katakana {
result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
} else {
result.append(DicdataElement(word: hiragana, ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -13))
result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
}
}
// Symbol variants for a single character. `value` starts at -14 and drops by 5.0 after every
// appended candidate, so candidates added later get a lower value.
if convertTarget.count == 1, let first = convertTarget.first {
var value: PValue = -14
// `hs` is the halfwidth counterpart of `first`, or `first` itself when the table has none.
let hs = Self.fullwidthToHalfwidth[first, default: first]
if hs != first {
result.append(DicdataElement(word: convertTarget, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
result.append(DicdataElement(word: String(hs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
}
if let fs = Self.halfwidthToFullwidth[first], fs != first {
result.append(DicdataElement(word: convertTarget, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
result.append(DicdataElement(word: String(fs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
}
// Every other member of each group that contains `hs`, plus its fullwidth form when one is listed.
for group in Self.weakRelatingSymbolGroups where group.contains(hs) {
for symbol in group where symbol != hs {
result.append(DicdataElement(word: String(symbol), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
if let fs = Self.halfwidthToFullwidth[symbol] {
result.append(DicdataElement(word: String(fs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
}
}
}
}
return result
}
// Lookup tables between fullwidth symbols and their halfwidth counterparts.
// The symbol string is zipped character by character with the result of applying the
// `.fullwidthToHalfwidth` transform to it; each pair is stored in both directions
// (`fullwidthToHalfwidth[full] = half` and `halfwidthToFullwidth[half] = full`).
private static let (fullwidthToHalfwidth, halfwidthToFullwidth) = zip(
"+ー*=・!#%&'"〜|£$¥@`;:<>,.\/_ ̄-",
"+ー*=・!#%&'"〜|£$¥@`;:<>,.\/_ ̄-".applyingTransform(.fullwidthToHalfwidth, reverse: false)!
)
.reduce(into: ([Character: Character](), [Character: Character]())) { (results: inout ([Character: Character], [Character: Character]), values: (Character, Character)) in
results.0[values.0] = values.1
results.1[values.1] = values.0
}
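// Groups of loosely related characters and symbols.
// `getWiseDicdata` uses this table for single-character input: when the (halfwidth form of the)
// character belongs to a group, every other member of that group is offered as a candidate,
// together with its fullwidth form when `halfwidthToFullwidth` lists one.
// The original comment also mentions `strongRelatingSymbolGroups`; no such table is defined in the visible part of this file.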
private static let weakRelatingSymbolGroups: [[Character]] = [
// ()
["", ""], //
["", "", "", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", "辻󠄀"],
["禰󠄀", ""],
["煉󠄁", ""],
["", ""], //
["", ""],
["", "𠮷"], //
["", "𣘺", "", "𫞎"],
["", "", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", ""],
//
["", "", "", "", ""], //
["^", ""], //
["¥", "$", "¢", "", "£", ""], //
["%", ""], //
["°", "", ""],
[""], //
["*", "", "✳︎", "✴︎"], //
["", "", "", ""],
["+", "±", ""],
["×", "", "✖️"],
["÷", "" ],
["<", "", "", "", "", "", "«"],
[">", "", "", "", "", "", "»"],
["=", "", "", ""],
[":", ";"],
["!", "❗️", "❣️", "‼︎", "⁉︎", "", "‼️", "⁉️", "¡"],
["?", "", "⁉︎", "", "", "⁉️", "¿"],
["", "", "", "☎︎"],
["", "", "", "", "", "", "", ""],
["", "", "", ""], //
["", "", "", "", "", "", "", "", "", "", "⚪︎", ""], //
["", "", "", "", "↙︎", "↖︎", "↘︎", "↗︎", "↔︎", "↕︎", "↪︎", "↩︎", ""], //
["", "", "", "", "", "", "", "𝄞", "𝄞"], //
["", "", ""] //
]
/// Reads a connection-cost file and decodes it as an array of `(Int32, Float)` pairs.
/// - Parameter url: Location of the binary file.
/// - Returns: The decoded pairs, or an empty array when the file cannot be read.
private func loadCCBinary(url: URL) -> [(Int32, Float)] {
    let binaryData: Data
    do {
        binaryData = try Data(contentsOf: url, options: [.uncached])
    } catch {
        debug("Error: 品詞連接コストデータの読み込みに失敗しました。このエラーは深刻ですが、テスト時には無視できる場合があります。 Description: \(error.localizedDescription)")
        return []
    }
    return binaryData.toArray(of: (Int32, Float).self)
}
/// Returns the dynamic user dictionary entries whose ruby equals `ruby`.
func getMatchDynamicUserDict(_ ruby: some StringProtocol) -> [DicdataElement] {
    var matches: [DicdataElement] = []
    for element in dynamicUserDict where element.ruby == ruby {
        matches.append(element)
    }
    return matches
}
/// Returns the dynamic user dictionary entries whose ruby starts with `ruby`.
func getPrefixMatchDynamicUserDict(_ ruby: some StringProtocol) -> [DicdataElement] {
    var matches: [DicdataElement] = []
    for element in dynamicUserDict where element.ruby.hasPrefix(ruby) {
        matches.append(element)
    }
    return matches
}
/// Feeds a committed candidate to the learning manager.
/// - Parameters:
///   - candidate: The committed candidate whose data is learned.
///   - previous: The element preceding the candidate, if any; it is prepended to the learned data.
// TODO: (from the original) revisit how `previous` is handled.
func updateLearningData(_ candidate: Candidate, with previous: DicdataElement?) {
    guard let previous else {
        learningManager.update(data: candidate.data)
        return
    }
    learningManager.update(data: [previous] + candidate.data)
}
/// Feeds a committed post-composition prediction to the learning manager.
/// - Parameters:
///   - candidate: The candidate the prediction was applied to.
///   - predictionCandidate: The prediction. An `.additional` prediction contributes its data as the
///     updated part; a `.replacement` prediction first drops as many trailing elements of
///     `candidate.data` as there are in its target data and contributes the replacement data.
func updateLearningData(_ candidate: Candidate, with predictionCandidate: PostCompositionPredictionCandidate) {
    switch predictionCandidate.type {
    case let .additional(data: appended):
        learningManager.update(data: candidate.data, updatePart: appended)
    case let .replacement(targetData: replaced, replacementData: substitute):
        let kept = candidate.data.dropLast(replaced.count)
        learningManager.update(data: kept, updatePart: substitute)
    }
}
/// Returns the connection cost between two class IDs.
/// - Parameters:
///   - former: Class ID on the left; selects the row (`cb/<former>.binary`).
///   - latter: Class ID on the right; selects the entry inside the row.
/// - Returns: The stored cost. When the row has no entry for `latter`, the row's value for key `-1`
///   is used, and `-25` when that is missing too.
///
/// The row is loaded from disk on first use and cached afterwards.
public func getCCValue(_ former: Int, _ latter: Int) -> PValue {
    if !ccParsed[former] {
        let fileURL = requestOptions.dictionaryResourceURL.appendingPathComponent("cb/\(former).binary", isDirectory: false)
        let pairs = loadCCBinary(url: fileURL).map { (Int($0.0), PValue($0.1)) }
        ccLines[former] = [Int: PValue].init(uniqueKeysWithValues: pairs)
        ccParsed[former] = true
    }
    let row = ccLines[former]
    return row[latter] ?? row[-1] ?? -25
}
/// Returns the value stored for a pair of meaning IDs.
/// - Parameters:
///   - former: Meaning ID on the left.
///   - latter: Meaning ID on the right.
/// - Returns: `0` when either ID is 500; otherwise the element `former * midCount + latter` of `mmValue`.
public func getMMValue(_ former: Int, _ latter: Int) -> PValue {
    // MID 500 on either side always yields 0 and skips the table lookup.
    let exemptMID = 500
    guard former != exemptMID && latter != exemptMID else {
        return 0
    }
    let offset = former * midCount + latter
    return mmValue[offset]
}
private static let possibleLOUDS: Set<Character> = [
" ", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "*", "", "", "", "´", "¨", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "", "", "(", ")", "#", "%", "&", "^", "_", "'", "\""
]
/// Tells whether `character` is listed in `possibleLOUDS`.
static func existLOUDS(for character: Character) -> Bool {
    possibleLOUDS.contains(character)
}
/*
*
*
*
*
*
*
*/
/// Decides from two adjacent class IDs whether a clause boundary lies between them.
/// - Parameters:
///   - former: Class ID on the left.
///   - latter: Class ID on the right.
/// - Returns: `true` when the right side is of word type 0 (preposition) or 1 (core) and the left
///   side is of word type 1 or 2; `false` otherwise, in particular whenever either side is BOS/EOS (type 3).
@inlinable static func isClause(_ former: Int, _ latter: Int) -> Bool {
    // The right side is checked first; a BOS/EOS there never forms a boundary.
    let latterType = Self.wordTypes[latter]
    guard latterType != 3 else {
        return false
    }
    let formerType = Self.wordTypes[former]
    switch (formerType, latterType) {
    case (3, _):
        return false
    case (_, 0), (_, 1):
        return formerType != 0
    default:
        return false
    }
}
/// Class IDs of BOS and EOS. Used to build `wordTypes` (type 3).
private static let BOS_EOS_wordIDs: Set<Int> = [CIDData.BOS.cid, CIDData.EOS.cid]
/// Class IDs classified as preposition. Used to build `wordTypes` (type 0).
private static let PREPOSITION_wordIDs: Set<Int> = [1315, 6, 557, 558, 559, 560]
/// Class IDs classified as core. Used to build `wordTypes` (type 1).
private static let INPOSITION_wordIDs: Set<Int> = Set<Int>(
Array(561..<868).chained(1283..<1297).chained(1306..<1310).chained(11..<53).chained(555..<557).chained(1281..<1283)
).union([1314, 3, 2, 4, 5, 1, 9])
/*
private static let POSTPOSITION_wordIDs: Set<Int> = Set<Int>((7...8).map{$0}
+ (54..<555).map{$0}
+ (868..<1281).map{$0}
+ (1297..<1306).map{$0}
+ (1310..<1314).map{$0}
).union([10])
*/
/// Word type of every class ID from 0 through 1319, precomputed with `_judgeWordType(cid:)`.
/// - Returns:
///   - 3 when BOS/EOS
///   - 0 when preposition
///   - 1 when core
///   - 2 when postposition
/// - Note: One `UInt8` is stored per class ID.
public static let wordTypes: [UInt8] = (0...1319).map { cid in _judgeWordType(cid: cid) }
/// Classifies a class ID; used to build `wordTypes`.
/// - Returns: 3 for BOS/EOS, 0 for a preposition, 1 for a core word, 2 for everything else (postposition).
private static func _judgeWordType(cid: Int) -> UInt8 {
    switch cid {
    case _ where Self.BOS_EOS_wordIDs.contains(cid):
        return 3
    case _ where Self.PREPOSITION_wordIDs.contains(cid):
        return 0
    case _ where Self.INPOSITION_wordIDs.contains(cid):
        return 1
    default:
        return 2
    }
}
/// Tells whether `data` takes part in the MM value calculation.
/// - Returns: `true` when its `lcid` or `rcid` lies in 895...1280 or in 1297...1305, or when
///   either of them has word type 1 (core) in `wordTypes`.
@inlinable static func includeMMValueCalculation(_ data: DicdataElement) -> Bool {
    let lcid = data.lcid
    let rcid = data.rcid
    // The two fixed class ID ranges are checked before `wordTypes` is consulted.
    if (895...1280).contains(lcid) || (895...1280).contains(rcid) {
        return true
    }
    if (1297...1305).contains(lcid) || (1297...1305).contains(rcid) {
        return true
    }
    return wordTypes[lcid] == 1 || wordTypes[rcid] == 1
}
/// Typo penalty multiplier of every left class ID from 0 through 1319, precomputed with `_getTypoPenaltyRatio(_:)`.
static let penaltyRatio = (0...1319).map { lcid in _getTypoPenaltyRatio(lcid) }
/// Computes the typo penalty multiplier of a left class ID; used to build `penaltyRatio`.
/// - Returns: `2.5` for class IDs 147...554 (the original comment lists the sub-ranges 147...368 and 369...554), `1` otherwise.
static func _getTypoPenaltyRatio(_ lcid: Int) -> PValue {
    (147...554).contains(lcid) ? 2.5 : 1
}
/// For every right class ID from 0 through 1319, whether entries ending in it may be used for
/// prediction; precomputed with `_getPredictionUsable(_:)`.
static let predictionUsable = (0...1319).map { rcid in _getPredictionUsable(rcid) }
/// Right class IDs for which `_getPredictionUsable` returns `false`.
///
/// The IDs are kept in the same nine groups as before. Each group was generated from `cid.txt`
/// with a command of the form
/// `cat cid.txt | grep <pattern> | awk '{print $1}' | xargs -I {} echo -n "{}, "`.
/// NOTE(review): the grep pattern that produced each group is not recorded here — restore it
/// from `cid.txt` if the lists ever need regenerating.
///
/// The groups are flattened into a single set that is built once on first use, rather than
/// being rebuilt on every call of `_getPredictionUsable`.
private static let _predictionUnusableRCIDs: Set<Int> = {
    let groups: [[Int]] = [
        // Group 1
        [33, 34, 50, 86, 87, 88, 103, 127, 128, 144, 397, 398, 408, 426, 427, 450, 457, 480, 687, 688, 703, 704, 727, 742, 750, 758, 766, 786, 787, 798, 810, 811, 829, 830, 831, 893, 973, 974, 975, 976, 977, 1007, 1008, 1009, 1010, 1063, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1240, 1241, 1242, 1243, 1268, 1269, 1270, 1271],
        // Group 2
        [15, 16, 17, 18, 41, 42, 59, 60, 61, 62, 63, 64, 94, 95, 109, 110, 111, 112, 135, 136, 379, 380, 381, 382, 402, 412, 413, 442, 443, 471, 472, 562, 572, 582, 591, 598, 618, 627, 677, 678, 693, 694, 709, 710, 722, 730, 737, 745, 753, 761, 770, 771, 791, 869, 878, 885, 896, 906, 917, 918, 932, 948, 949, 950, 951, 952, 987, 988, 989, 990, 1017, 1018, 1033, 1034, 1035, 1036, 1058, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1212, 1213, 1214, 1215],
        // Group 3
        [372, 406, 418, 419, 431, 437, 438, 455, 462, 463, 464, 495, 496, 504, 533, 534, 540, 551, 567, 577, 587, 595, 606, 614, 622, 630, 641, 647, 653, 659, 665, 672, 683, 684, 699, 700, 715, 716, 725, 733, 740, 748, 756, 764, 780, 781, 794, 806, 807, 823, 824, 825, 837, 842, 847, 852, 859, 865, 873, 881, 890, 901, 911, 925, 935, 963, 964, 965, 966, 967, 999, 1000, 1001, 1002, 1023, 1024, 1045, 1046, 1047, 1048, 1061, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1224, 1225, 1226, 1227, 1260, 1261, 1262, 1263, 1278],
        // Group 4
        [420, 421, 631, 782, 783, 795, 891, 936, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1228, 1229, 1230, 1231],
        // Group 5
        [25, 26, 46, 74, 75, 76, 99, 119, 120, 140, 389, 390, 405, 416, 417, 447, 476, 493, 494, 566, 576, 585, 594, 603, 621, 629, 671, 681, 682, 697, 698, 713, 714, 724, 732, 739, 747, 755, 763, 778, 779, 793, 804, 805, 820, 821, 822, 872, 880, 889, 900, 910, 923, 924, 934, 958, 959, 960, 961, 962, 995, 996, 997, 998, 1021, 1022, 1041, 1042, 1043, 1044, 1060, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1220, 1221, 1222, 1223, 1256, 1257, 1258, 1259],
        // Group 6
        [27, 28, 47, 77, 78, 79, 100, 121, 122, 141, 391, 392, 448, 477, 604],
        // Group 7
        [404, 564, 565, 574, 575, 600, 601, 620, 774, 775, 776, 777, 871, 887, 888, 898, 899, 908, 909, 921, 922, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129],
        // Group 8
        [13, 14, 40, 56, 57, 58, 93, 107, 108, 134, 369, 377, 378, 401, 410, 411, 433, 434, 441, 452, 470, 483, 489, 490, 527, 528, 537, 542, 548, 561, 571, 581, 590, 597, 611, 617, 626, 636, 638, 644, 650, 656, 662, 668, 675, 676, 691, 692, 707, 708, 721, 729, 736, 744, 752, 760, 768, 769, 790, 800, 801, 814, 815, 816, 835, 840, 845, 850, 855, 862, 868, 877, 884, 895, 905, 915, 916, 931, 941, 943, 944, 945, 946, 947, 983, 984, 985, 986, 1015, 1016, 1029, 1030, 1031, 1032, 1057, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1208, 1209, 1210, 1211, 1248, 1249, 1250, 1251, 1276],
        // Group 9
        [373, 553, 569, 579, 589, 596, 609, 624, 634, 642, 648, 654, 660, 666, 673, 860, 866, 875, 903, 913, 928, 929, 939]
    ]
    return Set(groups.joined())
}()

/// Computes one entry of the `predictionUsable` table.
///
/// - Parameter rcid: The right class ID to evaluate.
/// - Returns: `false` when `rcid` is one of the listed class IDs, `true` for every other
///   value (including values outside `0...1319`).
static func _getPredictionUsable(_ rcid: Int) -> Bool {
    // A single membership test against the cached set replaces constructing up to nine
    // temporary sets per call; the accepted and rejected IDs are unchanged.
    !_predictionUnusableRCIDs.contains(rcid)
}
/// Reports whether `data` passes the left-class-ID filter below.
///
/// Judging by the name, this decides whether a value for `data` should be remembered —
/// confirm at the call sites.
/// - Parameter data: The dictionary entry whose `lcid` is inspected.
/// - Returns: `false` when `data.lcid` is `0`, `1316`, or lies in one of the ranges
///   `6...9`, `147...554`, `557...560`, `1297...1305`; `true` otherwise.
@inlinable static func needWValueMemory(_ data: DicdataElement) -> Bool {
    switch data.lcid {
    case 147...554, 557...560, 1297...1305, 6...9:
        // One of the excluded class-ID ranges.
        return false
    case 0, 1316:
        // Individually excluded class IDs.
        return false
    default:
        return true
    }
}
/// Lookup table from a short Latin-letter key to a list of kana strings.
///
/// Where the values are visible they are katakana strings whose romaji spelling begins with
/// the key (for example `"ky"` lists キャ, キィ, キュ, キェ, キョ) — presumably the kana that a
/// partially typed romaji sequence can still turn into; confirm against the call sites.
/// NOTE(review): many values here are empty string literals; verify against the intended
/// table contents that no kana characters have been lost from these literals.
static let possibleNexts: [String: [String]] = [
"x": ["", "", "", "", "", "", "", "", "", ""],
"l": ["", "", "", "", "", "", "", "", "", ""],
"xt": [""],
"lt": [""],
"xts": [""],
"lts": [""],
"xy": ["", "", ""],
"ly": ["", "", ""],
"xw": [""],
"lw": [""],
"v": [""],
"k": ["", "", "", "", ""],
"q": ["クァ", "クィ", "クゥ", "クェ", "クォ"],
"qy": ["クャ", "クィ", "クュ", "クェ", "クョ"],
"qw": ["クヮ", "クィ", "クゥ", "クェ", "クォ"],
"ky": ["キャ", "キィ", "キュ", "キェ", "キョ"],
"g": ["", "", "", "", ""],
"gy": ["ギャ", "ギィ", "ギュ", "ギェ", "ギョ"],
"s": ["", "", "", "", ""],
"sy": ["シャ", "シィ", "シュ", "シェ", "ショ"],
"sh": ["シャ", "シィ", "シュ", "シェ", "ショ"],
"z": ["", "", "", "", ""],
"zy": ["ジャ", "ジィ", "ジュ", "ジェ", "ジョ"],
"j": [""],
"t": ["", "", "", "", ""],
"ty": ["チャ", "チィ", "チュ", "チェ", "チョ"],
"ts": [""],
"th": ["テャ", "ティ", "テュ", "テェ", "テョ"],
"tw": ["トァ", "トィ", "トゥ", "トェ", "トォ"],
"cy": ["チャ", "チィ", "チュ", "チェ", "チョ"],
"ch": [""],
"d": ["", "", "", "", ""],
"dy": ["ヂャ", "ヂィ", "ヂュ", "ヂェ", "ヂョ"],
"dh": ["デャ", "ディ", "デュ", "デェ", "デョ"],
"dw": ["ドァ", "ドィ", "ドゥ", "ドェ", "ドォ"],
"n": ["", "", "", "", "", ""],
"ny": ["ニャ", "ニィ", "ニュ", "ニェ", "ニョ"],
"h": ["", "", "", "", ""],
"hy": ["ヒャ", "ヒィ", "ヒュ", "ヒェ", "ヒョ"],
"hw": ["ファ", "フィ", "フェ", "フォ"],
"f": [""],
"b": ["", "", "", "", ""],
"by": ["ビャ", "ビィ", "ビュ", "ビェ", "ビョ"],
"p": ["", "", "", "", ""],
"py": ["ピャ", "ピィ", "ピュ", "ピェ", "ピョ"],
"m": ["", "", "", "", ""],
"my": ["ミャ", "ミィ", "ミュ", "ミェ", "ミョ"],
"y": ["", "", "イェ", ""],
"r": ["", "", "", "", ""],
"ry": ["リャ", "リィ", "リュ", "リェ", "リョ"],
"w": ["", "ウィ", "ウェ", ""],
"wy": ["", ""]
]
}