Files
AzooKeyKanaKanjiConverter/Sources/KanaKanjiConverterModule/DicdataStore/DicdataStore.swift
Miwa 55ffe3c708 [Experimental] Zenzai (#92)
* experimental rinna integration

* Update impl

* update

* Bump swift-actions/setup-swift from 1 to 2

Bumps [swift-actions/setup-swift](https://github.com/swift-actions/setup-swift) from 1 to 2.
- [Release notes](https://github.com/swift-actions/setup-swift/releases)
- [Commits](https://github.com/swift-actions/setup-swift/compare/v1...v2)

---
updated-dependencies:
- dependency-name: swift-actions/setup-swift
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* change test

* change impl

* take gpt2 weight as option

* don't use async

* support status check

* enhance error

* avoid percent encode

* update

* GPT-2 based kana-kanji conversion is now working perfectly

* fix a bug

* Rename gpt2/llama -> zenz

* cleanup

* internal apiを綺麗にした

* cleanup experimental commands

* update

* partially support incremental input using cache

* fix names

* fix bug

* support roman2kana

* cleanup

* fix minor bugs

* improve logic

* fix minor bug

* fix minor bug

* fix minor bug

* optimize

* optimize performance

* Optimize cache hit

* cli: add anco session command

* fix cache hit bugs

* improve session commands

* maybe this will work better for incremental input environment

* speed up zenzai by using n_best alternatives

* update zenz context

* adding no_typo api

* add inference limit

* fix bug

* reset install_cli

* make package buildable -- but llama.cpp features just do not work at this point because metal is not preprocessed

* add proper availability checks

* change macOS minimum version

* fix several problems

* code cleanup

* enable ubuntu build

* fix build error

* fix ubuntu build

* fix borrowing

* update install_cli.sh

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-15 01:36:45 +09:00

989 lines
48 KiB
Swift
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// DicdataStore.swift
// Keyboard
//
// Created by ensan on 2020/09/17.
// Copyright © 2020 ensan. All rights reserved.
//
import Foundation
import SwiftUtils
public final class DicdataStore {
/// Creates a store that uses the given conversion options and immediately loads its resources.
/// - Parameter convertRequestOptions: Options that, among other things, locate the dictionary resources.
public init(convertRequestOptions: ConvertRequestOptions) {
    requestOptions = convertRequestOptions
    setup()
}
/// Module-internal initializer; identical to the public one except that the options default to
/// `.default` and a debug message is logged before the resources are loaded.
/// - Parameter requestOptions: Options that, among other things, locate the dictionary resources.
init(requestOptions options: ConvertRequestOptions = .default) {
    requestOptions = options
    debug("DicdataStoreが初期化されました")
    setup()
}
/// One flag per former class ID: `true` once the matching row of `ccLines` has been loaded by `getCCValue`.
private var ccParsed: [Bool] = .init(repeating: false, count: 1319)
/// Lazily loaded connection-cost rows, indexed by former class ID and keyed by latter class ID.
private var ccLines: [[Int: PValue]] = []
/// Flat table read from `mm.binary`; `getMMValue` indexes it as `former * midCount + latter`.
private var mmValue: [PValue] = []
/// LOUDS instances that loaded successfully, keyed by identifier.
private var loudses: [String: LOUDS] = [:]
/// Identifiers for which `loadLOUDS` has already attempted a load (successful or not).
private var importedLoudses: Set<String> = []
/// Character-to-ID table built from `louds/charID.chid` in `setup()`.
private var charsID: [Character: UInt8] = [:]
/// Handles learned data (saving, forgetting, temporary matches).
private var learningManager = LearningManager()
/// Entries most recently supplied via `.importDynamicUserDict` / `.importOSUserDict`.
private var dynamicUserDict: [DicdataElement] = []
/// Upper limit on how far past `fromIndex` the range lookups will look.
/// - TODO: make this value as an option
public let maxlength: Int = 20
/// Score threshold used by `shouldBeRemoved`: values below it are always rejected.
/// - TODO: make this value as an option
public let threshold: PValue = -17
/// Row stride of `mmValue` (see `getMMValue`), also used to size the fallback table in `setup()`.
private let midCount = 502
/// NOTE(review): not referenced in the visible part of this file; it matches the hard-coded size of `ccParsed` — confirm usage.
private let cidCount = 1319
/// Options currently in effect; replaced through `.setRequestOptions`.
private var requestOptions: ConvertRequestOptions = .default
/// Formatter configured in `setup()` for spelled-out numbers in the `ja-JP` locale.
private let numberFormatter = NumberFormatter()
/// (Re)initializes every piece of state derived from `requestOptions`.
///
/// This runs from both initializers and again from `sendToDicdataStore` when the dictionary
/// resource URL changes, so every cache that belongs to the previous resources is dropped here.
private func setup() {
    numberFormatter.numberStyle = .spellOut
    numberFormatter.locale = .init(identifier: "ja-JP")

    // Connection-cost rows are filled lazily by `getCCValue`. The "already parsed" flags have to be
    // cleared together with the rows; otherwise a second `setup()` would leave previously loaded
    // rows empty while still marked as parsed, and `getCCValue` would return its default forever.
    self.ccLines = [[Int: PValue]].init(repeating: [:], count: CIDData.totalCount)
    self.ccParsed = [Bool].init(repeating: false, count: self.ccParsed.count)

    // LOUDS instances cached for the previous resources must not be combined with data files read
    // through the new options, so forget them and let `loadLOUDS` load them again on demand.
    self.loudses.removeAll()
    self.importedLoudses.removeAll()

    let resourceURL = self.requestOptions.dictionaryResourceURL

    // Character -> ID table: the n-th character of the file gets ID n.
    do {
        let string = try String(contentsOf: resourceURL.appendingPathComponent("louds/charID.chid", isDirectory: false), encoding: String.Encoding.utf8)
        charsID = [Character: UInt8].init(uniqueKeysWithValues: string.enumerated().map {($0.element, UInt8($0.offset))})
    } catch {
        debug("ファイルが存在しません: \(error)")
    }

    // `mm.binary` is a flat array of Float; fall back to an all-zero table when it cannot be read.
    do {
        let binaryData = try Data(contentsOf: resourceURL.appendingPathComponent("mm.binary", isDirectory: false), options: [.uncached])
        self.mmValue = binaryData.toArray(of: Float.self).map {PValue($0)}
    } catch {
        debug("Failed to read the file.")
        self.mmValue = [PValue].init(repeating: .zero, count: self.midCount * self.midCount)
    }

    // Load the user and memory dictionaries up front; the results are cached by `loadLOUDS`.
    _ = self.loadLOUDS(identifier: "user")
    _ = self.loadLOUDS(identifier: "memory")
}
/// Messages handled by `sendToDicdataStore(_:)`.
public enum Notification {
/// Use `importDynamicUserDict` for data that cannot be obtained statically.
/// - warning: Too many dynamic user dictionary entries will hurt conversion performance, because the dynamic user dictionary is looked up with inefficient algorithms. If your entries can be listed statically, use normal user dictionaries instead.
case importDynamicUserDict([DicdataElement])
@available(*, deprecated, renamed: "importDynamicUserDict", message: "it will be removed in AzooKeyKanaKanjiConverter v1.0")
case importOSUserDict([DicdataElement])
/// Replaces the options in effect; resources are reloaded when the dictionary resource URL differs.
case setRequestOptions(ConvertRequestOptions)
/// Asks the learning manager to forget the data of the given candidate.
case forgetMemory(Candidate)
/// Saves learned data and drops the cached "memory" and "user" LOUDS.
case closeKeyboard
}
/// Applies a notification to the store.
/// - Parameter data: The notification to handle.
func sendToDicdataStore(_ data: Notification) {
    switch data {
    case .closeKeyboard:
        closeKeyboard()
    case .importOSUserDict(let entries), .importDynamicUserDict(let entries):
        dynamicUserDict = entries
    case .forgetMemory(let candidate):
        learningManager.forgetMemory(data: candidate.data)
        // Drop the cached "memory" LOUDS so that it is loaded again on next use.
        reloadMemory()
    case .setRequestOptions(let newOptions):
        // Resources only have to be set up again when they live at a different URL.
        let resourceURLChanged = newOptions.dictionaryResourceURL != requestOptions.dictionaryResourceURL
        requestOptions = newOptions
        if resourceURLChanged {
            setup()
        }
        if learningManager.setRequestOptions(options: newOptions) {
            reloadMemory()
        }
    }
}
/// Returns the ID registered for `character`, or `UInt8.max` when the character has no ID.
func character2charId(_ character: Character) -> UInt8 {
    charsID[character] ?? .max
}
/// Forgets the cached "memory" LOUDS so that the next `loadLOUDS` call loads it again.
private func reloadMemory() {
    let identifier = "memory"
    loudses[identifier] = nil
    importedLoudses.remove(identifier)
}
/// Forgets the cached "user" LOUDS so that the next `loadLOUDS` call loads it again.
private func reloadUser() {
    let identifier = "user"
    loudses[identifier] = nil
    importedLoudses.remove(identifier)
}
/// Persists learned data, then invalidates the cached "memory" and "user" LOUDS.
private func closeKeyboard() {
    learningManager.save()
    // Both caches are dropped after saving so they are read again on next use.
    reloadMemory()
    reloadUser()
}
/// Returns the penalty unit for an entry: `-2.0` divided by the number of characters in its word.
/// - Parameter data: The entry whose `word` length determines the penalty.
@inlinable static func getPenalty(data: borrowing DicdataElement) -> PValue {
    let length = PValue(data.word.count)
    return -2.0 / length
}
/// Decides whether a score is too low to keep.
/// - Parameters:
///   - value: The score to test.
///   - wordCount: Character count used to compute the allowed margin above `threshold`.
/// - Returns: `true` when `value` is below `threshold`, or when its margin above `threshold`
///   is smaller than `2.0 / wordCount`.
private func shouldBeRemoved(value: PValue, wordCount: Int) -> Bool {
    let margin = value - threshold
    return margin < 0 || -2.0 / PValue(wordCount) < -margin
}
/// Decides whether an entry's score is too low to keep.
/// - Parameter data: The entry to test.
/// - Returns: `true` when the entry's value is below `threshold`, or when its margin above
///   `threshold` is smaller than the magnitude of `getPenalty(data:)`.
@inlinable func shouldBeRemoved(data: borrowing DicdataElement) -> Bool {
    let margin = data.value() - self.threshold
    return margin < 0 || Self.getPenalty(data: data) < -margin
}
/// Returns the LOUDS for `identifier`, loading it on first use.
///
/// A load is attempted at most once per identifier until the cache entry is invalidated;
/// after a failed attempt, later calls return `nil` without retrying.
/// - Parameter identifier: Name of the LOUDS to fetch.
/// - Returns: The cached or freshly loaded LOUDS, or `nil` when it could not be loaded.
func loadLOUDS(identifier: String) -> LOUDS? {
    // `inserted == false` means a load was already attempted: answer from the cache.
    guard importedLoudses.insert(identifier).inserted else {
        return loudses[identifier]
    }
    guard let loaded = LOUDS.load(identifier, option: requestOptions) else {
        debug("loudsの読み込みに失敗、identifierは\(identifier)")
        return nil
    }
    loudses[identifier] = loaded
    return loaded
}
/// Looks up the single node that exactly matches `charIDs` in the given LOUDS.
/// - Returns: A one-element array with the node index, or an empty array when the LOUDS
///   is unavailable or no node matches.
private func perfectMatchLOUDS(identifier: String, charIDs: [UInt8]) -> [Int] {
    guard let louds = loadLOUDS(identifier: identifier),
          let nodeIndex = louds.searchNodeIndex(chars: charIDs) else {
        return []
    }
    return [nodeIndex]
}
/// Returns the slice of `byfixNodeIndices(chars:)` selected by `depth`.
///
/// The result of `byfixNodeIndices` is sliced from `depth.lowerBound + 1` up to (but excluding)
/// `depth.upperBound + 1`, with both bounds clamped to the end of the result.
/// - Returns: The selected node indices, or an empty array when the LOUDS is unavailable.
private func throughMatchLOUDS(identifier: String, charIDs: [UInt8], depth: Range<Int>) -> [Int] {
    guard let louds = loadLOUDS(identifier: identifier) else {
        return []
    }
    let nodeIndices = louds.byfixNodeIndices(chars: charIDs)
    let end = nodeIndices.endIndex
    let lower = min(depth.lowerBound + 1, end)
    let upper = min(depth.upperBound + 1, end)
    return Array(nodeIndices[lower ..< upper])
}
/// Returns the node indices reported by `prefixNodeIndices(chars:maxDepth:)` for `charIDs`.
/// - Parameter depth: Passed through as `maxDepth`; unlimited by default.
/// - Returns: The node indices, or an empty array when the LOUDS is unavailable.
private func prefixMatchLOUDS(identifier: String, charIDs: [UInt8], depth: Int = .max) -> [Int] {
    loadLOUDS(identifier: identifier)?.prefixNodeIndices(chars: charIDs, maxDepth: depth) ?? []
}
/// Reads the dictionary entries stored for the given node indices.
///
/// Indices are grouped by `index >> 11` (2048 per group). Each group is read via
/// `LOUDS.getDataForLoudstxt3` under the name `identifier` + group number, using the
/// low 11 bits of each index as the position inside the group.
/// - Returns: All entries found; groups are visited in unspecified order.
package func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>) -> [DicdataElement] {
    debug("getDicdataFromLoudstxt3", identifier, indices)
    let shards = Dictionary(grouping: indices) { $0 >> 11 }
    return shards.flatMap { shard in
        LOUDS.getDataForLoudstxt3(identifier + "\(shard.key)", indices: shard.value.map { $0 & 2047 }, option: self.requestOptions)
    }
}
/// Collects lattice nodes for every dictionary entry whose reading starts at `fromIndex`
/// and ends at one of the positions in the requested range.
///
/// Candidate readings, including typo-corrected variants with their penalties, come from
/// `inputData.getRangesWithTypos`. Entries reached through a penalized reading have their value
/// lowered and are dropped when they fall under the removal criterion.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: Index into `inputData.input` at which every returned node starts.
///   - toIndexRange: End positions to consider. When `nil`, the range runs from `fromIndex`
///     to the end of the input. The upper bound is always capped at `fromIndex + maxlength`.
/// - Returns: The nodes found, or an empty array when the indices are inconsistent.
///   Nodes get a BOS node as predecessor when `fromIndex` is zero.
public func getLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil) -> [LatticeNode] {
    let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
    let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
    debug("getLOUDSDataInRange", fromIndex, toIndexRange?.description ?? "nil", toIndexLeft, toIndexRight)
    if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
        debug("getLOUDSDataInRange: index is wrong")
        return []
    }
    // segments[k] is the katakana form of input[fromIndex ... fromIndex + k].
    let segments = (fromIndex ..< toIndexRight).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
        segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
    }
    // Candidate readings mapped to their end index and typo penalty.
    var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
    // Each candidate reading paired with its character-ID sequence.
    let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
    let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
    // Readings are grouped by first character because that character names the LOUDS to search.
    let group = [Character: [([Character], [UInt8])]].init(grouping: stringSet, by: {$0.0.first!})
    let depth = minCharIDsCount - 1 ..< maxCharIDsCount
    var indices: [(String, Set<Int>)] = group.map {dic in
        let key = String(dic.key)
        let set = dic.value.flatMapSet {(_, charIDs) in self.throughMatchLOUDS(identifier: key, charIDs: charIDs, depth: depth)}
        return (key, set)
    }
    indices.append(("user", stringSet.flatMapSet {self.throughMatchLOUDS(identifier: "user", charIDs: $0.1, depth: depth)}))
    if learningManager.enabled {
        indices.append(("memory", stringSet.flatMapSet {self.throughMatchLOUDS(identifier: "memory", charIDs: $0.1, depth: depth)}))
    }
    // Fetch the entries behind the node indices and apply typo penalties.
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = stringToInfo[rubyArray, default: (0, .zero)].penalty
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }
    // Temporary learned data gets the same penalty treatment.
    for (_, charIds) in consume stringSet {
        for data in self.learningManager.temporaryThroughMatch(charIDs: consume charIds, depth: depth) {
            let rubyArray = Array(data.ruby)
            let penalty = stringToInfo[rubyArray, default: (0, .zero)].penalty
            if penalty.isZero {
                dicdata.append(data)
                // An unpenalized entry is kept as is. Without this `continue` the code fell
                // through and could append the same entry a second time via `adjustedData(0)`.
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }
    // Generated entries and dynamic user dictionary matches for every end position.
    // Their readings are registered with penalty 0 so that the node construction below finds them.
    for i in toIndexLeft ..< toIndexRight {
        do {
            let result = self.getWiseDicdata(convertTarget: segments[i - fromIndex], inputData: inputData, inputRange: fromIndex ..< i + 1)
            for item in result {
                stringToInfo[Array(item.ruby)] = (i, 0)
            }
            dicdata.append(contentsOf: result)
        }
        do {
            let result = self.getMatchOSUserDict(segments[i - fromIndex])
            for item in result {
                stringToInfo[Array(item.ruby)] = (i, 0)
            }
            dicdata.append(contentsOf: result)
        }
    }
    // Entries whose reading has no known end index are skipped.
    let startsAtHead = fromIndex == .zero
    return dicdata.compactMap { (element) -> LatticeNode? in
        guard let endIndex = stringToInfo[Array(element.ruby)]?.endIndex else {
            return nil
        }
        let node = LatticeNode(data: element, inputRange: fromIndex ..< endIndex + 1)
        if startsAtHead {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
}
/// Variant of `getLOUDSDataInRange` that uses `inputData.getRanges` (no typo penalties are applied here)
/// and always adds a fallback node for the single character at `fromIndex`.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: Index into `inputData.input` at which every returned node starts.
///   - toIndexRange: End positions to consider. When `nil`, the range runs from `fromIndex` to the end of the input. The upper bound is always capped at `fromIndex + maxlength`.
/// - Returns: The nodes found plus the single-character node, or an empty array when the indices are inconsistent.
public func getFrozenLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil) -> [LatticeNode] {
let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
// NOTE(review): the debug labels below say "getLOUDSDataInRange" although this is the frozen variant.
debug("getLOUDSDataInRange", fromIndex, toIndexRange?.description ?? "nil", toIndexLeft, toIndexRight)
if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
debug("getLOUDSDataInRange: index is wrong")
return []
}
// segments[k] is the katakana form of input[fromIndex ... fromIndex + k].
let segments = (fromIndex ..< toIndexRight).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
}
// Fallback node that covers only the first character, with a fixed value of -10.
let character = String(inputData.input[fromIndex].character.toKatakana())
// NOTE(review): `CIDData..cid` / `MIDData..mid` appear to have lost a non-ASCII case name in this copy of the file — restore from the original source.
let characterNode = LatticeNode(data: DicdataElement(word: character, ruby: character, cid: CIDData..cid, mid: MIDData..mid, value: -10), inputRange: fromIndex ..< fromIndex + 1)
if fromIndex == .zero {
characterNode.prevs.append(.BOSNode())
}
// Candidate readings mapped to their end index.
var stringToEndIndex = inputData.getRanges(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
// Shortest and longest candidate; with no candidates only the fallback node is returned.
guard let (minString, maxString) = stringToEndIndex.keys.minAndMax(by: {$0.count < $1.count}) else {
return [characterNode]
}
let maxIDs = maxString.map(self.character2charId)
// LOUDS to search: the one named after the first character of a candidate, plus "user" (and "memory" when learning is enabled).
var keys = [String(stringToEndIndex.keys.first!.first!), "user"]
if learningManager.enabled {
keys.append("memory")
}
// Fetch the entries along the longest candidate within the depth range.
var dicdata: [DicdataElement] = []
let depth = minString.count - 1 ..< maxString.count
for identifier in keys {
dicdata.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: identifier, indices: self.throughMatchLOUDS(identifier: identifier, charIDs: maxIDs, depth: depth)))
}
if learningManager.enabled {
// Temporary learned data is added without any penalty adjustment.
dicdata.append(contentsOf: self.learningManager.temporaryThroughMatch(charIDs: consume maxIDs, depth: depth))
}
// Generated entries and dynamic user dictionary matches for every end position.
for i in toIndexLeft ..< toIndexRight {
dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segments[i - fromIndex], inputData: inputData, inputRange: fromIndex ..< i + 1))
dicdata.append(contentsOf: self.getMatchOSUserDict(segments[i - fromIndex]))
}
// Entries whose reading is not a key of `stringToEndIndex` are skipped.
if fromIndex == .zero {
return dicdata.compactMap {
guard let endIndex = stringToEndIndex[Array($0.ruby)] else {
return nil
}
let node = LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
node.prevs.append(RegisteredNode.BOSNode())
return node
} + [characterNode]
} else {
return dicdata.compactMap {
guard let endIndex = stringToEndIndex[Array($0.ruby)] else {
return nil
}
return LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
} + [characterNode]
}
}
/// Collects lattice nodes for the dictionary entries whose reading exactly matches
/// `inputData.input[fromIndex...toIndex]` (or one of its typo-corrected variants).
///
/// Entries reached through a penalized reading have their value lowered and are dropped
/// when they fall under the removal criterion.
/// - Parameters:
///   - inputData: The composing text to look up.
///   - fromIndex: First index of the target range (inclusive).
///   - toIndex: Last index of the target range (inclusive).
/// - Returns: The nodes found, or an empty array when the range is reversed or longer than
///   `maxlength`. Nodes get a BOS node as predecessor when `fromIndex` is zero.
public func getLOUDSData(inputData: ComposingText, from fromIndex: Int, to toIndex: Int) -> [LatticeNode] {
    if toIndex - fromIndex > self.maxlength || fromIndex > toIndex {
        return []
    }
    let segment = inputData.input[fromIndex...toIndex].reduce(into: "") {$0.append($1.character)}.toKatakana()
    // Candidate readings mapped to their typo penalty.
    let string2penalty = inputData.getRangeWithTypos(fromIndex, toIndex)
    // Each candidate reading paired with its character-ID sequence.
    let strings = string2penalty.keys.map {
        (key: $0, charIDs: $0.map(self.character2charId))
    }
    // Readings are grouped by first character because that character names the LOUDS to search.
    let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})
    var indices: [(String, Set<Int>)] = group.map {dic in
        let head = String(dic.key)
        let set = dic.value.flatMapSet { (_, charIDs) in
            self.perfectMatchLOUDS(identifier: head, charIDs: charIDs)
        }
        return (head, set)
    }
    do {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchLOUDS(identifier: "user", charIDs: charIDs)
        }
        indices.append(("user", set))
    }
    if learningManager.enabled {
        let set = strings.flatMapSet { (_, charIDs) in
            self.perfectMatchLOUDS(identifier: "memory", charIDs: charIDs)
        }
        indices.append(("memory", set))
    }
    // Fetch the entries behind the node indices and apply typo penalties.
    var dicdata: [DicdataElement] = []
    for (identifier, value) in indices {
        let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
            let rubyArray = Array(data.ruby)
            let penalty = string2penalty[rubyArray, default: .zero]
            if penalty.isZero {
                return data
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
                return nil
            }
            return data.adjustedData(adjust)
        }
        dicdata.append(contentsOf: result)
    }
    // Temporary learned data gets the same penalty treatment.
    for (characters, charIds) in consume strings {
        for data in self.learningManager.temporaryPerfectMatch(charIDs: consume charIds) {
            // The penalty is looked up by the candidate reading that produced this match.
            let penalty = string2penalty[characters, default: .zero]
            if penalty.isZero {
                dicdata.append(data)
                // An unpenalized entry is kept as is. Without this `continue` the code fell
                // through and could append the same entry a second time via `adjustedData(0)`.
                continue
            }
            let ratio = Self.penaltyRatio[data.lcid]
            let pUnit: PValue = Self.getPenalty(data: data) / 2
            let adjust = pUnit * penalty * ratio
            if self.shouldBeRemoved(value: data.value() + adjust, wordCount: characters.count) {
                continue
            }
            dicdata.append(data.adjustedData(adjust))
        }
    }
    // Generated entries and dynamic user dictionary matches for the whole segment.
    dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segment, inputData: inputData, inputRange: fromIndex ..< toIndex + 1))
    dicdata.append(contentsOf: self.getMatchOSUserDict(segment))
    let startsAtHead = fromIndex == .zero
    return dicdata.map { (element) -> LatticeNode in
        let node = LatticeNode(data: element, inputRange: fromIndex ..< toIndex + 1)
        if startsAtHead {
            node.prevs.append(RegisteredNode.BOSNode())
        }
        return node
    }
}
/// Reads the prediction entries stored in `p/pc_<lastRcid>.csv` under the dictionary resources.
/// - Parameter lastRcid: Right class ID that selects the CSV file.
/// - Returns: One entry per non-empty line, or an empty array when the file cannot be read.
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
    let csvURL = requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false)
    let contents: String
    do {
        contents = try String(contentsOf: csvURL, encoding: .utf8)
    } catch {
        debug(error)
        return []
    }
    return contents.split(separator: "\n").map { line in
        // Empty fields are kept so that column positions stay fixed.
        self.parseLoudstxt2FormattedEntry(from: line.split(separator: ",", omittingEmptySubsequences: false))
    }
}
/// Collects entries whose reading starts with `key`, for use as prediction candidates.
///
/// At most 700 node indices are taken from each LOUDS. Entries from the LOUDS named after the
/// first character are filtered through `predictionUsable`; user and memory entries are not.
/// - Parameter key: The reading prefix to search for.
/// - Returns: The entries found, or an empty array when `key` is empty.
func getPredictionLOUDSDicdata(key: some StringProtocol) -> [DicdataElement] {
    guard let head = key.first else {
        return []
    }
    let limit = 700
    let headIdentifier = String(head)
    let charIDs = key.map(self.character2charId)
    // Very short keys match a huge number of entries, so the search depth is limited for them.
    let depth: Int = switch key.count {
    case 1: 3
    case 2: 5
    default: Int.max
    }
    var result: [DicdataElement] = []

    let systemIndices = prefixMatchLOUDS(identifier: headIdentifier, charIDs: charIDs, depth: depth).prefix(limit)
    let systemEntries = getDicdataFromLoudstxt3(identifier: headIdentifier, indices: Set(systemIndices))
    result.append(contentsOf: systemEntries.filter { Self.predictionUsable[$0.rcid] })

    let userIndices = prefixMatchLOUDS(identifier: "user", charIDs: charIDs, depth: depth).prefix(limit)
    result.append(contentsOf: getDicdataFromLoudstxt3(identifier: "user", indices: Set(userIndices)))

    if learningManager.enabled {
        // The memory lookup is not depth-limited.
        let memoryIndices = prefixMatchLOUDS(identifier: "memory", charIDs: charIDs).prefix(limit)
        result.append(contentsOf: getDicdataFromLoudstxt3(identifier: "memory", indices: Set(memoryIndices)))
        result.append(contentsOf: learningManager.temporaryPrefixMatch(charIDs: charIDs))
    }
    return result
}
/// Builds an entry from six positional fields: ruby, word, lcid, rcid, mid, value.
///
/// An empty word falls back to the ruby, an unparsable lcid or mid to zero, an unparsable rcid
/// to the lcid, and an unparsable value to -30. The input must contain at least six fields.
private func parseLoudstxt2FormattedEntry(from dataString: [some StringProtocol]) -> DicdataElement {
    let ruby = String(dataString[0])
    let surface = dataString[1]
    let lcid = Int(dataString[2]) ?? .zero
    return DicdataElement(
        word: surface.isEmpty ? ruby : String(surface),
        ruby: ruby,
        lcid: lcid,
        rcid: Int(dataString[3]) ?? lcid,
        mid: Int(dataString[4]) ?? .zero,
        value: PValue(dataString[5]) ?? -30.0
    )
}
/// Generates entries that are computed from the input itself rather than read from a dictionary:
/// Japanese-number conversions, plain numbers and their spelled-out form, raw alphabet input,
/// roman-to-kana conversions, single-character kana forms, and related symbols.
/// - Parameters:
///   - convertTarget: The reading to generate entries for.
///   - inputData: The composing text that `convertTarget` was taken from.
///   - inputRange: The range of `inputData.input` that `convertTarget` covers.
/// - Returns: The generated entries; may be empty.
private func getWiseDicdata(convertTarget: String, inputData: ComposingText, inputRange: Range<Int>) -> [DicdataElement] {
    // NOTE(review): `CIDData..cid` / `MIDData..mid` below are reproduced exactly as they appear in
    // this copy of the file; a non-ASCII case name seems to have been lost — restore from the original source.
    var result: [DicdataElement] = []
    result.append(contentsOf: self.getJapaneseNumberDicdata(head: convertTarget))
    // Numbers are only handled when the target is not adjacent to another digit on either side.
    if inputData.input[..<inputRange.startIndex].last?.character.isNumber != true && inputData.input[inputRange.endIndex...].first?.character.isNumber != true, let number = Float(convertTarget) {
        result.append(DicdataElement(ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -14))
        // `Int(exactly:)` yields nil for values outside the range of `Int` (for example a 20-digit
        // number or "1e30"), where the plain `Int(number)` conversion would trap at runtime.
        if number.truncatingRemainder(dividingBy: 1) == 0,
           let int = Int(exactly: number),
           int < Int(1E18) && -Int(1E18) < int,
           let kansuji = self.numberFormatter.string(from: NSNumber(value: int)) {
            result.append(DicdataElement(word: kansuji, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -16))
        }
    }
    // On the English keyboard, alphabet-only input is offered unchanged.
    if requestOptions.keyboardLanguage == .en_US && convertTarget.onlyRomanAlphabet {
        result.append(DicdataElement(ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -14))
    }
    // On other keyboards, input typed entirely in roman2kana style is offered as hiragana and katakana.
    if requestOptions.keyboardLanguage != .en_US && inputData.input[inputRange].allSatisfy({$0.inputStyle == .roman2kana}) {
        if let katakana = Roman2Kana.katakanaChanges[convertTarget], let hiragana = Roman2Kana.hiraganaChanges[Array(convertTarget)] {
            result.append(DicdataElement(word: String(hiragana), ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -13))
            result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
        }
    }
    // A single character is always offered as katakana, and as hiragana when that differs.
    if convertTarget.count == 1 {
        let katakana = convertTarget.toKatakana()
        let hiragana = convertTarget.toHiragana()
        if convertTarget == katakana {
            result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
        } else {
            result.append(DicdataElement(word: hiragana, ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -13))
            result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
        }
    }
    // A single symbol is offered in its other width and as related symbols.
    // `value` drops by 5 after every appended entry so that earlier entries rank higher.
    if convertTarget.count == 1, let first = convertTarget.first {
        var value: PValue = -14
        let hs = Self.fullwidthToHalfwidth[first, default: first]
        if hs != first {
            result.append(DicdataElement(word: convertTarget, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
            value -= 5.0
            result.append(DicdataElement(word: String(hs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
            value -= 5.0
        }
        if let fs = Self.halfwidthToFullwidth[first], fs != first {
            result.append(DicdataElement(word: convertTarget, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
            value -= 5.0
            result.append(DicdataElement(word: String(fs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
            value -= 5.0
        }
        for group in Self.weakRelatingSymbolGroups where group.contains(hs) {
            for symbol in group where symbol != hs {
                result.append(DicdataElement(word: String(symbol), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
                value -= 5.0
                if let fs = Self.halfwidthToFullwidth[symbol] {
                    result.append(DicdataElement(word: String(fs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
                    value -= 5.0
                }
            }
        }
    }
    return result
}
// Two lookup tables built in one pass: each character of the symbol list is paired with the
// character at the same position after applying `.fullwidthToHalfwidth`, and the pair is
// recorded in both directions.
// NOTE(review): both string literals look identical in this copy and contain an unescaped quote;
// the fullwidth originals appear to have been normalized during extraction — restore from the original source.
private static let (fullwidthToHalfwidth, halfwidthToFullwidth) = zip(
"+ー*=・!#%&'"〜|£$¥@`;:<>,.\/_ ̄-",
"+ー*=・!#%&'"〜|£$¥@`;:<>,.\/_ ̄-".applyingTransform(.fullwidthToHalfwidth, reverse: false)!
)
.reduce(into: ([Character: Character](), [Character: Character]())) { (results: inout ([Character: Character], [Character: Character]), values: (Character, Character)) in
// results.0 maps listed character -> transformed character; results.1 is the reverse mapping.
results.0[values.0] = values.1
results.1[values.1] = values.0
}
// ()
// strongRelatingSymbolGroups
//
// 1
//
private static let weakRelatingSymbolGroups: [[Character]] = [
// ()
["", ""], //
["", "", "", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", "辻󠄀"],
["禰󠄀", ""],
["煉󠄁", ""],
["", ""], //
["", ""],
["", "𠮷"], //
["", "𣘺", "", "𫞎"],
["", "", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", ""],
//
["", "", "", "", ""], //
["^", ""], //
["¥", "$", "¢", "", "£", ""], //
["%", ""], //
["°", "", ""],
[""], //
["*", "", "✳︎", "✴︎"], //
["", "", "", ""],
["+", "±", ""],
["×", "", "✖️"],
["÷", "" ],
["<", "", "", "", "", "", "«"],
[">", "", "", "", "", "", "»"],
["=", "", "", ""],
[":", ";"],
["!", "❗️", "❣️", "‼︎", "⁉︎", "", "‼️", "⁉️", "¡"],
["?", "", "⁉︎", "", "", "⁉️", "¿"],
["", "", "", "☎︎"],
["", "", "", "", "", "", "", ""],
["", "", "", ""], //
["", "", "", "", "", "", "", "", "", "", "⚪︎", ""], //
["", "", "", "", "↙︎", "↖︎", "↘︎", "↗︎", "↔︎", "↕︎", "↪︎", "↩︎", ""], //
["", "", "", "", "", "", "", "𝄞", "𝄞"], //
["", "", ""] //
]
/// Reads a binary file as an array of `(Int32, Float)` pairs.
/// - Parameter url: Location of the file.
/// - Returns: The decoded pairs, or an empty array when the file cannot be read.
private func loadCCBinary(url: URL) -> [(Int32, Float)] {
    let contents: Data
    do {
        contents = try Data(contentsOf: url, options: [.uncached])
    } catch {
        debug("Failed to read the file.", error)
        return []
    }
    return contents.toArray(of: (Int32, Float).self)
}
/// Returns the dynamic user dictionary entries whose ruby equals `ruby`.
func getMatchOSUserDict(_ ruby: some StringProtocol) -> [DicdataElement] {
    dynamicUserDict.filter { entry in
        entry.ruby == ruby
    }
}
/// Returns the dynamic user dictionary entries whose ruby starts with `ruby`.
func getPrefixMatchOSUserDict(_ ruby: some StringProtocol) -> [DicdataElement] {
    dynamicUserDict.filter { entry in
        entry.ruby.hasPrefix(ruby)
    }
}
/// Feeds a committed candidate to the learning manager.
/// - Parameters:
///   - candidate: The candidate whose data is learned.
///   - previous: An optional entry that is placed in front of the candidate's data.
// TODO: previous
func updateLearningData(_ candidate: Candidate, with previous: DicdataElement?) {
    guard let previous else {
        learningManager.update(data: candidate.data)
        return
    }
    learningManager.update(data: [previous] + candidate.data)
}
/// Feeds a post-composition prediction to the learning manager.
/// - Parameters:
///   - candidate: The candidate the prediction was applied to.
///   - predictionCandidate: The prediction; its type decides which part of the data is updated.
// TODO: previous
func updateLearningData(_ candidate: Candidate, with predictionCandidate: PostCompositionPredictionCandidate) {
    switch predictionCandidate.type {
    case let .additional(data: appended):
        // The prediction extends the candidate: all existing data is kept.
        learningManager.update(data: candidate.data, updatePart: appended)
    case let .replacement(targetData: replaced, replacementData: substitute):
        // The prediction replaces the tail: drop as many entries as are being replaced.
        learningManager.update(data: candidate.data.dropLast(replaced.count), updatePart: substitute)
    }
}
/// Returns the connection value between two class IDs.
///
/// The row for `former` is read from `cb/<former>.binary` on first use and cached.
/// - Parameters:
///   - former: Class ID on the left side.
///   - latter: Class ID on the right side.
/// - Returns: The value stored for `latter`; otherwise the row's entry for key `-1`;
///   otherwise `-25`.
public func getCCValue(_ former: Int, _ latter: Int) -> PValue {
    if !ccParsed[former] {
        let rowURL = requestOptions.dictionaryResourceURL.appendingPathComponent("cb/\(former).binary", isDirectory: false)
        let pairs = loadCCBinary(url: rowURL).map { (Int($0.0), PValue($0.1)) }
        ccLines[former] = [Int: PValue].init(uniqueKeysWithValues: pairs)
        ccParsed[former] = true
    }
    let row = ccLines[former]
    return row[latter] ?? row[-1] ?? -25
}
/// Returns the value stored in `mmValue` for a pair of meaning IDs.
/// - Parameters:
///   - former: Meaning ID on the left side.
///   - latter: Meaning ID on the right side.
/// - Returns: `0` when either ID is 500; otherwise the entry at `former * midCount + latter`.
public func getMMValue(_ former: Int, _ latter: Int) -> PValue {
    guard former != 500 && latter != 500 else {
        return 0
    }
    let offset = former * midCount + latter
    return mmValue[offset]
}
private static let possibleLOUDS: Set<Character> = [
" ", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "*", "", "", "", "´", "¨", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "", "", "(", ")", "#", "%", "&", "^", "_", "'", "\""
]
/// Reports whether `character` is a member of `possibleLOUDS`.
static func existLOUDS(for character: Character) -> Bool {
    possibleLOUDS.contains(character)
}
/*
*
*
*
*
*
*
*/
/// Decides from the word types of two adjacent class IDs whether a clause boundary lies between them.
/// - Parameters:
///   - former: Class ID on the left side.
///   - latter: Class ID on the right side.
/// - Returns: `true` only when neither side has word type 3 (BOS/EOS), the latter has word
///   type 0 or 1, and the former does not have word type 0.
@inlinable static func isClause(_ former: Int, _ latter: Int) -> Bool {
    // The latter side is examined first; a BOS/EOS on either side never forms a boundary.
    let latterType = Self.wordTypes[latter]
    guard latterType != 3 else {
        return false
    }
    let formerType = Self.wordTypes[former]
    guard formerType != 3 else {
        return false
    }
    switch latterType {
    case 0, 1:
        return formerType != 0
    default:
        return false
    }
}
/// Class IDs of BOS and EOS; `_judgeWordType` maps them to 3. Used to build `wordTypes`.
private static let BOS_EOS_wordIDs: Set<Int> = [CIDData.BOS.cid, CIDData.EOS.cid]
/// Class IDs that `_judgeWordType` maps to 0. Used to build `wordTypes`.
private static let PREPOSITION_wordIDs: Set<Int> = [1315, 6, 557, 558, 559, 560]
/// Class IDs that `_judgeWordType` maps to 1: several ID ranges plus a few individual IDs. Used to build `wordTypes`.
private static let INPOSITION_wordIDs: Set<Int> = Set<Int>(
Array(561..<868).chained(1283..<1297).chained(1306..<1310).chained(11..<53).chained(555..<557).chained(1281..<1283)
).union([1314, 3, 2, 4, 5, 1, 9])
/*
private static let POSTPOSITION_wordIDs: Set<Int> = Set<Int>((7...8).map{$0}
+ (54..<555).map{$0}
+ (868..<1281).map{$0}
+ (1297..<1306).map{$0}
+ (1310..<1314).map{$0}
).union([10])
*/
/// Word-type code for every class ID from 0 through 1319, precomputed with `_judgeWordType`.
/// - Returns:
/// - 3 when BOS/EOS
/// - 0 when preposition
/// - 1 when core
/// - 2 when postposition
/// - Note: one `UInt8` per class ID, so the table holds 1320 bytes of element data.
public static let wordTypes = (0...1319).map(_judgeWordType)
/// Classifies a class ID; used only to build `wordTypes`.
/// - Returns: 3 for BOS/EOS IDs, 0 for IDs in `PREPOSITION_wordIDs`, 1 for IDs in
///   `INPOSITION_wordIDs`, and 2 for everything else. Earlier categories win.
private static func _judgeWordType(cid: Int) -> UInt8 {
    let typeCode: UInt8 = if BOS_EOS_wordIDs.contains(cid) {
        3
    } else if PREPOSITION_wordIDs.contains(cid) {
        0
    } else if INPOSITION_wordIDs.contains(cid) {
        1
    } else {
        2
    }
    return typeCode
}
/// Reports whether an entry takes part in the meaning-ID value calculation.
/// - Returns: `true` when either class ID of the entry lies in 895...1280 or 1297...1305,
///   or when either class ID has word type 1.
@inlinable static func includeMMValueCalculation(_ data: DicdataElement) -> Bool {
    let firstBand = 895...1280
    let secondBand = 1297...1305
    if firstBand.contains(data.lcid) || firstBand.contains(data.rcid) {
        return true
    }
    if secondBand.contains(data.lcid) || secondBand.contains(data.rcid) {
        return true
    }
    if wordTypes[data.lcid] == 1 {
        return true
    }
    return wordTypes[data.rcid] == 1
}
/// Typo-penalty multiplier for every class ID from 0 through 1319, precomputed with `_getTypoPenaltyRatio`.
/// NOTE(review): the original comment appears to mention about 2.6 KB of memory, which would imply a 2-byte `PValue` — confirm.
static let penaltyRatio = (0...1319).map(_getTypoPenaltyRatio)
/// Returns the typo-penalty multiplier for a left class ID; used only to build `penaltyRatio`.
/// - Returns: `2.5` for IDs in 147...554, `1` for every other ID.
static func _getTypoPenaltyRatio(_ lcid: Int) -> PValue {
    // The original comment splits this range into 147...368 and 369...554.
    let heavierRange = 147...554
    return heavierRange.contains(lcid) ? 2.5 : 1
}
/// For every class ID from 0 through 1319, whether `_getPredictionUsable` allows it.
/// `getPredictionLOUDSDicdata` indexes this table by an entry's right class ID to filter its results.
static let predictionUsable = (0...1319).map(_getPredictionUsable)
/// Decides whether a right class ID is usable for prediction.
///
/// Used to fill the `predictionUsable` table (one call per ID in `0...1319`).
///
/// The excluded IDs are matched with a `switch` over integer literals, so no `Set` has to be
/// allocated and hashed on each call.
///
/// Each `case` below is one group of IDs generated with a pipeline of the form
/// `cat cid.txt | grep <pattern> | awk '{print $1}' | xargs -I {} echo -n "{}, "`.
// NOTE(review): the grep pattern that defines each group is not recorded here — TODO restore the labels.
///
/// - Parameter rcid: The right class ID to test.
/// - Returns: `false` when `rcid` appears in any of the excluded groups, `true` otherwise
///   (including for IDs outside `0...1319`).
static func _getPredictionUsable(_ rcid: Int) -> Bool {
    switch rcid {
    // Group 1
    case 33, 34, 50, 86, 87, 88, 103, 127, 128, 144, 397, 398, 408, 426, 427, 450, 457, 480, 687, 688, 703, 704, 727, 742, 750, 758, 766, 786, 787, 798, 810, 811, 829, 830, 831, 893, 973, 974, 975, 976, 977, 1007, 1008, 1009, 1010, 1063, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1240, 1241, 1242, 1243, 1268, 1269, 1270, 1271:
        return false
    // Group 2
    case 15, 16, 17, 18, 41, 42, 59, 60, 61, 62, 63, 64, 94, 95, 109, 110, 111, 112, 135, 136, 379, 380, 381, 382, 402, 412, 413, 442, 443, 471, 472, 562, 572, 582, 591, 598, 618, 627, 677, 678, 693, 694, 709, 710, 722, 730, 737, 745, 753, 761, 770, 771, 791, 869, 878, 885, 896, 906, 917, 918, 932, 948, 949, 950, 951, 952, 987, 988, 989, 990, 1017, 1018, 1033, 1034, 1035, 1036, 1058, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1212, 1213, 1214, 1215:
        return false
    // Group 3
    case 372, 406, 418, 419, 431, 437, 438, 455, 462, 463, 464, 495, 496, 504, 533, 534, 540, 551, 567, 577, 587, 595, 606, 614, 622, 630, 641, 647, 653, 659, 665, 672, 683, 684, 699, 700, 715, 716, 725, 733, 740, 748, 756, 764, 780, 781, 794, 806, 807, 823, 824, 825, 837, 842, 847, 852, 859, 865, 873, 881, 890, 901, 911, 925, 935, 963, 964, 965, 966, 967, 999, 1000, 1001, 1002, 1023, 1024, 1045, 1046, 1047, 1048, 1061, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1224, 1225, 1226, 1227, 1260, 1261, 1262, 1263, 1278:
        return false
    // Group 4
    case 420, 421, 631, 782, 783, 795, 891, 936, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1228, 1229, 1230, 1231:
        return false
    // Group 5
    case 25, 26, 46, 74, 75, 76, 99, 119, 120, 140, 389, 390, 405, 416, 417, 447, 476, 493, 494, 566, 576, 585, 594, 603, 621, 629, 671, 681, 682, 697, 698, 713, 714, 724, 732, 739, 747, 755, 763, 778, 779, 793, 804, 805, 820, 821, 822, 872, 880, 889, 900, 910, 923, 924, 934, 958, 959, 960, 961, 962, 995, 996, 997, 998, 1021, 1022, 1041, 1042, 1043, 1044, 1060, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1220, 1221, 1222, 1223, 1256, 1257, 1258, 1259:
        return false
    // Group 6
    case 27, 28, 47, 77, 78, 79, 100, 121, 122, 141, 391, 392, 448, 477, 604:
        return false
    // Group 7
    case 404, 564, 565, 574, 575, 600, 601, 620, 774, 775, 776, 777, 871, 887, 888, 898, 899, 908, 909, 921, 922, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129:
        return false
    // Group 8
    case 13, 14, 40, 56, 57, 58, 93, 107, 108, 134, 369, 377, 378, 401, 410, 411, 433, 434, 441, 452, 470, 483, 489, 490, 527, 528, 537, 542, 548, 561, 571, 581, 590, 597, 611, 617, 626, 636, 638, 644, 650, 656, 662, 668, 675, 676, 691, 692, 707, 708, 721, 729, 736, 744, 752, 760, 768, 769, 790, 800, 801, 814, 815, 816, 835, 840, 845, 850, 855, 862, 868, 877, 884, 895, 905, 915, 916, 931, 941, 943, 944, 945, 946, 947, 983, 984, 985, 986, 1015, 1016, 1029, 1030, 1031, 1032, 1057, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1208, 1209, 1210, 1211, 1248, 1249, 1250, 1251, 1276:
        return false
    // Group 9
    case 373, 553, 569, 579, 589, 596, 609, 624, 634, 642, 648, 654, 660, 666, 673, 860, 866, 875, 903, 913, 928, 929, 939:
        return false
    default:
        return true
    }
}
/// Reports whether `data` passes the left-class-ID filter implemented here.
/// Judging by the name, a `true` result means the element's W value should be remembered —
/// confirm with the callers.
///
/// - Parameter data: The dictionary element; only its `lcid` is inspected.
/// - Returns: `false` when `lcid` is in `147...554`, `557...560`, `1297...1305` or `6...9`,
///   or equals `0` or `1316`; `true` for every other ID.
@inlinable static func needWValueMemory(_ data: DicdataElement) -> Bool {
    switch data.lcid {
    case 147...554,    // same span that `_getTypoPenaltyRatio` penalises
         557...560,
         1297...1305,
         6...9,
         0, 1316:      // two individually excluded IDs
        return false
    default:
        return true
    }
}
// NOTE(review): many values below are empty string literals (`""`). That looks like lost characters
// (encoding/extraction damage) rather than intent — verify against the upstream file before relying on them.
/// Lookup table keyed by a Latin-letter input prefix (e.g. `"ky"`, `"sh"`, `"xts"`).
/// Each value lists the strings associated with that prefix; the visible non-empty entries are
/// katakana such as `"キャ"`. Presumably these are the kana an unfinished romaji prefix may still
/// turn into — confirm with the callers.
static let possibleNexts: [String: [String]] = [
"x": ["", "", "", "", "", "", "", "", "", ""],
"l": ["", "", "", "", "", "", "", "", "", ""],
"xt": [""],
"lt": [""],
"xts": [""],
"lts": [""],
"xy": ["", "", ""],
"ly": ["", "", ""],
"xw": [""],
"lw": [""],
"v": [""],
"k": ["", "", "", "", ""],
"q": ["クァ", "クィ", "クゥ", "クェ", "クォ"],
"qy": ["クャ", "クィ", "クュ", "クェ", "クョ"],
"qw": ["クヮ", "クィ", "クゥ", "クェ", "クォ"],
"ky": ["キャ", "キィ", "キュ", "キェ", "キョ"],
"g": ["", "", "", "", ""],
"gy": ["ギャ", "ギィ", "ギュ", "ギェ", "ギョ"],
"s": ["", "", "", "", ""],
"sy": ["シャ", "シィ", "シュ", "シェ", "ショ"],
"sh": ["シャ", "シィ", "シュ", "シェ", "ショ"],
"z": ["", "", "", "", ""],
"zy": ["ジャ", "ジィ", "ジュ", "ジェ", "ジョ"],
"j": [""],
"t": ["", "", "", "", ""],
"ty": ["チャ", "チィ", "チュ", "チェ", "チョ"],
"ts": [""],
"th": ["テャ", "ティ", "テュ", "テェ", "テョ"],
"tw": ["トァ", "トィ", "トゥ", "トェ", "トォ"],
"cy": ["チャ", "チィ", "チュ", "チェ", "チョ"],
"ch": [""],
"d": ["", "", "", "", ""],
"dy": ["ヂャ", "ヂィ", "ヂュ", "ヂェ", "ヂョ"],
"dh": ["デャ", "ディ", "デュ", "デェ", "デョ"],
"dw": ["ドァ", "ドィ", "ドゥ", "ドェ", "ドォ"],
"n": ["", "", "", "", "", ""],
"ny": ["ニャ", "ニィ", "ニュ", "ニェ", "ニョ"],
"h": ["", "", "", "", ""],
"hy": ["ヒャ", "ヒィ", "ヒュ", "ヒェ", "ヒョ"],
"hw": ["ファ", "フィ", "フェ", "フォ"],
"f": [""],
"b": ["", "", "", "", ""],
"by": ["ビャ", "ビィ", "ビュ", "ビェ", "ビョ"],
"p": ["", "", "", "", ""],
"py": ["ピャ", "ピィ", "ピュ", "ピェ", "ピョ"],
"m": ["", "", "", "", ""],
"my": ["ミャ", "ミィ", "ミュ", "ミェ", "ミョ"],
"y": ["", "", "イェ", ""],
"r": ["", "", "", "", ""],
"ry": ["リャ", "リィ", "リュ", "リェ", "リョ"],
"w": ["", "ウィ", "ウェ", ""],
"wy": ["", ""]
]
}