Files
AzooKeyKanaKanjiConverter/Sources/KanaKanjiConverterModule/DictionaryManagement/DicdataStore.swift
2025-07-15 05:02:29 +09:00

1096 lines
52 KiB
Swift
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// DicdataStore.swift
// Keyboard
//
// Created by ensan on 2020/09/17.
// Copyright © 2020 ensan. All rights reserved.
//
import Algorithms
import Foundation
import SwiftUtils
public final class DicdataStore {
/// Creates a store configured by `convertRequestOptions`.
///
/// The options are stored first, then `setup()` loads the initial dictionary data.
public init(convertRequestOptions options: ConvertRequestOptions) {
    requestOptions = options
    setup()
}
/// Module-internal initializer; falls back to `.default` options when none are given.
///
/// Emits a debug log before running `setup()`.
init(requestOptions options: ConvertRequestOptions = .default) {
    requestOptions = options
    debug("DicdataStoreが初期化されました")
    setup()
}
/// Per-former-class-ID flags; an entry becomes `true` once `getCCValue` has loaded that ID's costs into `ccLines`.
private var ccParsed: [Bool] = .init(repeating: false, count: 1319)
/// Connection costs indexed by former class ID; each entry maps a latter class ID to its cost.
private var ccLines: [[Int: PValue]] = []
/// Flat table read by `getMMValue` as `mmValue[former * midCount + latter]`.
private var mmValue: [PValue] = []
/// LOUDS instances that have been loaded, keyed by query string (a first character, "user", "memory", ...).
private var loudses: [String: LOUDS] = [:]
/// Raw `loudstxt3` file contents cached by `preloadDictionary`, keyed by file name without extension.
private var loudstxts: [String: Data] = [:]
/// Queries for which `loadLOUDS` has already attempted a load, whether or not it succeeded.
private var importedLoudses: Set<String> = []
/// Character-to-ID table built from `louds/charID.chid` in `setup()`.
private var charsID: [Character: UInt8] = [:]
/// Owner of the learning ("memory") data; saved on `closeKeyboard` and consulted during lookups.
private var learningManager = LearningManager()
/// Entries supplied at runtime through `Notification.importDynamicUserDict`.
private var dynamicUserDict: [DicdataElement] = []
/// Upper bound on how far past the start index `lookupDicdata` extends its end-index range.
/// - TODO: make this value as an option
public let maxlength: Int = 20
/// Score floor used by `shouldBeRemoved`; entries scoring below it are discarded.
/// - TODO: make this value as an option
public let threshold: PValue = -17
/// Row stride of `mmValue` (see `getMMValue`) and side length of the zero-filled fallback table.
private let midCount = 502
/// NOTE(review): not referenced in the visible part of this file; matches the hard-coded length of `ccParsed`.
private let cidCount = 1319
/// Options currently in effect; replaced through `Notification.setRequestOptions`.
private var requestOptions: ConvertRequestOptions = .default
/// Formatter configured in `setup()` (spell-out style, "ja-JP" locale) and used by `getWiseDicdata`.
private let numberFormatter = NumberFormatter()
/// Loads (or reloads) everything the store needs before it can answer queries.
///
/// Runs from both initializers and again from `sendToDicdataStore` when the dictionary
/// resource URL changes. Because of the second path, every cache derived from the previous
/// resources is dropped here. `ccParsed` in particular must be reset together with `ccLines`:
/// otherwise `getCCValue` treats the emptied rows as already loaded and returns only the
/// default cost for them.
private func setup() {
    numberFormatter.numberStyle = .spellOut
    numberFormatter.locale = .init(identifier: "ja-JP")

    // Reset the connection-cost cache and its "already parsed" flags as a pair.
    self.ccLines = [[Int: PValue]].init(repeating: [:], count: CIDData.totalCount)
    self.ccParsed = [Bool].init(repeating: false, count: self.ccParsed.count)
    // Forget LOUDS / loudstxt3 data that was read from the previous resource location.
    self.loudses.removeAll()
    self.loudstxts.removeAll()
    self.importedLoudses.removeAll()

    // Character-to-ID table: the n-th character of the file gets ID n.
    do {
        let string = try String(contentsOf: self.requestOptions.dictionaryResourceURL.appendingPathComponent("louds/charID.chid", isDirectory: false), encoding: String.Encoding.utf8)
        charsID = [Character: UInt8].init(uniqueKeysWithValues: string.enumerated().map {($0.element, UInt8($0.offset))})
    } catch {
        debug("Error: louds/charID.chidが存在しません。このエラーは深刻ですが、テスト時には無視できる場合があります。Description: \(error)")
    }

    // Meaning-ID table; falls back to an all-zero table when the file cannot be read.
    let mmURL = requestOptions.dictionaryResourceURL.appendingPathComponent("mm.binary", isDirectory: false)
    do {
        let binaryData = try Data(contentsOf: mmURL, options: [.uncached])
        self.mmValue = binaryData.toArray(of: Float.self).map {PValue($0)}
    } catch {
        debug("Error: mm.binaryが存在しません。このエラーは深刻ですが、テスト時には無視できる場合があります。Description: \(error)")
        self.mmValue = [PValue].init(repeating: .zero, count: self.midCount * self.midCount)
    }

    // Invalidate, then eagerly load, the user and memory dictionaries.
    self.reloadUser()
    _ = self.loadLOUDS(query: "user")
    self.reloadMemory()
    _ = self.loadLOUDS(query: "memory")

    if requestOptions.preloadDictionary {
        self.preloadDictionary()
    }
}
/// Reads every `.louds` and `.loudstxt3` file under `<dictionaryResourceURL>/louds` into memory.
///
/// - `.louds` files are loaded through `LOUDS.load` and stored in `loudses`. A successfully
///   loaded identifier is also recorded in `importedLoudses`; without that, `loadLOUDS` does
///   not see the preloaded instance and reads the same file from disk a second time.
/// - `.loudstxt3` files are cached as raw `Data` in `loudstxts`.
/// - "user" and "memory" are skipped; `setup()` loads them separately.
///
/// NOTE(review): identifiers come from file names, so escaped names such as "[2A]" are stored
/// under that escaped key, while `loadLOUDS` caches under the unescaped query. Those few files
/// are still loaded lazily on first use.
private func preloadDictionary() {
    guard let fileURLs = try? FileManager.default.contentsOfDirectory(
        at: self.requestOptions.dictionaryResourceURL.appendingPathComponent("louds", isDirectory: true),
        includingPropertiesForKeys: nil
    ) else { return }

    for url in fileURLs {
        let identifier = url.deletingPathExtension().lastPathComponent
        switch url.pathExtension {
        case "louds":
            if identifier == "user" || identifier == "memory" {
                continue
            }
            let louds = LOUDS.load(identifier, option: self.requestOptions)
            // Assigning nil removes any stale entry, exactly as before.
            loudses[identifier] = louds
            if louds != nil {
                // Let `loadLOUDS` return the preloaded instance instead of loading again.
                importedLoudses.insert(identifier)
            }
        case "loudstxt3":
            if let data = try? Data(contentsOf: url) {
                loudstxts[identifier] = data
            } else {
                debug("Error: Could not load loudstxt3 file at \(url)")
            }
        default:
            continue
        }
    }
}
/// Messages handled by `sendToDicdataStore(_:)`.
public enum Notification {
/// Replaces the dynamic user dictionary with the given entries.
///
/// Use `importDynamicUserDict` for data that cannot be obtained statically.
/// - warning: Too many dynamic user dictionary entries will damage conversion performance, as the dynamic user dictionary uses inefficient algorithms for looking up. If your entries can be listed up statically, then use normal user dictionaries.
case importDynamicUserDict([DicdataElement])
/// Deprecated alias; handled identically to `importDynamicUserDict`.
@available(*, deprecated, renamed: "importDynamicUserDict", message: "it will be removed in AzooKeyKanaKanjiConverter v1.0")
case importOSUserDict([DicdataElement])
/// Replaces the request options; `setup()` is re-run when the dictionary resource URL differs.
case setRequestOptions(ConvertRequestOptions)
/// Asks the learning manager to forget the candidate's data, then invalidates the "memory" LOUDS.
case forgetMemory(Candidate)
/// Saves learning data and invalidates the cached "memory" and "user" LOUDS.
case closeKeyboard
}
/// Dispatches a `Notification` to the matching internal operation.
///
/// - Parameter data: The notification to handle.
func sendToDicdataStore(_ data: Notification) {
    switch data {
    case .closeKeyboard:
        closeKeyboard()

    case .importOSUserDict(let entries), .importDynamicUserDict(let entries):
        // Store the entries, then tag each one as coming from the user dictionary.
        dynamicUserDict = entries
        dynamicUserDict.mutatingForEach {
            $0.metadata = .isFromUserDictionary
        }

    case .forgetMemory(let candidate):
        learningManager.forgetMemory(data: candidate.data)
        // Drop the cached "memory" LOUDS so it is loaded again on next use.
        reloadMemory()

    case .setRequestOptions(let newOptions):
        // A full setup is only needed when the resource location changes.
        let resourceLocationChanged = newOptions.dictionaryResourceURL != requestOptions.dictionaryResourceURL
        requestOptions = newOptions
        if resourceLocationChanged {
            setup()
        }
        if learningManager.setRequestOptions(newOptions) {
            reloadMemory()
        }
    }
}
/// Returns the ID registered for `character`, or `UInt8.max` when the character is unknown.
func character2charId(_ character: Character) -> UInt8 {
    charsID[character] ?? .max
}
/// Forgets the cached "memory" LOUDS so the next `loadLOUDS(query: "memory")` loads it again.
private func reloadMemory() {
    let key = "memory"
    loudses[key] = nil
    importedLoudses.remove(key)
}
/// Forgets the cached "user" LOUDS so the next `loadLOUDS(query: "user")` loads it again.
private func reloadUser() {
    let key = "user"
    loudses[key] = nil
    importedLoudses.remove(key)
}
/// Saves learning data, then invalidates the cached "memory" and "user" LOUDS.
private func closeKeyboard() {
    learningManager.save()
    // Invalidate only after saving, so later loads pick up what was just written.
    reloadMemory()
    reloadUser()
}
/// Returns the length-dependent penalty `-2 / (number of characters in data.word)`.
@inlinable static func getPenalty(data: borrowing DicdataElement) -> PValue {
    let length = PValue(data.word.count)
    return -2.0 / length
}
/// Decides whether an entry with score `value` should be discarded.
///
/// An entry is discarded when its score is below `threshold`, or when its margin above the
/// threshold is smaller than `2 / wordCount`.
private func shouldBeRemoved(value: PValue, wordCount: Int) -> Bool {
    let margin = value - threshold
    let lengthPenalty: PValue = -2.0 / PValue(wordCount)
    return margin < 0 || lengthPenalty < -margin
}
/// Decides whether `data` should be discarded, using its own score and `getPenalty(data:)`.
///
/// Returns `true` when the score is below `threshold`, or when the margin above the threshold
/// is smaller than the magnitude of the length penalty.
@inlinable func shouldBeRemoved(data: borrowing DicdataElement) -> Bool {
    let margin = data.value() - self.threshold
    if margin < 0 {
        return true
    }
    let lengthPenalty = Self.getPenalty(data: data)
    return lengthPenalty < -margin
}
/// Returns the LOUDS for `query`, loading it on first use.
///
/// Each query is attempted at most once until it is invalidated (`reloadUser` / `reloadMemory`):
/// the query is recorded in `importedLoudses` before loading, so a failed load is not retried
/// and later calls return `nil` from the cache.
///
/// - Parameter query: Usually a single character, or "user" / "memory".
/// - Returns: The loaded LOUDS, or `nil` when no file could be loaded for the query.
func loadLOUDS(query: String) -> LOUDS? {
    if importedLoudses.contains(query) {
        return self.loudses[query]
    }
    // Record the attempt up front so a missing file is not re-read on every call.
    importedLoudses.insert(query)
    // Some ASCII characters are replaced by "[<hex code>]" to build the file identifier.
    let identifier = [
        // Newline (0x0A). The table previously listed only the two-character string
        // backslash + "n", so a real newline query was never escaped.
        "\n": "[0A]",
        // Kept so that a literal backslash + "n" query resolves as it did before.
        "\\n": "[0A]",
        " ": "[20]",
        "\"": "[22]",
        "\'": "[27]",
        "*": "[2A]",
        "+": "[2B]",
        ".": "[2E]",
        "/": "[2F]",
        ":": "[3A]",
        "<": "[3C]",
        ">": "[3E]",
        "\\": "[5C]",
        "|": "[7C]",
    ][query, default: query]

    if let louds = LOUDS.load(identifier, option: self.requestOptions) {
        self.loudses[query] = louds
        return louds
    }
    if identifier == "user" || identifier == "memory" {
        debug("Error: IDが「\(identifier) (query: \(query))」のloudsファイルの読み込みに失敗しましたが、このエラーは深刻ではありません。")
    } else {
        debug("Error: IDが「\(identifier) (query: \(query))」のloudsファイルの読み込みに失敗しました。IDに対する辞書データが存在しないことが想定される場合はこのエラーは深刻ではありませんが、そうでない場合は深刻なエラーの可能性があります。")
    }
    return nil
}
/// Looks up the node that exactly matches `charIDs` in the LOUDS for `query`.
///
/// - Parameters:
///   - query: LOUDS to search (a first character, or "user" / "memory").
///   - charIDs: Character IDs of the key to match.
/// - Returns: A one-element array with the node index, or an empty array when the LOUDS
///   cannot be loaded or contains no matching node.
func perfectMatchingSearch(query: String, charIDs: [UInt8]) -> [Int] {
    guard let louds = self.loadLOUDS(query: query),
          let nodeIndex = louds.searchNodeIndex(chars: charIDs) else {
        return []
    }
    return [nodeIndex]
}
/// Produces lookup keys from up to two sources: the surface text and the typo-correction generator.
///
/// `next()` drains the surface generator first and then the typo-correction generator.
private struct UnifiedGenerator {
    /// Enumerates prefixes of `surface` that start at `range.leftIndex` and end inside `range.rightIndexRange`.
    struct SurfaceGenerator {
        var surface: [Character] = []
        var range: TypoCorrectionGenerator.ProcessRange
        var currentIndex: Int

        init(surface: [Character], range: TypoCorrectionGenerator.ProcessRange) {
            self.surface = surface
            self.range = range
            // Enumeration starts at the smallest allowed end index.
            self.currentIndex = range.rightIndexRange.lowerBound
        }

        /// Shrinks the end-index range so that no key extending past `target` is produced,
        /// provided the surface (from `leftIndex`) actually starts with `target`.
        mutating func setUnreachablePath<C: Collection<Character>>(target: C) where C.Indices == Range<Int> {
            guard surface[range.leftIndex...].hasPrefix(target) else {
                return
            }
            let cap = range.leftIndex + target.indices.upperBound
            let clampedLower = min(range.rightIndexRange.lowerBound, cap)
            let clampedUpper = min(range.rightIndexRange.upperBound, cap)
            range.rightIndexRange = clampedLower ..< clampedUpper
        }

        /// Returns the next prefix with its end index and a penalty of zero, or `nil` when exhausted.
        mutating func next() -> ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? {
            guard surface.indices.contains(currentIndex),
                  currentIndex < range.rightIndexRange.upperBound else {
                return nil
            }
            let endIndex = currentIndex
            currentIndex += 1
            let characters = Array(surface[range.leftIndex ... endIndex])
            return (characters, (.surface(endIndex), 0))
        }
    }

    var typoCorrectionGenerator: TypoCorrectionGenerator? = nil
    var surfaceGenerator: SurfaceGenerator? = nil

    mutating func register(_ generator: TypoCorrectionGenerator) {
        typoCorrectionGenerator = generator
    }

    mutating func register(_ generator: SurfaceGenerator) {
        surfaceGenerator = generator
    }

    /// Forwards the unreachable prefix to both registered generators.
    mutating func setUnreachablePath<C: Collection<Character>>(target: C) where C.Indices == Range<Int> {
        typoCorrectionGenerator?.setUnreachablePath(target: target)
        surfaceGenerator?.setUnreachablePath(target: target)
    }

    /// Returns the next key, preferring the surface generator over the typo-correction generator.
    mutating func next() -> ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? {
        if let fromSurface = surfaceGenerator?.next() {
            return fromSurface
        }
        if let fromTypoCorrection = typoCorrectionGenerator?.next() {
            return fromTypoCorrection
        }
        return nil
    }
}
/// Walks every candidate key produced for the given ranges and collects matching dictionary hits.
///
/// For each key the per-character LOUDS, the "user" LOUDS and (when `useMemory`) the "memory"
/// LOUDS are advanced through a shared `MovingTowardPrefixSearchHelper`. Temporary learning
/// data and the dynamic user dictionary are searched alongside. When no source can match beyond
/// some prefix of the key, that prefix is reported to the generator so longer keys are skipped.
///
/// - Parameters:
///   - composingText: The text being converted.
///   - inputProcessRange: Range over `composingText.input`; `nil` disables the typo-correction generator.
///   - surfaceProcessRange: Range over the katakana form of `composingText.convertTarget`; `nil` disables the surface generator.
///   - useMemory: Whether the "memory" LOUDS is searched.
///   - needTypoCorrection: Passed through to `TypoCorrectionGenerator`.
/// - Returns:
///   - `stringToInfo`: matched keys with their end index and penalty (for duplicate keys the lower penalty wins; on a tie, surface beats input);
///   - `indices`: node indices collected per LOUDS key;
///   - `temporaryMemoryDicdata`: entries from temporary memory and the dynamic user dictionary.
func movingTowardPrefixSearch(
    composingText: ComposingText,
    inputProcessRange: TypoCorrectionGenerator.ProcessRange?,
    surfaceProcessRange: TypoCorrectionGenerator.ProcessRange?,
    useMemory: Bool,
    needTypoCorrection: Bool
) -> (
    stringToInfo: [[Character]: (endIndex: Lattice.LatticeIndex, penalty: PValue)],
    indices: [(key: String, indices: [Int])],
    temporaryMemoryDicdata: [DicdataElement]
) {
    var generator = UnifiedGenerator()
    if let surfaceProcessRange {
        let surfaceGenerator = UnifiedGenerator.SurfaceGenerator(
            surface: Array(composingText.convertTarget.toKatakana()),
            range: surfaceProcessRange
        )
        generator.register(surfaceGenerator)
    }
    if let inputProcessRange {
        let typoCorrectionGenerator = TypoCorrectionGenerator(
            inputs: composingText.input,
            range: inputProcessRange,
            needTypoCorrection: needTypoCorrection
        )
        generator.register(typoCorrectionGenerator)
    }
    var targetLOUDS: [String: LOUDS.MovingTowardPrefixSearchHelper] = [:]
    var stringToInfo: [([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))] = []
    // Hits from temporary memory and the dynamic user dictionary, grouped by depth.
    var dynamicDicdata: [Int: [DicdataElement]] = [:]

    while let (characters, info) = generator.next() {
        guard let firstCharacter = characters.first else {
            continue
        }
        let charIDs = characters.map(self.character2charId(_:))
        let keys: [String] = if useMemory {
            [String(firstCharacter), "user", "memory"]
        } else {
            [String(firstCharacter), "user"]
        }
        var updated = false
        var availableMaxIndex = 0
        for key in keys {
            withMutableValue(&targetLOUDS[key]) { helper in
                if helper == nil, let louds = self.loadLOUDS(query: key) {
                    helper = LOUDS.MovingTowardPrefixSearchHelper(louds: louds)
                }
                guard helper != nil else {
                    return
                }
                let result = helper!.update(target: charIDs)
                updated = updated || result.updated
                availableMaxIndex = max(availableMaxIndex, result.availableMaxIndex)
            }
        }
        // Temporary (not yet persisted) learning data.
        let result = self.learningManager.movingTowardPrefixSearchOnTemporaryMemory(charIDs: consume charIDs)
        updated = updated || !(result.dicdata.isEmpty)
        availableMaxIndex = max(availableMaxIndex, result.availableMaxIndex)
        for (depth, dicdata) in result.dicdata {
            for data in dicdata {
                if info.penalty.isZero {
                    dynamicDicdata[depth, default: []].append(data)
                    // Without this `continue` the same entry was appended a second time below
                    // (as a zero-adjusted copy). The dynamic-user-dictionary loop below already
                    // treats the two cases as exclusive.
                    continue
                }
                let ratio = Self.penaltyRatio[data.lcid]
                let pUnit: PValue = Self.getPenalty(data: data) / 2
                let adjust = pUnit * info.penalty * ratio
                if self.shouldBeRemoved(value: data.value() + adjust, wordCount: data.ruby.count) {
                    continue
                }
                dynamicDicdata[depth, default: []].append(data.adjustedData(adjust))
            }
        }
        if !self.dynamicUserDict.isEmpty {
            // The dynamic user dictionary is matched on the katakana form of the whole key.
            let katakanaString = String(characters).toKatakana()
            let dynamicUserDictResult = self.getMatchDynamicUserDict(katakanaString)
            updated = updated || !dynamicUserDictResult.isEmpty
            for data in dynamicUserDictResult {
                let depth = characters.endIndex
                if info.penalty.isZero {
                    dynamicDicdata[depth, default: []].append(data)
                } else {
                    let ratio = Self.penaltyRatio[data.lcid]
                    let pUnit: PValue = Self.getPenalty(data: data) / 2
                    let adjust = pUnit * info.penalty * ratio
                    if self.shouldBeRemoved(value: data.value() + adjust, wordCount: Array(data.ruby).count) {
                        continue
                    }
                    dynamicDicdata[depth, default: []].append(data.adjustedData(adjust))
                }
            }
        }
        if availableMaxIndex < characters.endIndex - 1 {
            // Nothing matched beyond this prefix, so longer keys sharing it can be skipped.
            generator.setUnreachablePath(target: characters[...(availableMaxIndex + 1)])
        }
        if updated {
            stringToInfo.append((characters, info))
        }
    }
    let minCount = stringToInfo.map {$0.0.count}.min() ?? 0
    return (
        Dictionary(
            stringToInfo,
            uniquingKeysWith: { (lhs, rhs) in
                if lhs.penalty < rhs.penalty {
                    return lhs
                } else if lhs.penalty == rhs.penalty {
                    return switch (lhs.endIndex, rhs.endIndex) {
                    case (.input, .input), (.surface, .surface): lhs // same kind: keep the first
                    case (.surface, .input): lhs // prefer the surface index
                    case (.input, .surface): rhs // prefer the surface index
                    }
                } else {
                    return rhs
                }
            }
        ),
        targetLOUDS.map {
            ($0.key, $0.value.indicesInDepth(depth: minCount - 1 ..< .max))
        },
        dynamicDicdata.flatMap {
            minCount < $0.key + 1 ? $0.value : []
        }
    )
}
/// Collects node indices of entries whose key starts with `charIDs` in the LOUDS for `query`.
///
/// - Parameters:
///   - query: LOUDS to search (a first character, or "user" / "memory").
///   - charIDs: Character IDs of the prefix.
///   - depth: Forwarded to `prefixNodeIndices` as `maxDepth`.
///   - maxCount: Forwarded to `prefixNodeIndices` as `maxCount`.
/// - Returns: The indices reported by the LOUDS, or an empty array when it cannot be loaded.
private func startingFromPrefixSearch(query: String, charIDs: [UInt8], depth: Int = .max, maxCount: Int = .max) -> [Int] {
    if let louds = self.loadLOUDS(query: query) {
        return louds.prefixNodeIndices(chars: charIDs, maxDepth: depth, maxCount: maxCount)
    }
    return []
}
/// Reads the dictionary entries stored at the given node indices.
///
/// Indices are split across files: `index >> 11` selects the file (`identifier` followed by that
/// number) and `index & 2047` is the position inside it. Entries read for "memory" are tagged
/// `.isLearned`; entries read for "user" are tagged `.isFromUserDictionary`.
///
/// - Parameters:
///   - identifier: Dictionary identifier (a first character, or "user" / "memory").
///   - indices: Node indices to read.
/// - Returns: The entries found, in no particular file order.
package func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>) -> [DicdataElement] {
    let indicesByFile: [Int: [Int]] = Dictionary(grouping: indices, by: { $0 >> 11 })
    var entries: [DicdataElement] = []
    for (fileNumber, members) in indicesByFile {
        let fileIdentifier = identifier + "\(fileNumber)"
        let localIndices = members.map { $0 & 2047 }
        entries.append(
            contentsOf: LOUDS.getDataForLoudstxt3(
                fileIdentifier,
                indices: localIndices,
                cache: self.loudstxts[fileIdentifier],
                option: self.requestOptions
            )
        )
    }
    switch identifier {
    case "memory":
        entries.mutatingForEach {
            $0.metadata = .isLearned
        }
    case "user":
        entries.mutatingForEach {
            $0.metadata = .isFromUserDictionary
        }
    default:
        break
    }
    return entries
}
/// Looks up dictionary entries for a span of the composing text and wraps them in lattice nodes.
/// - Parameters:
///   - composingText: The text being converted.
///   - inputRange: Span over `composingText.input`: a start index plus an optional range of end indices.
///   - surfaceRange: Span over `composingText.convertTarget`, in the same form as `inputRange`.
///   - needTypoCorrection: Forwarded to `movingTowardPrefixSearch`.
/// - Returns: One `LatticeNode` per usable entry. Empty when both ranges are `nil` or when a range is inconsistent.
public func lookupDicdata(
composingText: ComposingText,
inputRange:(startIndex: Int, endIndexRange: Range<Int>?)? = nil,
surfaceRange: (startIndex: Int, endIndexRange: Range<Int>?)? = nil,
needTypoCorrection: Bool = true
) -> [LatticeNode] {
// Normalise the input range: the end-index range is capped at `startIndex + maxlength`.
let inputProcessRange: TypoCorrectionGenerator.ProcessRange?
if let inputRange {
let toInputIndexLeft = inputRange.endIndexRange?.startIndex ?? inputRange.startIndex
let toInputIndexRight = min(
inputRange.endIndexRange?.endIndex ?? composingText.input.count,
inputRange.startIndex + self.maxlength
)
if inputRange.startIndex > toInputIndexLeft || toInputIndexLeft >= toInputIndexRight {
debug(#function, "index is wrong")
return []
}
inputProcessRange = .init(leftIndex: inputRange.startIndex, rightIndexRange: toInputIndexLeft ..< toInputIndexRight)
} else {
inputProcessRange = nil
}
// Same normalisation for the surface range.
let surfaceProcessRange: TypoCorrectionGenerator.ProcessRange?
if let surfaceRange {
let toSurfaceIndexLeft = surfaceRange.endIndexRange?.startIndex ?? surfaceRange.startIndex
let toSurfaceIndexRight = min(
surfaceRange.endIndexRange?.endIndex ?? composingText.convertTarget.count,
surfaceRange.startIndex + self.maxlength
)
if surfaceRange.startIndex > toSurfaceIndexLeft || toSurfaceIndexLeft >= toSurfaceIndexRight {
debug(#function, "index is wrong")
return []
}
surfaceProcessRange = .init(leftIndex: surfaceRange.startIndex, rightIndexRange: toSurfaceIndexLeft ..< toSurfaceIndexRight)
} else {
surfaceProcessRange = nil
}
if inputProcessRange == nil && surfaceProcessRange == nil {
debug(#function, "either of inputProcessRange and surfaceProcessRange must not be nil")
return []
}
// MARK: Search the LOUDS tries, temporary memory and the dynamic user dictionary
var (stringToInfo, indices, dicdata) = self.movingTowardPrefixSearch(
composingText: composingText,
inputProcessRange: inputProcessRange,
surfaceProcessRange: surfaceProcessRange,
useMemory: self.learningManager.enabled,
needTypoCorrection: needTypoCorrection
)
// MARK: Resolve the collected indices into entries, applying the typo penalty where one applies
for (identifier, value) in indices {
let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
let rubyArray = Array(data.ruby)
// A ruby with no recorded info is treated as having no penalty.
let penalty = stringToInfo[rubyArray]?.penalty ?? 0
if penalty.isZero {
return data
}
let ratio = Self.penaltyRatio[data.lcid]
let pUnit: PValue = Self.getPenalty(data: data) / 2 // half of the length penalty per unit of typo penalty
let adjust = pUnit * penalty * ratio
if self.shouldBeRemoved(value: data.value() + adjust, wordCount: rubyArray.count) {
return nil
}
return data.adjustedData(adjust)
}
dicdata.append(contentsOf: result)
}
if let inputProcessRange {
// segments[k] is the katakana form of input[leftIndex ... leftIndex + k], built cumulatively.
let segments = (inputProcessRange.leftIndex ..< inputProcessRange.rightIndexRange.endIndex).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
segments.append((segments.last ?? "") + String(composingText.input[rightIndex].character.toKatakana()))
}
for i in inputProcessRange.rightIndexRange {
do {
let result = self.getWiseDicdata(
convertTarget: segments[i - inputProcessRange.leftIndex],
inputData: composingText,
inputRange: inputProcessRange.leftIndex ..< i + 1
)
// Register the generated entries so the node-building step below can find their end index.
for item in result {
stringToInfo[Array(item.ruby)] = (.input(i), 0)
}
dicdata.append(contentsOf: result)
}
}
}
// Nodes that start at index zero get a BOS node as predecessor.
let needBOS = inputRange?.startIndex == .zero || surfaceRange?.startIndex == .zero
let result: [LatticeNode] = dicdata.compactMap {
// Entries whose ruby has no recorded end index are dropped.
guard let endIndex = stringToInfo[Array($0.ruby)]?.endIndex else {
return nil
}
// NOTE(review): the force-unwraps assume an `.input` end index only occurs when `inputRange` was given
// (and likewise for `.surface`) — confirm against `TypoCorrectionGenerator`.
let range: Lattice.LatticeRange = switch endIndex {
case .input(let endIndex): .input(from: (inputRange?.startIndex)!, to: endIndex + 1)
case .surface(let endIndex): .surface(from: (surfaceRange?.startIndex)!, to: endIndex + 1)
}
let node = LatticeNode(data: $0, range: range)
if needBOS {
node.prevs.append(RegisteredNode.BOSNode())
}
return node
}
return result
}
/// Loads the zero-hint prediction entries for the given right class ID from `p/pc_<lastRcid>.csv`.
///
/// Each non-empty line is split on "," and parsed by `parseLoudstxt2FormattedEntry`, which reads
/// columns 0 through 5. Lines with fewer than six columns are skipped with a debug log;
/// previously such a line caused an out-of-range trap.
///
/// - Parameter lastRcid: Right class ID that selects the CSV file.
/// - Returns: The parsed entries, or an empty array when the file cannot be read.
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {
    let requiredColumnCount = 6
    do {
        let csvString = try String(contentsOf: requestOptions.dictionaryResourceURL.appendingPathComponent("p/pc_\(lastRcid).csv", isDirectory: false), encoding: .utf8)
        return csvString.split(separator: "\n").compactMap { (line) -> DicdataElement? in
            let columns = line.split(separator: ",", omittingEmptySubsequences: false)
            guard columns.count >= requiredColumnCount else {
                debug(#function, "skipped a malformed line with \(columns.count) column(s)")
                return nil
            }
            return self.parseLoudstxt2FormattedEntry(from: columns)
        }
    } catch {
        debug("Error: 右品詞ID\(lastRcid)のためのZero Hint Predictionのためのデータの読み込みに失敗しました。このエラーは深刻ですが、テスト時には無視できる場合があります。 Description: \(error.localizedDescription)")
        return []
    }
}
/// Collects prediction candidates whose key starts with `key`.
///
/// Searches the LOUDS for the key's first character (keeping only entries whose `rcid` is marked
/// usable in `predictionUsable`), then the "user" LOUDS, and, when learning is enabled, the
/// "memory" LOUDS and temporary learning data.
///
/// - Parameter key: The prefix to search for.
/// - Returns: All collected entries; empty when `key` is empty.
func getPredictionLOUDSDicdata(key: some StringProtocol) -> [DicdataElement] {
    guard let headCharacter = key.first else {
        return []
    }
    // At most 700 node indices are requested from each LOUDS.
    let maxCount = 700
    let first = String(headCharacter)
    let charIDs = key.map(self.character2charId)
    // Short keys limit the search depth: 3 for one character, 5 for two, unlimited otherwise.
    let depth: Int
    switch key.count {
    case 1: depth = 3
    case 2: depth = 5
    default: depth = .max
    }

    var collected: [DicdataElement] = []

    let systemIndices = self.startingFromPrefixSearch(query: first, charIDs: charIDs, depth: depth, maxCount: maxCount)
    let systemEntries = self.getDicdataFromLoudstxt3(identifier: first, indices: Set(systemIndices))
    collected.append(contentsOf: systemEntries.filter { Self.predictionUsable[$0.rcid] })

    let userIndices = self.startingFromPrefixSearch(query: "user", charIDs: charIDs, maxCount: maxCount)
    collected.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: "user", indices: Set(userIndices)))

    if learningManager.enabled {
        let memoryIndices = self.startingFromPrefixSearch(query: "memory", charIDs: charIDs, maxCount: maxCount)
        collected.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: "memory", indices: Set(memoryIndices)))
        collected.append(contentsOf: self.learningManager.temporaryPrefixMatch(charIDs: charIDs))
    }
    return collected
}
/// Builds a `DicdataElement` from six columns: ruby, word, lcid, rcid, mid, value.
///
/// Fallbacks: an empty word column means "same as ruby"; an unparsable lcid or mid becomes 0;
/// an unparsable rcid becomes the lcid; an unparsable value becomes -30.
/// The caller must supply at least six columns.
private func parseLoudstxt2FormattedEntry(from dataString: [some StringProtocol]) -> DicdataElement {
    let ruby = String(dataString[0])
    let wordColumn = dataString[1]
    let word: String
    if wordColumn.isEmpty {
        word = ruby
    } else {
        word = String(wordColumn)
    }
    let lcid = Int(dataString[2]) ?? .zero
    let rcid = Int(dataString[3]) ?? lcid
    let mid = Int(dataString[4]) ?? .zero
    let value: PValue = PValue(dataString[5]) ?? -30.0
    return DicdataElement(word: word, ruby: ruby, lcid: lcid, rcid: rcid, mid: mid, value: value)
}
/// Generates entries that are not read from dictionary files: numbers, raw alphabet, kana and related symbols.
/// - Parameters:
///   - convertTarget: The string (katakana form of the input span) to generate entries for.
///   - inputData: The whole composing text; used to inspect neighbouring characters and input styles.
///   - inputRange: The span of `inputData.input` that `convertTarget` corresponds to.
/// - Returns: The generated entries; may be empty.
/// - Note: NOTE(review): every `CIDData..cid` / `MIDData..mid` below is missing its member name in this copy of
///   the file (it looks like non-ASCII identifiers were lost in extraction); restore them from the upstream source.
func getWiseDicdata(convertTarget: String, inputData: ComposingText, inputRange: Range<Int>) -> [DicdataElement] {
var result: [DicdataElement] = []
result.append(contentsOf: self.getJapaneseNumberDicdata(head: convertTarget))
// Plain integers: only when the span is not adjacent to another digit on either side.
if inputData.input[..<inputRange.startIndex].last?.character.isNumber != true && inputData.input[inputRange.endIndex...].first?.character.isNumber != true, let number = Int(convertTarget) {
result.append(DicdataElement(ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -14))
// Spelled-out form from `numberFormatter`, limited to |number| <= 1e12.
if Double(number) <= 1E12 && -1E12 <= Double(number), let kansuji = self.numberFormatter.string(from: NSNumber(value: number)) {
result.append(DicdataElement(word: kansuji, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -16))
}
}
// English keyboard: offer convertTarget itself when it consists only of Roman letters.
if requestOptions.keyboardLanguage == .en_US && convertTarget.onlyRomanAlphabet {
result.append(DicdataElement(ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: -14))
}
// Non-English keyboard with an all-roman2kana span: offer the kana from the Roman2Kana tables.
if requestOptions.keyboardLanguage != .en_US && inputData.input[inputRange].allSatisfy({$0.inputStyle == .roman2kana}) {
let roman = String(inputData.input[inputRange].map(\.character))
if let katakana = Roman2Kana.katakanaChanges[roman], let hiragana = Roman2Kana.hiraganaChanges[Array(roman)] {
result.append(DicdataElement(word: String(hiragana), ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -13))
result.append(DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14))
}
}
// Single character: offer it as-is, or as a hiragana/katakana pair.
if convertTarget.count == 1 {
let katakana = convertTarget.toKatakana()
let hiragana = convertTarget.toHiragana()
if convertTarget == katakana && katakana == hiragana {
// Kana conversion leaves the character unchanged, so it is not kana: one entry only.
let element = DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14)
result.append(element)
} else {
// Kana: hiragana (-13) is ranked above katakana (-14).
let hiraganaElement = DicdataElement(word: hiragana, ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -13)
let katakanaElement = DicdataElement(ruby: katakana, cid: CIDData..cid, mid: MIDData..mid, value: -14)
result.append(hiraganaElement)
result.append(katakanaElement)
}
}
// Single character: width variants and weakly related symbols, each 5 points below the previous one.
if convertTarget.count == 1, let first = convertTarget.first {
var value: PValue = -14
let hs = Self.fullwidthToHalfwidth[first, default: first]
if hs != first {
result.append(DicdataElement(word: convertTarget, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
result.append(DicdataElement(word: String(hs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
}
if let fs = Self.halfwidthToFullwidth[first], fs != first {
result.append(DicdataElement(word: convertTarget, ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
result.append(DicdataElement(word: String(fs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
}
// Groups are matched on the half-width form of the character.
for group in Self.weakRelatingSymbolGroups where group.contains(hs) {
for symbol in group where symbol != hs {
result.append(DicdataElement(word: String(symbol), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
if let fs = Self.halfwidthToFullwidth[symbol] {
result.append(DicdataElement(word: String(fs), ruby: convertTarget, cid: CIDData..cid, mid: MIDData..mid, value: value))
value -= 5.0
}
}
}
}
return result
}
//
private static let (fullwidthToHalfwidth, halfwidthToFullwidth) = zip(
"+ー*=・!#%&'"〜|£$¥@`;:<>,.\/_ ̄-",
"+ー*=・!#%&'"〜|£$¥@`;:<>,.\/_ ̄-".applyingTransform(.fullwidthToHalfwidth, reverse: false)!
)
.reduce(into: ([Character: Character](), [Character: Character]())) { (results: inout ([Character: Character], [Character: Character]), values: (Character, Character)) in
results.0[values.0] = values.1
results.1[values.1] = values.0
}
// ()
// strongRelatingSymbolGroups
//
// 1
//
private static let weakRelatingSymbolGroups: [[Character]] = [
// ()
["", ""], //
["", "", "", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", "辻󠄀"],
["禰󠄀", ""],
["煉󠄁", ""],
["", ""], //
["", ""],
["", "𠮷"], //
["", "𣘺", "", "𫞎"],
["", "", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", ""],
["", ""],
//
["", "", "", "", ""], //
["^", ""], //
["¥", "$", "¢", "", "£", ""], //
["%", ""], //
["°", "", ""],
[""], //
["*", "", "✳︎", "✴︎"], //
["", "", "", ""],
["+", "±", ""],
["×", "", "✖️"],
["÷", "" ],
["<", "", "", "", "", "", "«"],
[">", "", "", "", "", "", "»"],
["=", "", "", ""],
[":", ";"],
["!", "❗️", "❣️", "‼︎", "⁉︎", "", "‼️", "⁉️", "¡"],
["?", "", "⁉︎", "", "", "⁉️", "¿"],
["", "", "", "☎︎"],
["", "", "", "", "", "", "", ""],
["", "", "", ""], //
["", "", "", "", "", "", "", "", "", "", "⚪︎", ""], //
["", "", "", "", "↙︎", "↖︎", "↘︎", "↗︎", "↔︎", "↕︎", "↪︎", "↩︎", ""], //
["", "", "", "", "", "", "", "𝄞", "𝄞"], //
["", "", ""] //
]
/// Reads a connection-cost file and reinterprets its bytes as `(Int32, Float)` pairs.
///
/// - Parameter url: Location of the binary file.
/// - Returns: The decoded pairs, or an empty array (after a debug log) when the file cannot be read.
private func loadCCBinary(url: URL) -> [(Int32, Float)] {
    let rawBytes: Data
    do {
        rawBytes = try Data(contentsOf: url, options: [.uncached])
    } catch {
        debug("Error: 品詞連接コストデータの読み込みに失敗しました。このエラーは深刻ですが、テスト時には無視できる場合があります。 Description: \(error.localizedDescription)")
        return []
    }
    return rawBytes.toArray(of: (Int32, Float).self)
}
/// Returns the dynamic user dictionary entries whose ruby is exactly `ruby`.
func getMatchDynamicUserDict(_ ruby: some StringProtocol) -> [DicdataElement] {
    self.dynamicUserDict.filter { entry in
        entry.ruby == ruby
    }
}
/// Returns the dynamic user dictionary entries whose ruby starts with `ruby`.
func getPrefixMatchDynamicUserDict(_ ruby: some StringProtocol) -> [DicdataElement] {
    self.dynamicUserDict.filter { entry in
        entry.ruby.hasPrefix(ruby)
    }
}
/// Feeds a committed candidate to the learning manager.
///
/// When `previous` is given it is prepended to the candidate's data before the update.
// TODO: reconsider how `previous` is handled (the original TODO text is incomplete in this copy).
func updateLearningData(_ candidate: Candidate, with previous: DicdataElement?) {
    guard let previous else {
        self.learningManager.update(data: candidate.data)
        return
    }
    self.learningManager.update(data: [previous] + candidate.data)
}
/// Feeds a committed post-composition prediction to the learning manager.
///
/// - `.additional`: the candidate's data is kept and the additional data is the updated part.
/// - `.replacement`: the last `targetData.count` elements of the candidate's data are dropped and
///   the replacement data is the updated part.
// TODO: reconsider how `previous` is handled (the original TODO text is incomplete in this copy).
func updateLearningData(_ candidate: Candidate, with predictionCandidate: PostCompositionPredictionCandidate) {
    switch predictionCandidate.type {
    case let .additional(data: appended):
        self.learningManager.update(data: candidate.data, updatePart: appended)
    case let .replacement(targetData: replaced, replacementData: substitute):
        let kept = candidate.data.dropLast(replaced.count)
        self.learningManager.update(data: kept, updatePart: substitute)
    }
}
/// Returns the connection cost between two class IDs.
///
/// The costs for `former` are read from `cb/<former>.binary` on first use and cached.
/// - Parameters:
///   - former: Class ID of the preceding element.
///   - latter: Class ID of the following element.
/// - Returns: The stored cost for `latter`; if absent, the value stored under key `-1`;
///   if that is absent too, `-25`.
public func getCCValue(_ former: Int, _ latter: Int) -> PValue {
    if !ccParsed[former] {
        let binaryURL = requestOptions.dictionaryResourceURL.appendingPathComponent("cb/\(former).binary", isDirectory: false)
        let pairs = loadCCBinary(url: binaryURL).map { (Int($0.0), PValue($0.1)) }
        ccLines[former] = [Int: PValue].init(uniqueKeysWithValues: pairs)
        ccParsed[former] = true
    }
    let row = ccLines[former]
    let fallback = row[-1] ?? -25
    return row[latter] ?? fallback
}
/// Returns the value stored for a pair of meaning IDs.
/// - Parameters:
///   - former: Meaning ID of the preceding element.
///   - latter: Meaning ID of the following element.
/// - Returns: `0` when either ID is 500; otherwise `mmValue[former * midCount + latter]`.
public func getMMValue(_ former: Int, _ latter: Int) -> PValue {
    guard former != 500, latter != 500 else {
        return 0
    }
    let offset = former * self.midCount + latter
    return self.mmValue[offset]
}
/*
*
*
*
*
*
*
*/
/// Decides, from two adjacent class IDs, whether a clause boundary lies between them.
/// - Parameters:
///   - former: Class ID of the preceding element.
///   - latter: Class ID of the following element.
/// - Returns: `true` only when neither side is BOS/EOS (type 3), the latter is a preposition or
///   core word (type 0 or 1), and the former is not a preposition (type 0).
@inlinable static func isClause(_ former: Int, _ latter: Int) -> Bool {
    let latterType = Self.wordTypes[latter]
    if latterType == 3 {
        return false
    }
    let formerType = Self.wordTypes[former]
    if formerType == 3 {
        return false
    }
    switch latterType {
    case 0, 1:
        return formerType != 0
    default:
        return false
    }
}
/// Class IDs that `_judgeWordType` maps to 3 (BOS/EOS). Used only to build `wordTypes`.
private static let BOS_EOS_wordIDs: Set<Int> = [CIDData.BOS.cid, CIDData.EOS.cid]
/// Class IDs that `_judgeWordType` maps to 0 (preposition). Used only to build `wordTypes`.
private static let PREPOSITION_wordIDs: Set<Int> = [1315, 6, 557, 558, 559, 560]
/// Class IDs that `_judgeWordType` maps to 1 (core). Used only to build `wordTypes`.
private static let INPOSITION_wordIDs: Set<Int> = Set<Int>(
Array(561..<868).chained(1283..<1297).chained(1306..<1310).chained(11..<53).chained(555..<557).chained(1281..<1283)
).union([1314, 3, 2, 4, 5, 1, 9])
/*
private static let POSTPOSITION_wordIDs: Set<Int> = Set<Int>((7...8).map{$0}
+ (54..<555).map{$0}
+ (868..<1281).map{$0}
+ (1297..<1306).map{$0}
+ (1310..<1314).map{$0}
).union([10])
*/
/// Word type for every class ID in `0...1319`, precomputed with `_judgeWordType`.
///
/// Each element is one of:
/// - 3 when BOS/EOS
/// - 0 when preposition
/// - 1 when core
/// - 2 when postposition
///
/// The table holds 1320 `UInt8` values.
public static let wordTypes = (0...1319).map(_judgeWordType)
/// Classifies a class ID; used to build `wordTypes`.
///
/// - Returns: 3 for BOS/EOS, 0 for preposition, 1 for core, 2 for everything else (postposition).
private static func _judgeWordType(cid: Int) -> UInt8 {
    return if Self.BOS_EOS_wordIDs.contains(cid) {
        3
    } else if Self.PREPOSITION_wordIDs.contains(cid) {
        0
    } else if Self.INPOSITION_wordIDs.contains(cid) {
        1
    } else {
        2
    }
}
/// Tells whether `data` takes part in the meaning-ID (`mid`) calculation.
///
/// Returns `true` when either class ID lies in `895...1280` or `1297...1305`, or when either
/// class ID has word type 1 (core) in `wordTypes`.
@inlinable static func includeMMValueCalculation(_ data: DicdataElement) -> Bool {
    let includedRanges = [895...1280, 1297...1305]
    for range in includedRanges where range.contains(data.lcid) || range.contains(data.rcid) {
        return true
    }
    // Otherwise only core words are included.
    return wordTypes[data.lcid] == 1 || wordTypes[data.rcid] == 1
}
/// Typo-penalty multiplier for every left class ID in `0...1319`, precomputed with `_getTypoPenaltyRatio`.
/// NOTE(review): the original size note is garbled in this copy (it appears to estimate about 2.6 KB in total) — confirm against the size of `PValue`.
static let penaltyRatio = (0...1319).map(_getTypoPenaltyRatio)
/// Computes one element of `penaltyRatio`.
///
/// - Returns: `2.5` for class IDs in `147...554`, and `1` for all others.
static func _getTypoPenaltyRatio(_ lcid: Int) -> PValue {
    (147...554).contains(lcid) ? 2.5 : 1
}
/// For every right class ID in `0...1319`, whether entries ending in that class may be offered as
/// prediction candidates. Precomputed with `_getPredictionUsable`; read by `getPredictionLOUDSDicdata`.
static let predictionUsable = (0...1319).map(_getPredictionUsable)
/// Computes one element of `predictionUsable`.
///
/// A right class ID is unusable for prediction when it appears in any of the groups below.
/// The original comments say each list was generated with
/// `cat cid.txt | grep <pattern> | awk '{print $1}' | xargs -I {} echo -n "{}, "`;
/// the grep patterns (one per group) are missing from this copy of the file.
///
/// - Returns: `false` when `rcid` is listed in any group, `true` otherwise.
static func _getPredictionUsable(_ rcid: Int) -> Bool {
    let excludedGroups: [[Int]] = [
        // Group 1
        [33, 34, 50, 86, 87, 88, 103, 127, 128, 144, 397, 398, 408, 426, 427, 450, 457, 480, 687, 688, 703, 704, 727, 742, 750, 758, 766, 786, 787, 798, 810, 811, 829, 830, 831, 893, 973, 974, 975, 976, 977, 1007, 1008, 1009, 1010, 1063, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1240, 1241, 1242, 1243, 1268, 1269, 1270, 1271],
        // Group 2
        [15, 16, 17, 18, 41, 42, 59, 60, 61, 62, 63, 64, 94, 95, 109, 110, 111, 112, 135, 136, 379, 380, 381, 382, 402, 412, 413, 442, 443, 471, 472, 562, 572, 582, 591, 598, 618, 627, 677, 678, 693, 694, 709, 710, 722, 730, 737, 745, 753, 761, 770, 771, 791, 869, 878, 885, 896, 906, 917, 918, 932, 948, 949, 950, 951, 952, 987, 988, 989, 990, 1017, 1018, 1033, 1034, 1035, 1036, 1058, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1212, 1213, 1214, 1215],
        // Group 3
        [372, 406, 418, 419, 431, 437, 438, 455, 462, 463, 464, 495, 496, 504, 533, 534, 540, 551, 567, 577, 587, 595, 606, 614, 622, 630, 641, 647, 653, 659, 665, 672, 683, 684, 699, 700, 715, 716, 725, 733, 740, 748, 756, 764, 780, 781, 794, 806, 807, 823, 824, 825, 837, 842, 847, 852, 859, 865, 873, 881, 890, 901, 911, 925, 935, 963, 964, 965, 966, 967, 999, 1000, 1001, 1002, 1023, 1024, 1045, 1046, 1047, 1048, 1061, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1224, 1225, 1226, 1227, 1260, 1261, 1262, 1263, 1278],
        // Group 4
        [420, 421, 631, 782, 783, 795, 891, 936, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1228, 1229, 1230, 1231],
        // Group 5
        [25, 26, 46, 74, 75, 76, 99, 119, 120, 140, 389, 390, 405, 416, 417, 447, 476, 493, 494, 566, 576, 585, 594, 603, 621, 629, 671, 681, 682, 697, 698, 713, 714, 724, 732, 739, 747, 755, 763, 778, 779, 793, 804, 805, 820, 821, 822, 872, 880, 889, 900, 910, 923, 924, 934, 958, 959, 960, 961, 962, 995, 996, 997, 998, 1021, 1022, 1041, 1042, 1043, 1044, 1060, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1220, 1221, 1222, 1223, 1256, 1257, 1258, 1259],
        // Group 6
        [27, 28, 47, 77, 78, 79, 100, 121, 122, 141, 391, 392, 448, 477, 604],
        // Group 7
        [404, 564, 565, 574, 575, 600, 601, 620, 774, 775, 776, 777, 871, 887, 888, 898, 899, 908, 909, 921, 922, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129],
        // Group 8
        [13, 14, 40, 56, 57, 58, 93, 107, 108, 134, 369, 377, 378, 401, 410, 411, 433, 434, 441, 452, 470, 483, 489, 490, 527, 528, 537, 542, 548, 561, 571, 581, 590, 597, 611, 617, 626, 636, 638, 644, 650, 656, 662, 668, 675, 676, 691, 692, 707, 708, 721, 729, 736, 744, 752, 760, 768, 769, 790, 800, 801, 814, 815, 816, 835, 840, 845, 850, 855, 862, 868, 877, 884, 895, 905, 915, 916, 931, 941, 943, 944, 945, 946, 947, 983, 984, 985, 986, 1015, 1016, 1029, 1030, 1031, 1032, 1057, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1208, 1209, 1210, 1211, 1248, 1249, 1250, 1251, 1276],
        // Group 9
        [373, 553, 569, 579, 589, 596, 609, 624, 634, 642, 648, 654, 660, 666, 673, 860, 866, 875, 903, 913, 928, 929, 939],
    ]
    return !excludedGroups.contains { group in
        group.contains(rcid)
    }
}
/// Tells whether the learning data should keep a value for `data`, based on its left class ID.
///
/// Returns `false` for left class IDs in `147...554`, `557...560`, `1297...1305` and `6...9`,
/// and for the IDs `0` and `1316`; returns `true` for everything else.
@inlinable static func needWValueMemory(_ data: DicdataElement) -> Bool {
    switch data.lcid {
    case 147...554, 557...560, 1297...1305, 6...9, 0, 1316:
        return false
    default:
        return true
    }
}
static let possibleNexts: [String: [String]] = [
"x": ["", "", "", "", "", "", "", "", "", ""],
"l": ["", "", "", "", "", "", "", "", "", ""],
"xt": [""],
"lt": [""],
"xts": [""],
"lts": [""],
"xy": ["", "", ""],
"ly": ["", "", ""],
"xw": [""],
"lw": [""],
"v": [""],
"k": ["", "", "", "", ""],
"q": ["クァ", "クィ", "クゥ", "クェ", "クォ"],
"qy": ["クャ", "クィ", "クュ", "クェ", "クョ"],
"qw": ["クヮ", "クィ", "クゥ", "クェ", "クォ"],
"ky": ["キャ", "キィ", "キュ", "キェ", "キョ"],
"g": ["", "", "", "", ""],
"gy": ["ギャ", "ギィ", "ギュ", "ギェ", "ギョ"],
"s": ["", "", "", "", ""],
"sy": ["シャ", "シィ", "シュ", "シェ", "ショ"],
"sh": ["シャ", "シィ", "シュ", "シェ", "ショ"],
"z": ["", "", "", "", ""],
"zy": ["ジャ", "ジィ", "ジュ", "ジェ", "ジョ"],
"j": [""],
"t": ["", "", "", "", ""],
"ty": ["チャ", "チィ", "チュ", "チェ", "チョ"],
"ts": [""],
"th": ["テャ", "ティ", "テュ", "テェ", "テョ"],
"tw": ["トァ", "トィ", "トゥ", "トェ", "トォ"],
"cy": ["チャ", "チィ", "チュ", "チェ", "チョ"],
"ch": [""],
"d": ["", "", "", "", ""],
"dy": ["ヂャ", "ヂィ", "ヂュ", "ヂェ", "ヂョ"],
"dh": ["デャ", "ディ", "デュ", "デェ", "デョ"],
"dw": ["ドァ", "ドィ", "ドゥ", "ドェ", "ドォ"],
"n": ["", "", "", "", "", ""],
"ny": ["ニャ", "ニィ", "ニュ", "ニェ", "ニョ"],
"h": ["", "", "", "", ""],
"hy": ["ヒャ", "ヒィ", "ヒュ", "ヒェ", "ヒョ"],
"hw": ["ファ", "フィ", "フェ", "フォ"],
"f": [""],
"b": ["", "", "", "", ""],
"by": ["ビャ", "ビィ", "ビュ", "ビェ", "ビョ"],
"p": ["", "", "", "", ""],
"py": ["ピャ", "ピィ", "ピュ", "ピェ", "ピョ"],
"m": ["", "", "", "", ""],
"my": ["ミャ", "ミィ", "ミュ", "ミェ", "ミョ"],
"y": ["", "", "イェ", ""],
"r": ["", "", "", "", ""],
"ry": ["リャ", "リィ", "リュ", "リェ", "リョ"],
"w": ["", "ウィ", "ウェ", ""],
"wy": ["", ""]
]
}