[Experimental] Zenzai (#92)

* experimental rinna integration

* Update impl

* update

* Bump swift-actions/setup-swift from 1 to 2

Bumps [swift-actions/setup-swift](https://github.com/swift-actions/setup-swift) from 1 to 2.
- [Release notes](https://github.com/swift-actions/setup-swift/releases)
- [Commits](https://github.com/swift-actions/setup-swift/compare/v1...v2)

---
updated-dependencies:
- dependency-name: swift-actions/setup-swift
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* change test

* change impl

* take gpt2 weight as option

* don't use async

* support status check

* enhance error

* avoid percent encode

* update

* GPT-2 based kana-kanji conversion is now working perfectly

* fix a bug

* Rename gpt2/llama -> zenz

* cleanup

* clean up internal API

* cleanup experimental commands

* update

* partially support incremental input using cache

* fix names

* fix bug

* support roman2kana

* cleanup

* fix minor bugs

* improve logic

* fix minor bug

* fix minor bug

* fix minor bug

* optimize

* optimize performance

* Optimize cache hit

* cli: add anco session command

* fix cache hit bugs

* improve session commands

* maybe this will work better in incremental input environments

* speed up zenzai by using n_best alternatives

* update zenz context

* adding no_typo api

* add inference limit

* fix bug

* reset install_cli

* make package buildable -- but llama.cpp features do not work at this point because Metal is not preprocessed

* add proper availability checks

* change macOS minimum version

* fix several problems

* code cleanup

* enable ubuntu build

* fix build error

* fix ubuntu build

* fix borrowing

* update install_cli.sh

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Author: Miwa
Date: 2024-05-15 01:36:45 +09:00 (committed by GitHub)
Parent: c4aa3eee76
Commit: 55ffe3c708
20 changed files with 942 additions and 108 deletions


@ -9,12 +9,30 @@ on:
branches: [ "main", "develop" ]
jobs:
build:
macos-build:
name: Swift ${{ matrix.swift-version }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
os: [macos-latest]
swift-version: ["5.9", "5.10"]
steps:
- uses: swift-actions/setup-swift@v2
with:
swift-version: ${{ matrix.swift-version }}
- uses: actions/checkout@v4
with:
submodules: true
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -Xcxx -xobjective-c++ -v
- name: Run tests
run: swift test -c release -Xswiftc -strict-concurrency=complete -Xcxx -xobjective-c++ -v
ubuntu-build:
name: Swift ${{ matrix.swift-version }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
swift-version: ["5.9", "5.10"]
steps:
- uses: swift-actions/setup-swift@v2
@ -27,3 +45,4 @@ jobs:
run: swift build -Xswiftc -strict-concurrency=complete -v
- name: Run tests
run: swift test -c release -Xswiftc -strict-concurrency=complete -v

.gitignore (vendored): 1 line changed

@ -17,3 +17,4 @@ Package.resolved
*.pyc
.docc-build
.vscode
*.gguf


@ -13,9 +13,10 @@ let swiftSettings: [SwiftSetting] = [
.enableUpcomingFeature("DisableOutwardActorInference"),
.enableUpcomingFeature("ImportObjcForwardDeclarations")
]
let package = Package(
name: "AzooKeyKanakanjiConverter",
platforms: [.iOS(.v14), .macOS(.v11)],
platforms: [.iOS(.v14), .macOS(.v12)],
products: [
// Products define the executables and libraries a package produces, and make them visible to other packages.
.library(
@ -39,6 +40,8 @@ let package = Package(
.package(url: "https://github.com/apple/swift-algorithms", from: "1.0.0"),
.package(url: "https://github.com/apple/swift-collections", from: "1.0.0"),
.package(url: "https://github.com/apple/swift-argument-parser", .upToNextMajor(from: "1.0.0")),
// local package
.package(url: "https://github.com/ensan-hcl/llama.cpp", branch: "9f41923"),
],
targets: [
// Targets are the basic building blocks of a package. A target can define a module or a test suite.
@ -54,9 +57,9 @@ let package = Package(
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils"
"SwiftUtils",
.product(name: "llama", package: "llama.cpp")
],
resources: [],
swiftSettings: swiftSettings
),
.target(


@ -2,10 +2,10 @@ import KanaKanjiConverterModuleWithDefaultDictionary
import ArgumentParser
@main
public struct Anco: ParsableCommand {
public struct Anco: AsyncParsableCommand {
public static var configuration = CommandConfiguration(
abstract: "Anco is A(zooKey) Kana-Ka(n)ji (co)nverter",
subcommands: [Subcommands.Run.self, Subcommands.Dict.self, Subcommands.Evaluate.self],
subcommands: [Subcommands.Run.self, Subcommands.Dict.self, Subcommands.Evaluate.self, Subcommands.Session.self],
defaultSubcommand: Subcommands.Run.self
)


@ -13,10 +13,14 @@ extension Subcommands {
var configNBest: Int = 10
@Flag(name: [.customLong("stable")], help: "Report only stable properties; timestamps and values will not be reported.")
var stable: Bool = false
@Option(name: [.customLong("zenz")], help: "gguf format model weight for zenz.")
var zenzWeightPath: String = ""
@Option(name: [.customLong("config_zenzai_inference_limit")], help: "inference limit for zenzai.")
var configZenzaiInferenceLimit: Int = .max
static var configuration = CommandConfiguration(commandName: "evaluate", abstract: "Evaluate quality of Conversion for input data.")
func parseInputFile() throws -> [InputItem] {
private func parseInputFile() throws -> [InputItem] {
let url = URL(fileURLWithPath: self.inputFile)
let lines = (try String(contentsOf: url)).split(separator: "\n", omittingEmptySubsequences: false)
return lines.enumerated().compactMap { (index, line) -> InputItem? in
@ -33,14 +37,15 @@ extension Subcommands {
@MainActor mutating func run() throws {
let inputItems = try parseInputFile()
let requestOptions = requestOptions()
let converter = KanaKanjiConverter()
let start = Date()
var resultItems: [EvaluateItem] = []
for item in inputItems {
var composingText = ComposingText()
composingText.insertAtCursorPosition(item.query, inputStyle: .direct)
let result = converter.requestCandidates(composingText, options: requestOptions())
let result = converter.requestCandidates(composingText, options: requestOptions)
let mainResults = result.mainResults.filter {
$0.data.reduce(into: "", {$0.append(contentsOf: $1.ruby)}) == item.query.toKatakana()
}
@ -53,6 +58,8 @@ extension Subcommands {
}
)
)
// Explicitly reset state
converter.stopComposition()
}
let end = Date()
var result = EvaluateResult(n_best: self.configNBest, execution_time: end.timeIntervalSince(start), items: resultItems)
@ -94,6 +101,7 @@ extension Subcommands {
shouldResetMemory: false,
memoryDirectoryURL: URL(fileURLWithPath: ""),
sharedContainerURL: URL(fileURLWithPath: ""),
zenzaiMode: self.zenzWeightPath.isEmpty ? .off : .on(weight: URL(string: self.zenzWeightPath)!, inferenceLimit: self.configZenzaiInferenceLimit),
metadata: .init(versionString: "anco for debugging")
)
option.requestQuery = .
@ -101,7 +109,7 @@ extension Subcommands {
}
}
struct InputItem {
private struct InputItem {
///
var query: String


@ -3,7 +3,7 @@ import ArgumentParser
import Foundation
extension Subcommands {
struct Run: ParsableCommand {
struct Run: AsyncParsableCommand {
@Argument(help: "ひらがなで表記された入力")
var input: String = ""
@ -11,6 +11,10 @@ extension Subcommands {
var configNBest: Int = 10
@Option(name: [.customShort("n"), .customLong("top_n")], help: "Display top n candidates.")
var displayTopN: Int = 1
@Option(name: [.customLong("zenz")], help: "gguf format model weight for zenz.")
var zenzWeightPath: String = ""
@Option(name: [.customLong("config_zenzai_inference_limit")], help: "inference limit for zenzai.")
var configZenzaiInferenceLimit: Int = .max
@Flag(name: [.customLong("disable_prediction")], help: "Disable producing prediction candidates.")
var disablePrediction = false
@ -23,7 +27,7 @@ extension Subcommands {
static var configuration = CommandConfiguration(commandName: "run", abstract: "Show help for this utility.")
@MainActor mutating func run() {
@MainActor mutating func run() async {
let converter = KanaKanjiConverter()
var composingText = ComposingText()
composingText.insertAtCursorPosition(input, inputStyle: .direct)
@ -66,6 +70,7 @@ extension Subcommands {
shouldResetMemory: false,
memoryDirectoryURL: URL(fileURLWithPath: ""),
sharedContainerURL: URL(fileURLWithPath: ""),
zenzaiMode: self.zenzWeightPath.isEmpty ? .off : .on(weight: URL(string: self.zenzWeightPath)!, inferenceLimit: self.configZenzaiInferenceLimit),
metadata: .init(versionString: "anco for debugging")
)
if self.onlyWholeConversion {


@ -0,0 +1,102 @@
import KanaKanjiConverterModuleWithDefaultDictionary
import ArgumentParser
import Foundation
extension Subcommands {
struct Session: AsyncParsableCommand {
@Argument(help: "ひらがなで表記された入力")
var input: String = ""
@Option(name: [.customLong("config_n_best")], help: "The parameter n (n best parameter) for internal viterbi search.")
var configNBest: Int = 10
@Option(name: [.customShort("n"), .customLong("top_n")], help: "Display top n candidates.")
var displayTopN: Int = 1
@Option(name: [.customLong("zenz")], help: "gguf format model weight for zenz.")
var zenzWeightPath: String = ""
@Flag(name: [.customLong("disable_prediction")], help: "Disable producing prediction candidates.")
var disablePrediction = false
@Flag(name: [.customLong("only_whole_conversion")], help: "Show only whole conversion (完全一致変換).")
var onlyWholeConversion = false
@Flag(name: [.customLong("report_score")], help: "Show internal score for the candidate.")
var reportScore = false
@Flag(name: [.customLong("roman2kana")], help: "Use roman2kana input.")
var roman2kana = false
@Option(name: [.customLong("config_zenzai_inference_limit")], help: "inference limit for zenzai.")
var configZenzaiInferenceLimit: Int = .max
static var configuration = CommandConfiguration(commandName: "session", abstract: "Start session for incremental input.")
@MainActor mutating func run() async {
let converter = KanaKanjiConverter()
var composingText = ComposingText()
let inputStyle: InputStyle = self.roman2kana ? .roman2kana : .direct
while true {
print()
print("\(bold: "== type :q to end session, type :d to delete character, type :c to stop composition, type any other text to input ==")")
let input = readLine(strippingNewline: true) ?? ""
switch input {
case ":q": return
case ":d":
composingText.deleteBackwardFromCursorPosition(count: 1)
case ":c":
composingText.stopComposition()
converter.stopComposition()
print("composition is stopped")
continue
default:
composingText.insertAtCursorPosition(input, inputStyle: inputStyle)
}
print(composingText.convertTarget)
let start = Date()
let result = converter.requestCandidates(composingText, options: requestOptions())
let mainResults = result.mainResults.filter {
!self.onlyWholeConversion || $0.data.reduce(into: "", {$0.append(contentsOf: $1.ruby)}) == input.toKatakana()
}
for candidate in mainResults.prefix(self.displayTopN) {
if self.reportScore {
print("\(candidate.text) \(bold: "score:") \(candidate.value)")
} else {
print(candidate.text)
}
}
if self.onlyWholeConversion {
// entropy
let mean = mainResults.reduce(into: 0) { $0 += Double($1.value) } / Double(mainResults.count)
let expValues = mainResults.map { exp(Double($0.value) - mean) }
let sumOfExpValues = expValues.reduce(into: 0, +=)
//
let probs = mainResults.map { exp(Double($0.value) - mean) / sumOfExpValues }
let entropy = -probs.reduce(into: 0) { $0 += $1 * log($1) }
print("\(bold: "Entropy:") \(entropy)")
}
print("\(bold: "Time:") \(-start.timeIntervalSinceNow)")
}
}
func requestOptions() -> ConvertRequestOptions {
var option: ConvertRequestOptions = .withDefaultDictionary(
N_best: self.onlyWholeConversion ? max(self.configNBest, self.displayTopN) : self.configNBest,
requireJapanesePrediction: !self.onlyWholeConversion && !self.disablePrediction,
requireEnglishPrediction: false,
keyboardLanguage: .ja_JP,
typographyLetterCandidate: false,
unicodeCandidate: true,
englishCandidateInRoman2KanaInput: true,
fullWidthRomanCandidate: false,
halfWidthKanaCandidate: false,
learningType: .nothing,
maxMemoryCount: 0,
shouldResetMemory: false,
memoryDirectoryURL: URL(fileURLWithPath: ""),
sharedContainerURL: URL(fileURLWithPath: ""),
zenzaiMode: self.zenzWeightPath.isEmpty ? .off : .on(weight: URL(string: self.zenzWeightPath)!, inferenceLimit: self.configZenzaiInferenceLimit),
metadata: .init(versionString: "anco for debugging")
)
if self.onlyWholeConversion {
option.requestQuery = .
}
return option
}
}
}
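Note: the entropy reported under --only_whole_conversion above is a mean-shifted softmax over the candidates' internal scores, followed by Shannon entropy. A minimal standalone sketch of the same computation (the function name is hypothetical):

import Foundation

// Sketch of the entropy computation used by the session command above.
// Scores are the candidates' internal values; the mean shift mirrors the code above.
func candidateEntropy(scores: [Double]) -> Double {
    let mean = scores.reduce(0, +) / Double(scores.count)
    let expValues = scores.map { exp($0 - mean) }
    let sumOfExpValues = expValues.reduce(0, +)
    let probs = expValues.map { $0 / sumOfExpValues }
    return -probs.reduce(0) { $0 + $1 * log($1) }
}

// Two equally scored candidates give the maximum entropy log(2) ≈ 0.693:
// candidateEntropy(scores: [1.0, 1.0])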


@ -29,7 +29,7 @@ public struct ConvertRequestOptions: Sendable {
/// - sharedContainerURL:
/// - textReplacer:
/// - metadata: `ConvertRequestOptions.Metadata`
public init(N_best: Int = 10, requireJapanesePrediction: Bool, requireEnglishPrediction: Bool, keyboardLanguage: KeyboardLanguage, typographyLetterCandidate: Bool = false, unicodeCandidate: Bool = true, englishCandidateInRoman2KanaInput: Bool = false, fullWidthRomanCandidate: Bool = false, halfWidthKanaCandidate: Bool = false, learningType: LearningType, maxMemoryCount: Int = 65536, shouldResetMemory: Bool = false, dictionaryResourceURL: URL, memoryDirectoryURL: URL, sharedContainerURL: URL, textReplacer: TextReplacer = TextReplacer(), metadata: ConvertRequestOptions.Metadata?) {
public init(N_best: Int = 10, requireJapanesePrediction: Bool, requireEnglishPrediction: Bool, keyboardLanguage: KeyboardLanguage, typographyLetterCandidate: Bool = false, unicodeCandidate: Bool = true, englishCandidateInRoman2KanaInput: Bool = false, fullWidthRomanCandidate: Bool = false, halfWidthKanaCandidate: Bool = false, learningType: LearningType, maxMemoryCount: Int = 65536, shouldResetMemory: Bool = false, dictionaryResourceURL: URL, memoryDirectoryURL: URL, sharedContainerURL: URL, textReplacer: TextReplacer = TextReplacer(), zenzaiMode: ZenzaiMode = .off, metadata: ConvertRequestOptions.Metadata?) {
self.N_best = N_best
self.requireJapanesePrediction = requireJapanesePrediction
self.requireEnglishPrediction = requireEnglishPrediction
@ -46,10 +46,11 @@ public struct ConvertRequestOptions: Sendable {
self.sharedContainerURL = sharedContainerURL
self.metadata = metadata
self.textReplacer = textReplacer
self.zenzaiMode = zenzaiMode
self.dictionaryResourceURL = dictionaryResourceURL
}
package init(N_best: Int = 10, requireJapanesePrediction: Bool, requireEnglishPrediction: Bool, keyboardLanguage: KeyboardLanguage, typographyLetterCandidate: Bool = false, unicodeCandidate: Bool = true, englishCandidateInRoman2KanaInput: Bool = false, fullWidthRomanCandidate: Bool = false, halfWidthKanaCandidate: Bool = false, learningType: LearningType, maxMemoryCount: Int = 65536, shouldResetMemory: Bool = false, dictionaryResourceURL: URL, memoryDirectoryURL: URL, sharedContainerURL: URL, textReplacer: TextReplacer = TextReplacer(), metadata: ConvertRequestOptions.Metadata?, requestQuery: RequestQuery) {
package init(N_best: Int = 10, requireJapanesePrediction: Bool, requireEnglishPrediction: Bool, keyboardLanguage: KeyboardLanguage, typographyLetterCandidate: Bool = false, unicodeCandidate: Bool = true, englishCandidateInRoman2KanaInput: Bool = false, fullWidthRomanCandidate: Bool = false, halfWidthKanaCandidate: Bool = false, learningType: LearningType, maxMemoryCount: Int = 65536, shouldResetMemory: Bool = false, dictionaryResourceURL: URL, memoryDirectoryURL: URL, sharedContainerURL: URL, textReplacer: TextReplacer = TextReplacer(), zenzaiMode: ZenzaiMode = .off, metadata: ConvertRequestOptions.Metadata?, requestQuery: RequestQuery) {
self.N_best = N_best
self.requireJapanesePrediction = requireJapanesePrediction
self.requireEnglishPrediction = requireEnglishPrediction
@ -66,6 +67,7 @@ public struct ConvertRequestOptions: Sendable {
self.sharedContainerURL = sharedContainerURL
self.metadata = metadata
self.textReplacer = textReplacer
self.zenzaiMode = zenzaiMode
self.dictionaryResourceURL = dictionaryResourceURL
}
@ -88,6 +90,7 @@ public struct ConvertRequestOptions: Sendable {
public var memoryDirectoryURL: URL
public var sharedContainerURL: URL
public var dictionaryResourceURL: URL
public var zenzaiMode: ZenzaiMode
//
public var metadata: Metadata?
@ -138,4 +141,19 @@ public struct ConvertRequestOptions: Sendable {
case `default`
case
}
public struct ZenzaiMode: Sendable, Equatable {
public static let off = ZenzaiMode(enabled: false, weightURL: URL(fileURLWithPath: ""), inferenceLimit: 10)
/// Activate *Zenzai*, the neural kana-kanji conversion engine.
/// - Parameters:
///   - weight: path to the model weight (gguf format)
///   - inferenceLimit: maximum number of inference runs. A smaller limit makes conversion faster but lowers quality. (Default: 10)
public static func on(weight: URL, inferenceLimit: Int = 10) -> Self {
ZenzaiMode(enabled: true, weightURL: weight, inferenceLimit: inferenceLimit)
}
var enabled: Bool
var weightURL: URL
var inferenceLimit: Int
}
}
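Note: a minimal usage sketch for the new zenzaiMode option, mirroring the parameters the anco subcommands pass above; the weight path here is only a placeholder.

import Foundation
import KanaKanjiConverterModuleWithDefaultDictionary

// Sketch: enable Zenzai via ConvertRequestOptions (weight path is a placeholder).
let options: ConvertRequestOptions = .withDefaultDictionary(
    N_best: 10,
    requireJapanesePrediction: false,
    requireEnglishPrediction: false,
    keyboardLanguage: .ja_JP,
    typographyLetterCandidate: false,
    unicodeCandidate: true,
    englishCandidateInRoman2KanaInput: true,
    fullWidthRomanCandidate: false,
    halfWidthKanaCandidate: false,
    learningType: .nothing,
    maxMemoryCount: 0,
    shouldResetMemory: false,
    memoryDirectoryURL: URL(fileURLWithPath: ""),
    sharedContainerURL: URL(fileURLWithPath: ""),
    zenzaiMode: .on(weight: URL(fileURLWithPath: "./zenz.gguf"), inferenceLimit: 10),
    metadata: nil
)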


@ -25,15 +25,37 @@ import SwiftUtils
private var nodes: [[LatticeNode]] = []
private var completedData: Candidate?
private var lastData: DicdataElement?
/// Zenzai (zenz-v1) model
private var zenz: Zenz? = nil
private var zenzaiCache: Kana2Kanji.ZenzaiCache? = nil
public private(set) var zenzStatus: String = ""
///
public func stopComposition() {
self.zenz?.endSession()
self.zenzaiCache = nil
self.previousInputData = nil
self.nodes = []
self.completedData = nil
self.lastData = nil
}
private func getModel(modelURL: URL) -> Zenz? {
if let model = self.zenz, model.resourceURL == modelURL {
self.zenzStatus = "load \(modelURL.absoluteString)"
return model
} else {
do {
self.zenz = try Zenz(resourceURL: modelURL)
self.zenzStatus = "load \(modelURL.absoluteString)"
return self.zenz
} catch {
self.zenzStatus = "load \(modelURL.absoluteString) " + error.localizedDescription
return nil
}
}
}
/// SpellChecker
public func setKeyboardLanguage(_ language: KeyboardLanguage) {
if !checkerInitialized[language, default: false] {
@ -429,10 +451,27 @@ import SwiftUtils
// 5
let whole_sentence_unique_candidates = self.getUniqueCandidate(sums.map {$0.1})
if case . = options.requestQuery {
// return
if options.zenzaiMode.enabled {
return ConversionResult(mainResults: whole_sentence_unique_candidates, firstClauseResults: [])
} else {
return ConversionResult(mainResults: whole_sentence_unique_candidates.sorted(by: {$0.value > $1.value}), firstClauseResults: [])
}
let sentence_candidates = whole_sentence_unique_candidates.min(count: 5, sortedBy: {$0.value > $1.value})
}
//
let sentence_candidates: [Candidate]
if options.zenzaiMode.enabled {
// FIXME:
// candidatevalueZenzairerank
// `Candidate`AI
var first5 = Array(whole_sentence_unique_candidates.prefix(5))
var values = first5.map(\.value).sorted(by: >)
for (i, v) in zip(first5.indices, values) {
first5[i].value = v
}
sentence_candidates = first5
} else {
sentence_candidates = whole_sentence_unique_candidates.min(count: 5, sortedBy: {$0.value > $1.value})
}
// 3
let prediction_candidates: [Candidate] = options.requireJapanesePrediction ? Array(self.getUniqueCandidate(self.getPredictionCandidate(sums, composingText: inputData, options: options)).min(count: 3, sortedBy: {$0.value > $1.value})) : []
@ -447,7 +486,7 @@ import SwiftUtils
}
// 538
let best8 = getUniqueCandidate(sentence_candidates.chained(prediction_candidates)).sorted {$0.value > $1.value}
let best8 = getUniqueCandidate(sentence_candidates.prefix(5).chained(prediction_candidates)).sorted {$0.value > $1.value}
//
let toplevel_additional_candidate = self.getTopLevelAdditionalCandidate(inputData, options: options)
// best8foreign_candidateszeroHintPrediction_candidatestoplevel_additional_candidate5
@ -522,11 +561,19 @@ import SwiftUtils
/// - N_best:
/// - Returns:
///
private func convertToLattice(_ inputData: ComposingText, N_best: Int) -> (result: LatticeNode, nodes: [[LatticeNode]])? {
private func convertToLattice(_ inputData: ComposingText, N_best: Int, zenzaiMode: ConvertRequestOptions.ZenzaiMode) -> (result: LatticeNode, nodes: [[LatticeNode]])? {
if inputData.convertTarget.isEmpty {
return nil
}
// FIXME: enable cache based zenzai
if zenzaiMode.enabled, let model = self.getModel(modelURL: zenzaiMode.weightURL) {
let (result, nodes, cache) = self.converter.all_zenzai(inputData, zenz: model, zenzaiCache: self.zenzaiCache, inferenceLimit: zenzaiMode.inferenceLimit)
self.zenzaiCache = cache
self.previousInputData = inputData
return (result, nodes)
}
guard let previousInputData else {
debug("convertToLattice: 新規計算用の関数を呼びますA")
let result = converter.kana2lattice_all(inputData, N_best: N_best)
@ -621,7 +668,7 @@ import SwiftUtils
// DicdataStoreRequestOption
self.sendToDicdataStore(.setRequestOptions(options))
guard let result = self.convertToLattice(inputData, N_best: options.N_best) else {
guard let result = self.convertToLattice(inputData, N_best: options.N_best, zenzaiMode: options.zenzaiMode) else {
return ConversionResult(mainResults: [], firstClauseResults: [])
}


@ -219,7 +219,6 @@ public final class DicdataStore {
}
// MARK:
var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
// MARK:
let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
@ -310,6 +309,73 @@ public final class DicdataStore {
}
}
/// kana2lattice
/// - Parameters:
/// - inputData:
/// - from:
/// - toIndexRange: `from ..< (toIndexRange)`
public func getFrozenLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil) -> [LatticeNode] {
let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
debug("getLOUDSDataInRange", fromIndex, toIndexRange?.description ?? "nil", toIndexLeft, toIndexRight)
if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
debug("getLOUDSDataInRange: index is wrong")
return []
}
let segments = (fromIndex ..< toIndexRight).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
}
let character = String(inputData.input[fromIndex].character.toKatakana())
let characterNode = LatticeNode(data: DicdataElement(word: character, ruby: character, cid: CIDData..cid, mid: MIDData..mid, value: -10), inputRange: fromIndex ..< fromIndex + 1)
if fromIndex == .zero {
characterNode.prevs.append(.BOSNode())
}
// MARK:
var stringToEndIndex = inputData.getRanges(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
// MARK:
guard let (minString, maxString) = stringToEndIndex.keys.minAndMax(by: {$0.count < $1.count}) else {
return [characterNode]
}
let maxIDs = maxString.map(self.character2charId)
var keys = [String(stringToEndIndex.keys.first!.first!), "user"]
if learningManager.enabled {
keys.append("memory")
}
// MARK: indices
var dicdata: [DicdataElement] = []
let depth = minString.count - 1 ..< maxString.count
for identifier in keys {
dicdata.append(contentsOf: self.getDicdataFromLoudstxt3(identifier: identifier, indices: self.throughMatchLOUDS(identifier: identifier, charIDs: maxIDs, depth: depth)))
}
if learningManager.enabled {
// temporalpenalty
dicdata.append(contentsOf: self.learningManager.temporaryThroughMatch(charIDs: consume maxIDs, depth: depth))
}
for i in toIndexLeft ..< toIndexRight {
dicdata.append(contentsOf: self.getWiseDicdata(convertTarget: segments[i - fromIndex], inputData: inputData, inputRange: fromIndex ..< i + 1))
dicdata.append(contentsOf: self.getMatchOSUserDict(segments[i - fromIndex]))
}
if fromIndex == .zero {
return dicdata.compactMap {
guard let endIndex = stringToEndIndex[Array($0.ruby)] else {
return nil
}
let node = LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
node.prevs.append(RegisteredNode.BOSNode())
return node
} + [characterNode]
} else {
return dicdata.compactMap {
guard let endIndex = stringToEndIndex[Array($0.ruby)] else {
return nil
}
return LatticeNode(data: $0, inputRange: fromIndex ..< endIndex + 1)
} + [characterNode]
}
}
/// kana2latticelouds
/// - Parameters:
/// - inputData:
@ -727,12 +793,8 @@ public final class DicdataStore {
/// wordTypes使
private static let PREPOSITION_wordIDs: Set<Int> = [1315, 6, 557, 558, 559, 560]
/// wordTypes使
private static let INPOSITION_wordIDs: Set<Int> = Set<Int>(Array(561..<868)
+ Array(1283..<1297)
+ Array(1306..<1310)
+ Array(11..<53)
+ Array(555..<557)
+ Array(1281..<1283)
private static let INPOSITION_wordIDs: Set<Int> = Set<Int>(
Array(561..<868).chained(1283..<1297).chained(1306..<1310).chained(11..<53).chained(555..<557).chained(1281..<1283)
).union([1314, 3, 2, 4, 5, 1, 9])
/*


@ -97,6 +97,70 @@ extension ComposingText {
return Dictionary(stringToInfo, uniquingKeysWith: {$0.penalty < $1.penalty ? $1 : $0})
}
/// closedRange
/// e.g. `left=4, rightIndexRange=6..<10` yields the ranges `4...6, 4...7, 4...8, 4...9`
/// `left <= rightIndexRange.startIndex`
func getRanges(_ left: Int, rightIndexRange: Range<Int>) -> [[Character]: Int] {
let count = rightIndexRange.endIndex - left
debug("getRangesWithTypos", left, rightIndexRange, count)
let nodes = (0..<count).map {(i: Int) in
Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
let j = i + k
if count <= j {
return []
}
return Self.getTypo(self.input[left + i ... left + j], frozen: true)
}
}
// Performance Tuning NoteDictionaryArrayDictionary
var stringToInfo: [([Character], Int)] = []
//
var stack: [(convertTargetElements: [ConvertTargetElement], lastElement: InputElement, count: Int)] = nodes[0].compactMap { typoCandidate in
guard let firstElement = typoCandidate.inputElements.first else {
return nil
}
if Self.isLeftSideValid(first: firstElement, of: self.input, from: left) {
var convertTargetElements = [ConvertTargetElement]()
for element in typoCandidate.inputElements {
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
}
return (convertTargetElements, typoCandidate.inputElements.last!, typoCandidate.inputElements.count)
}
return nil
}
while var (convertTargetElements, lastElement, count) = stack.popLast() {
if rightIndexRange.contains(count + left - 1) {
if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: self.input, to: count + left, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
stringToInfo.append((convertTarget, (count + left - 1)))
}
}
//
if nodes.endIndex <= count {
continue
}
stack.append(contentsOf: nodes[count].compactMap {
if count + $0.inputElements.count > nodes.endIndex {
return nil
}
for element in $0.inputElements {
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
}
if shouldBeRemovedForDicdataStore(components: convertTargetElements) {
return nil
}
return (
convertTargetElements: convertTargetElements,
lastElement: $0.inputElements.last!,
count: count + $0.inputElements.count
)
})
}
return Dictionary(stringToInfo, uniquingKeysWith: {$0 < $1 ? $1 : $0})
}
func getRangeWithTypos(_ left: Int, _ right: Int) -> [[Character]: PValue] {
// i
// input = [d(), r(s), r(i), r(t), r(s), d(), d(), d()]
@ -178,19 +242,20 @@ extension ComposingText {
return Dictionary(stringToPenalty, uniquingKeysWith: max)
}
private static func getTypo(_ elements: some Collection<InputElement>) -> [TypoCandidate] {
private static func getTypo(_ elements: some Collection<InputElement>, frozen: Bool = false) -> [TypoCandidate] {
let key = elements.reduce(into: "") {$0.append($1.character)}.toKatakana()
if (elements.allSatisfy {$0.inputStyle == .direct}) {
let dictionary: [String: [TypoUnit]] = frozen ? [:] : Self.directPossibleTypo
if key.count > 1 {
return Self.directPossibleTypo[key, default: []].map {
return dictionary[key, default: []].map {
TypoCandidate(
inputElements: $0.value.map {InputElement(character: $0, inputStyle: .direct)},
weight: $0.weight
)
}
} else if key.count == 1 {
var result = Self.directPossibleTypo[key, default: []].map {
var result = dictionary[key, default: []].map {
TypoCandidate(
inputElements: $0.value.map {InputElement(character: $0, inputStyle: .direct)},
weight: $0.weight
@ -202,15 +267,16 @@ extension ComposingText {
}
}
if (elements.allSatisfy {$0.inputStyle == .roman2kana}) {
let dictionary: [String: [String]] = frozen ? [:] : Self.roman2KanaPossibleTypo
if key.count > 1 {
return Self.roman2KanaPossibleTypo[key, default: []].map {
return dictionary[key, default: []].map {
TypoCandidate(
inputElements: $0.map {InputElement(character: $0, inputStyle: .roman2kana)},
weight: 3.5
)
}
} else if key.count == 1 {
var result = Self.roman2KanaPossibleTypo[key, default: []].map {
var result = dictionary[key, default: []].map {
TypoCandidate(
inputElements: $0.map {InputElement(character: $0, inputStyle: .roman2kana)},
weight: 3.5


@ -0,0 +1,100 @@
import Foundation
import SwiftUtils
extension Kana2Kanji {
/// ,
/// - Parameters:
/// - inputData:
/// - N_best: N_best
/// - Returns:
///
/// ###
/// (0)
///
/// (1)
///
/// (2)(1)registerN_best
///
/// (3)(1)registerresultEOS
///
/// (4)
func kana2lattice_all_with_prefix_constraint(_ inputData: ComposingText, N_best: Int, constraint: String) -> (result: LatticeNode, nodes: Nodes) {
debug("新規に計算を行います。inputされた文字列は\(inputData.input.count)文字分の\(inputData.convertTarget)。制約は\(constraint)")
let count: Int = inputData.input.count
let result: LatticeNode = LatticeNode.EOSNode
let nodes: [[LatticeNode]] = (.zero ..< count).map {dicdataStore.getFrozenLOUDSDataInRange(inputData: inputData, from: $0)}
// inodes
for (i, nodeArray) in nodes.enumerated() {
// node
for node in nodeArray {
if node.prevs.isEmpty {
continue
}
if self.dicdataStore.shouldBeRemoved(data: node.data) {
continue
}
//
let wValue: PValue = node.data.value()
if i == 0 {
// values
node.values = node.prevs.map {$0.totalValue + wValue + self.dicdataStore.getCCValue($0.data.rcid, node.data.lcid)}
} else {
// values
node.values = node.prevs.map {$0.totalValue + wValue}
}
//
let nextIndex: Int = node.inputRange.endIndex
// count
if nextIndex == count {
for index in node.prevs.indices {
let newnode: RegisteredNode = node.getRegisteredNode(index, value: node.values[index])
let text = newnode.getCandidateData().data.reduce(into: "") { $0.append(contentsOf: $1.word)} + node.data.word
if text.hasPrefix(constraint) {
result.prevs.append(newnode)
}
}
} else {
let candidates = node.getCandidateData().map {
$0.data.reduce(into: "") { $0.append(contentsOf: $1.word)} + node.data.word
}
// nodenextnode
for nextnode in nodes[nextIndex] {
// node.registered.isEmpty
if self.dicdataStore.shouldBeRemoved(data: nextnode.data) {
continue
}
//
let ccValue: PValue = self.dicdataStore.getCCValue(node.data.rcid, nextnode.data.lcid)
// nodeprevnode
for (index, value) in node.values.enumerated() {
//
// common prefix
// AB ABC (OK)
// AB A (OK)
// AB AC (NG)
let text = candidates[index] + nextnode.data.word
if !text.hasPrefix(constraint) && !constraint.hasPrefix(text) {
continue
}
let newValue: PValue = ccValue + value
// index
let lastindex: Int = (nextnode.prevs.lastIndex(where: {$0.totalValue >= newValue}) ?? -1) + 1
if lastindex == N_best {
continue
}
let newnode: RegisteredNode = node.getRegisteredNode(index, value: newValue)
//
if nextnode.prevs.count >= N_best {
nextnode.prevs.removeLast()
}
// removeinsert (insertO(N))
nextnode.prevs.insert(newnode, at: lastindex)
}
}
}
}
}
return (result: result, nodes: nodes)
}
}
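Note: the "common prefix" rule in the constrained search above reduces to a small predicate; a standalone sketch with hypothetical names:

// Sketch of the prefix-compatibility check used during constrained search above:
// a partial surface string survives only if it and the constraint agree on their
// common prefix, i.e. one of the two is a prefix of the other.
func isCompatible(_ text: String, with constraint: String) -> Bool {
    text.hasPrefix(constraint) || constraint.hasPrefix(text)
}

// isCompatible("ABC", with: "AB")  // true:  constraint already satisfied
// isCompatible("A",   with: "AB")  // true:  can still grow into the constraint
// isCompatible("AC",  with: "AB")  // false: diverged, so the path is pruned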


@ -0,0 +1,139 @@
import Foundation
import SwiftUtils
extension Kana2Kanji {
struct ZenzaiCache: Sendable {
init(_ inputData: ComposingText, constraint: String, satisfyingCandidate: Candidate?) {
self.inputData = inputData
self.prefixConstraint = constraint
self.satisfyingCandidate = satisfyingCandidate
}
private var prefixConstraint: String
private var satisfyingCandidate: Candidate?
private var inputData: ComposingText
func getNewConstraint(for newInputData: ComposingText) -> String {
if let satisfyingCandidate {
var current = newInputData.convertTarget.toKatakana()[...]
var constraint = ""
for item in satisfyingCandidate.data {
if current.hasPrefix(item.ruby) {
constraint += item.word
current = current.dropFirst(item.ruby.count)
}
}
return constraint
} else if newInputData.convertTarget.hasPrefix(inputData.convertTarget) {
return self.prefixConstraint
} else {
return ""
}
}
}
/// zenzai
@MainActor func all_zenzai(_ inputData: ComposingText, zenz: Zenz, zenzaiCache: ZenzaiCache?, inferenceLimit: Int) -> (result: LatticeNode, nodes: Nodes, cache: ZenzaiCache) {
var constraint = zenzaiCache?.getNewConstraint(for: inputData) ?? ""
print("initial constraint", constraint)
let eosNode = LatticeNode.EOSNode
var nodes: Kana2Kanji.Nodes = []
var inferenceLimit = inferenceLimit
while true {
// 2-best
let start = Date()
let draftResult = self.kana2lattice_all_with_prefix_constraint(inputData, N_best: 2, constraint: constraint)
if nodes.isEmpty {
//
nodes = draftResult.nodes
}
let candidates = draftResult.result.getCandidateData().map(self.processClauseCandidate)
var best: (Int, Candidate)? = nil
for (i, cand) in candidates.enumerated() {
if let (_, c) = best, cand.value > c.value {
best = (i, cand)
} else if best == nil {
best = (i, cand)
}
}
guard var (index, candidate) = best else {
print("best was not found!")
// Empty
//
return (eosNode, nodes, ZenzaiCache(inputData, constraint: "", satisfyingCandidate: nil))
}
print("Constrained draft modeling", -start.timeIntervalSinceNow)
reviewLoop: while true {
// results
eosNode.prevs.insert(draftResult.result.prevs[index], at: 0)
if inferenceLimit == 0 {
print("inference limit! \(candidate.text) is used for excuse")
// When inference occurs more than maximum times, then just return result at this point
return (eosNode, nodes, ZenzaiCache(inputData, constraint: constraint, satisfyingCandidate: candidate))
}
let reviewResult = zenz.candidateEvaluate(convertTarget: inputData.convertTarget, candidates: [candidate])
inferenceLimit -= 1
let nextAction = self.review(
candidateIndex: index,
candidates: candidates,
reviewResult: reviewResult,
constraint: &constraint
)
switch nextAction {
case .return(let constraint, let satisfied):
if satisfied {
return (eosNode, nodes, ZenzaiCache(inputData, constraint: constraint, satisfyingCandidate: candidate))
} else {
return (eosNode, nodes, ZenzaiCache(inputData, constraint: constraint, satisfyingCandidate: nil))
}
case .continue:
break reviewLoop
case .retry(let candidateIndex):
index = candidateIndex
candidate = candidates[candidateIndex]
}
}
}
}
private enum NextAction {
case `return`(constraint: String, satisfied: Bool)
case `continue`
case `retry`(candidateIndex: Int)
}
private func review(
candidateIndex: Int,
candidates: [Candidate],
reviewResult: consuming ZenzContext.CandidateEvaluationResult,
constraint: inout String
) -> NextAction {
switch reviewResult {
case .error:
//
print("error")
return .return(constraint: constraint, satisfied: false)
case .pass(let score):
//
print("passed:", score)
return .return(constraint: constraint, satisfied: true)
case .fixRequired(let prefixConstraint):
// 2
if constraint == prefixConstraint {
print("same constraint:", prefixConstraint)
return .return(constraint: "", satisfied: false)
}
//
print("update constraint:", prefixConstraint)
constraint = prefixConstraint
// 使
for i in candidates.indices where i != candidateIndex {
if candidates[i].text.hasPrefix(prefixConstraint) {
print("found \(candidates[i].text) as another retry")
return .retry(candidateIndex: i)
}
}
return .continue
}
}
}
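Note: ZenzaiCache.getNewConstraint above is what lets incremental input reuse a previously accepted result. A standalone sketch of the satisfied-candidate branch (types and example values are hypothetical):

// Walk the accepted candidate's segments and keep the surface words whose
// readings still prefix the (katakana) reading of the extended input.
struct Segment { let word: String; let ruby: String }

func reusedConstraint(newReading: String, accepted: [Segment]) -> String {
    var remaining = newReading[...]
    var constraint = ""
    for segment in accepted where remaining.hasPrefix(segment.ruby) {
        constraint += segment.word
        remaining = remaining.dropFirst(segment.ruby.count)
    }
    return constraint
}

// Example: the previous result 新時代 (reading シンジダイ) still prefixes the
// extended reading シンジダイノ, so it becomes the initial prefix constraint:
// reusedConstraint(newReading: "シンジダイノ",
//                  accepted: [Segment(word: "新時代", ruby: "シンジダイ")])  // "新時代"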


@ -0,0 +1,43 @@
import Foundation
import SwiftUtils
@MainActor final class Zenz {
package var resourceURL: URL
private var zenzContext: ZenzContext?
init(resourceURL: URL) throws {
self.resourceURL = resourceURL
do {
#if canImport(Darwin)
if #available(iOS 15, macOS 13, *) {
self.zenzContext = try ZenzContext.createContext(path: resourceURL.path(percentEncoded: false))
} else {
// this is not percent-encoded
self.zenzContext = try ZenzContext.createContext(path: resourceURL.path)
}
#else
// this is not percent-encoded
self.zenzContext = try ZenzContext.createContext(path: resourceURL.path)
#endif
debug("Loaded model \(resourceURL.lastPathComponent)")
} catch {
throw error
}
}
func startSession() {}
func endSession() {
try? self.zenzContext?.reset_context()
}
func candidateEvaluate(convertTarget: String, candidates: [Candidate]) -> ZenzContext.CandidateEvaluationResult {
guard let zenzContext else {
return .error
}
for candidate in candidates {
let result = zenzContext.evaluate_candidate(input: convertTarget.toKatakana(), candidate: candidate.text)
return result
}
return .error
}
}


@ -0,0 +1,233 @@
import llama
import SwiftUtils
import Foundation
enum ZenzError: LocalizedError {
case couldNotLoadModel(path: String)
case couldNotLoadContext
var errorDescription: String? {
switch self {
case .couldNotLoadContext: "failed to load context"
case .couldNotLoadModel(path: let path): "could not load model weight at \(path)"
}
}
}
class ZenzContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var prevInput: [llama_token] = []
private let n_len: Int32 = 512
init(model: OpaquePointer, context: OpaquePointer) {
self.model = model
self.context = context
}
deinit {
llama_free(context)
llama_free_model(model)
llama_backend_free()
}
private static var ctx_params: llama_context_params {
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
debug("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 512
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_batch = 512
return ctx_params
}
static func createContext(path: String) throws -> ZenzContext {
llama_backend_init()
var model_params = llama_model_default_params()
model_params.use_mmap = true
let model = llama_load_model_from_file(path, model_params)
guard let model else {
debug("Could not load model at \(path)")
throw ZenzError.couldNotLoadModel(path: path)
}
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
}
return ZenzContext(model: model, context: context)
}
func reset_context() throws {
llama_free(self.context)
let context = llama_new_context_with_model(self.model, Self.ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
}
self.context = context
}
private func get_logits(tokens: [llama_token], logits_start_index: Int = 0) -> UnsafeMutablePointer<Float>? {
// manage kv_cache
do {
let commonTokens = self.prevInput.commonPrefix(with: tokens)
llama_kv_cache_seq_rm(context, 0, llama_pos(commonTokens.count), -1)
}
var batch = llama_batch_init(512, 0, 1)
let n_ctx = llama_n_ctx(context)
let n_kv_req = tokens.count + (Int(n_len) - tokens.count)
if n_kv_req > n_ctx {
debug("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
}
for i in tokens.indices {
llama_batch_add(&batch, tokens[i], Int32(i), [0], logits: logits_start_index <= i)
}
//
if llama_decode(context, batch) != 0 {
debug("llama_decode() failed")
return nil
}
return llama_get_logits(context)
}
func evaluate(text: String, ignorePrompt: String = "") -> Float {
let tokens_list = self.tokenize(text: text, add_bos: true, add_eos: true)
guard let logits = self.get_logits(tokens: tokens_list) else {
debug("logits unavailable")
return .nan
}
let tokenizedPromptCount = ignorePrompt.isEmpty ? 1 : tokenize(text: ignorePrompt, add_bos: true, add_eos: false).count
let n_vocab = llama_n_vocab(model)
var sum: Float = 0
//
for (i, token_id) in tokens_list.indexed().dropFirst(tokenizedPromptCount) {
// FIXME: there could be more efficient implementations, possibly using Accelerate or other frameworks.
var log_prob: Float = 0
for index in ((i - 1) * Int(n_vocab)) ..< (i * Int(n_vocab)) {
log_prob += exp(logits[index])
}
log_prob = log(log_prob)
log_prob = logits[Int((i - 1) * Int(n_vocab) + Int(token_id))] - log_prob
sum += log_prob
}
return sum
}
enum CandidateEvaluationResult: Sendable, Equatable, Hashable {
case error
case pass(score: Float)
case fixRequired(prefixConstraint: String)
}
func evaluate_candidate(input: String, candidate: String) -> CandidateEvaluationResult {
// For zenz-v1 model, \u{EE00} is a token used for 'start query', and \u{EE01} is a token used for 'start answer'
// We assume \u{EE01}\(candidate) is always split into \u{EE01}_\(candidate) by the zenz-v1 tokenizer
let prompt = "\u{EE00}\(input)\u{EE01}"
// Therefore, tokens = prompt_tokens + candidate_tokens is an appropriate operation.
let prompt_tokens = self.tokenize(text: prompt, add_bos: true, add_eos: false)
let candidate_tokens = self.tokenize(text: candidate, add_bos: false, add_eos: false)
let tokens = prompt_tokens + candidate_tokens
let startOffset = prompt_tokens.count - 1
let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
print("pos max:", pos_max)
guard let logits = self.get_logits(tokens: tokens, logits_start_index: startOffset) else {
debug("logits unavailable")
return .error
}
let n_vocab = llama_n_vocab(model)
var score: Float = 0
for (i, token_id) in tokens.indexed().dropFirst(prompt_tokens.count) {
//
// softmaxmaxlogits
// log_probsoftmax
var exp_sum: Float = 0
var max_token: llama_token = 0
var max_exp: Float = .infinity * -1
let startIndex = (i - 1 - startOffset) * Int(n_vocab)
let endIndex = (i - startOffset) * Int(n_vocab)
for index in startIndex ..< endIndex {
let v = exp(logits[index])
exp_sum += v
if max_exp < v {
max_exp = v
max_token = llama_token(index - startIndex)
}
}
//
if max_token != token_id {
var cchars = tokens[..<i].reduce(into: []) {
$0.append(contentsOf: token_to_piece(token: $1))
}
// adding "\0"
cchars += token_to_piece(token: max_token) + [0]
let string = String(cString: cchars)
//
let prefixConstraint = String(string.dropFirst(prompt.count))
return .fixRequired(prefixConstraint: prefixConstraint)
}
score += log(max_exp) - log(exp_sum)
}
return .pass(score: score)
}
private func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], logits: Bool) {
batch.token [Int(batch.n_tokens)] = id
batch.pos [Int(batch.n_tokens)] = pos
batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
for i in 0..<seq_ids.count {
batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
}
batch.logits [Int(batch.n_tokens)] = logits ? 1 : 0
batch.n_tokens += 1
}
private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
let text = text.lowercased()
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = if tokenCount < 0 {
[llama_token_bos(model)]
} else {
(0..<tokenCount).map{tokens[Int($0)]}
}
tokens.deallocate()
if add_eos {
swiftTokens.append(llama_token_eos(model))
}
return swiftTokens
}
/// - note: The result does not contain null-terminator
private func token_to_piece(token: llama_token) -> [CChar] {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
result.initialize(repeating: Int8(0), count: 8)
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
}
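Note: both evaluation paths above reduce to per-token log-softmax values over the model's logits. A minimal standalone sketch (like the original code, it skips the usual max-shift before exp):

import Foundation

// Log-probability of one token given a single row of n_vocab logits.
func logProbability(of token: Int, in logits: [Float]) -> Float {
    let logSumExp = log(logits.map { exp($0) }.reduce(0, +))
    return logits[token] - logSumExp
}

// evaluate(text:) sums this quantity over the tokens of the text itself, while
// evaluate_candidate(input:candidate:) scores the argmax token at each position and
// returns .fixRequired as soon as that argmax disagrees with the candidate's token.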


@ -17,6 +17,7 @@ public extension ConvertRequestOptions {
shouldResetMemory: Bool = false,
memoryDirectoryURL: URL,
sharedContainerURL: URL,
zenzaiMode: ZenzaiMode = .off,
textReplacer: TextReplacer = TextReplacer(),
metadata: ConvertRequestOptions.Metadata?
) -> Self {
@ -44,6 +45,7 @@ public extension ConvertRequestOptions {
memoryDirectoryURL: memoryDirectoryURL,
sharedContainerURL: sharedContainerURL,
textReplacer: textReplacer,
zenzaiMode: zenzaiMode,
metadata: metadata
)
}


@ -10,7 +10,7 @@
import XCTest
final class LOUDSTests: XCTestCase {
static var resourceURL = Bundle.module.resourceURL!.standardizedFileURL.appendingPathComponent("DictionaryMock", isDirectory: true)
static let resourceURL = Bundle.module.resourceURL!.standardizedFileURL.appendingPathComponent("DictionaryMock", isDirectory: true)
func requestOptions() -> ConvertRequestOptions {
var options: ConvertRequestOptions = .default
options.dictionaryResourceURL = Self.resourceURL


@ -19,8 +19,8 @@ final class ConverterTests: XCTestCase {
func requestOptions() -> ConvertRequestOptions {
.withDefaultDictionary(
N_best: 5,
requireJapanesePrediction: true,
N_best: 10,
requireJapanesePrediction: false,
requireEnglishPrediction: false,
keyboardLanguage: .ja_JP,
typographyLetterCandidate: false,
@ -38,46 +38,41 @@ final class ConverterTests: XCTestCase {
}
func testFullConversion() async throws {
await MainActor.run {
do {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
c.insertAtCursorPosition("あずーきーはしんじだいのきーぼーどあぷりです", inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
XCTAssertEqual(results.mainResults.first?.text, "azooKeyは新時代のキーボードアプリです")
}
do {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
c.insertAtCursorPosition("ようしょうきからてにすすいえいやきゅうしょうりんじけんぽうなどさまざまなすぽーつをけいけんしながらそだちしょうがっこうじだいはろさんぜるすきんこうにたいざいしておりごるふやてにすをならっていた", inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
XCTAssertEqual(results.mainResults.first?.text, "幼少期からテニス水泳野球少林寺拳法など様々なスポーツを経験しながら育ち小学校時代はロサンゼルス近郊に滞在しておりゴルフやテニスを習っていた")
}
}
}
// 1
// memo:
func testGradualConversion() async throws {
await MainActor.run {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
let text = "ようしょうきからてにすすいえいやきゅうしょうりんじけんぽうなどさまざまなすぽーつをけいけんしながらそだちしょうがっこうじだいはろさんぜるすきんこうにたいざいしておりごるふやてにすをならっていた"
for char in text {
c.insertAtCursorPosition(String(char), inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
if c.input.count == text.count {
XCTAssertEqual(results.mainResults.first?.text, "幼少期からテニス水泳野球少林寺拳法など様々なスポーツを経験しながら育ち小学校時代はロサンゼルス近郊に滞在しておりゴルフやテニスを習っていた")
}
}
}
}
// 1
// memo:
func testRoman2KanaGradualConversion() async throws {
await MainActor.run {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
let text = "youshoukikaratenisusuieiyakyuushourinjikenpounadosamazamanasupoーtuwokeikennsinagarasodatishougakkouzidaiharosanzerusukinkounitaizaisiteorigoruhuyatenisuwonaratteita"
//
@ -87,19 +82,17 @@ final class ConverterTests: XCTestCase {
]
for char in text {
c.insertAtCursorPosition(String(char), inputStyle: .roman2kana)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
if c.input.count == text.count {
XCTAssertTrue(possibles.contains(results.mainResults.first!.text))
}
}
}
}
// 2,3
// memo:
func testSemiGradualConversion() async throws {
await MainActor.run {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
let text = "ようしょうきからてにすすいえいやきゅうしょうりんじけんぽうなどさまざまなすぽーつをけいけんしながらそだちしょうがっこうじだいはろさんぜるすきんこうにたいざいしておりごるふやてにすをならっていた"
var leftIndex = text.startIndex
@ -110,44 +103,40 @@ final class ConverterTests: XCTestCase {
let rightIndex = text.index(leftIndex, offsetBy: count, limitedBy: text.endIndex) ?? text.endIndex
let prefix = String(text[leftIndex ..< rightIndex])
c.insertAtCursorPosition(prefix, inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
leftIndex = rightIndex
if rightIndex == text.endIndex {
XCTAssertEqual(results.mainResults.first?.text, "幼少期からテニス水泳野球少林寺拳法など様々なスポーツを経験しながら育ち小学校時代はロサンゼルス近郊に滞在しておりゴルフやテニスを習っていた")
}
}
}
}
// 1
// memo: deleted_last_n
func testGradualConversionWithDelete() async throws {
await MainActor.run {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
let text = Array("ようしょうきからてにすすいえいやきゅうしょうりんじけんぽうなどさまざまなすぽーつをけいけんしながらそだちしょうがっこうじだいはろさんぜるすきんこうにたいざいしておりごるふやてにすをならっていた")
let deleteIndices = [1, 4, 8, 10, 15, 18, 20, 21, 23, 25, 26, 28, 29, 33, 34, 37, 39, 40, 42, 44, 45, 49, 51, 54, 58, 60, 62, 64, 67, 69, 70, 75, 80]
for (i, char) in text.enumerated() {
c.insertAtCursorPosition(String(char), inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
if deleteIndices.contains(i) {
let count = i % 3 + 1
c.deleteBackwardFromCursorPosition(count: count)
_ = converter.requestCandidates(c, options: requestOptions())
_ = await converter.requestCandidates(c, options: requestOptions())
c.insertAtCursorPosition(String(text[i - count + 1 ... i]), inputStyle: .direct)
_ = converter.requestCandidates(c, options: requestOptions())
_ = await converter.requestCandidates(c, options: requestOptions())
}
if c.input.count == text.count {
XCTAssertEqual(results.mainResults.first?.text, "幼少期からテニス水泳野球少林寺拳法など様々なスポーツを経験しながら育ち小学校時代はロサンゼルス近郊に滞在しておりゴルフやテニスを習っていた")
}
}
}
}
//
func testMustCases() async throws {
await MainActor.run {
//
do {
let cases: [(input: String, expect: String)] = [
@ -162,19 +151,19 @@ final class ConverterTests: XCTestCase {
var options = requestOptions()
options.requireJapanesePrediction = false
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
sequentialInput(&c, sequence: input, inputStyle: .direct)
let results = converter.requestCandidates(c, options: options)
let results = await converter.requestCandidates(c, options: options)
XCTAssertEqual(results.mainResults.first?.text, expect)
}
// gradual input
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
for char in input {
c.insertAtCursorPosition(String(char), inputStyle: .direct)
let results = converter.requestCandidates(c, options: options)
let results = await converter.requestCandidates(c, options: options)
if c.input.count == input.count {
XCTAssertEqual(results.mainResults.first?.text, expect)
}
@ -193,20 +182,20 @@ final class ConverterTests: XCTestCase {
var options = requestOptions()
options.requireJapanesePrediction = false
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
sequentialInput(&c, sequence: input, inputStyle: .roman2kana)
let results = converter.requestCandidates(c, options: options)
let results = await converter.requestCandidates(c, options: options)
XCTAssertEqual(results.mainResults.first?.text, expect)
}
// gradual input
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
for char in input {
c.insertAtCursorPosition(String(char), inputStyle: .roman2kana)
let results = converter.requestCandidates(c, options: options)
let results = await converter.requestCandidates(c, options: options)
if c.input.count == input.count {
XCTAssertEqual(results.mainResults.first?.text, expect)
}
@ -214,12 +203,10 @@ final class ConverterTests: XCTestCase {
}
}
}
}
//
//
func testAccuracy() async throws {
await MainActor.run {
let cases: [(input: String, expect: [String])] = [
("3がつ8にち", ["3月8日"]),
("いっていのわりあい", ["一定の割合"]),
@ -275,10 +262,10 @@ final class ConverterTests: XCTestCase {
var score: Double = 0
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
c.insertAtCursorPosition(input, inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
if expect.contains(results.mainResults[0].text) {
score += 1
@ -292,13 +279,11 @@ final class ConverterTests: XCTestCase {
print("\(#function) Result: accuracy \(accuracy), score \(score), count \(cases.count)")
XCTAssertGreaterThan(accuracy, 0.7) // 0.7 < accuracy
}
}
//
//
//
func testVerbalAccuracy() async throws {
await MainActor.run {
let cases: [(input: String, expect: [String])] = [
("うわああああ、まじか", ["うわああああ、マジか", "うわああああ、まじか"]),
("は?", ["は?"]),
@ -326,10 +311,10 @@ final class ConverterTests: XCTestCase {
var score: Double = 0
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
c.insertAtCursorPosition(input, inputStyle: .direct)
let results = converter.requestCandidates(c, options: requestOptions())
let results = await converter.requestCandidates(c, options: requestOptions())
if expect.contains(results.mainResults[0].text) {
score += 1
@ -343,11 +328,9 @@ final class ConverterTests: XCTestCase {
print("\(#function) Result: accuracy \(accuracy), score \(score), count \(cases.count)")
XCTAssertGreaterThan(accuracy, 0.7) // 0.7 < accuracy
}
}
/// MID
func testMeaningBasedConversionAccuracy() async throws {
await MainActor.run {
let cases: [(input: String, expect: String)] = [
("しょうぼう、しょうか、ほのお", "消防、消火、炎"),
("いえき、しょうか、こうそ", "胃液、消化、酵素"),
@ -627,12 +610,12 @@ final class ConverterTests: XCTestCase {
var score: Double = 0
for (input, expect) in cases {
let converter = KanaKanjiConverter()
let converter = await KanaKanjiConverter()
var c = ComposingText()
c.insertAtCursorPosition(input, inputStyle: .direct)
var options = requestOptions()
options.requireJapanesePrediction = false
let results = converter.requestCandidates(c, options: options)
let results = await converter.requestCandidates(c, options: options)
if results.mainResults[0].text == expect {
score += 1
@ -646,9 +629,8 @@ final class ConverterTests: XCTestCase {
print("\(#function) Result: accuracy \(accuracy), score \(score), count \(cases.count)")
XCTAssertGreaterThan(accuracy, 0.7) // 0.7 < accuracy
}
}
#if os(macOS) || os(iOS) || os(watchOS) || os(tvOS) || os(visionOS)
#if os(macOS) || os(iOS) || os(watchOS) || os(tvOS) || os(visionOS)
func testMozcEvaluationData() async throws {
// URL
let urlString = "https://raw.githubusercontent.com/google/mozc/master/src/data/dictionary_oss/evaluation.tsv"
@ -729,7 +711,7 @@ final class ConverterTests: XCTestCase {
XCTAssertTrue(mozcScore < azooKeyScore)
}
}
#endif
#endif
enum MozcCommand: Equatable {
/// `arg`


@ -1,2 +1,6 @@
swift build -c release
swift build -c release -Xcxx -xobjective-c++
cp -f .build/release/CliTool /usr/local/bin/anco
# FIXME: Unfortunately, in order to use zenzai in anco, you will need to build CliTool with xcodebuild
# It is highly desirable to make it work only with `swift build`
# xcodebuild -scheme CliTool -destination "platform=macOS,name=Any Mac" -configuration Release