Files
AzooKeyKanaKanjiConverter/Sources/CliTool/Subcommands/EvaluateCommand.swift
Miwa 55ffe3c708 [Experimental] Zenzai (#92)
* experimental rinna integration

* Update impl

* update

* Bump swift-actions/setup-swift from 1 to 2

Bumps [swift-actions/setup-swift](https://github.com/swift-actions/setup-swift) from 1 to 2.
- [Release notes](https://github.com/swift-actions/setup-swift/releases)
- [Commits](https://github.com/swift-actions/setup-swift/compare/v1...v2)

---
updated-dependencies:
- dependency-name: swift-actions/setup-swift
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* change test

* change impl

* take gpt2 weight as option

* don't use async

* support status check

* enhance error

* avoid percent encode

* update

* GPT-2 based kana-kanji conversion is now perfectly working

* fix a bug

* Rename gpt2/llama -> zenz

* cleanup

* internal apiを綺麗にした

* cleanup experimental commands

* update

* partially support incremental input using cache

* fix names

* fix bug

* support roman2kana

* cleanup

* fix minor bugs

* improve logic

* fix minor bug

* fix minor bug

* fix minor bug

* optimize

* optimize performance

* Optimize cache hit

* cli: add anco session command

* fix cache hit bugs

* improve session commands

* maybe this will work better for incremental input environment

* speed up zenzai by using n_best alternatives

* update zenz context

* adding no_typo api

* add inference limit

* fix bug

* reset install_cli

* make package buildable -- but llama.cpp features just do not work at this point because metal is not preprocessed

* add proper availability checks

* change macOS minimum version

* fix several problems

* code cleanup

* enable ubuntu build

* fix build error

* fix ubuntu build

* fix borrowing

* update install_cli.sh

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-15 01:36:45 +09:00

197 lines
7.6 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import KanaKanjiConverterModuleWithDefaultDictionary
import ArgumentParser
import Foundation
extension Subcommands {
/// Evaluates conversion quality against a TSV file of queries and expected answers,
/// emitting a JSON report (to stdout or `--output`).
struct Evaluate: ParsableCommand {
    @Argument(help: "ひらがな\\t正解1\\t正解2\\t...形式のTSVファイルへのパス")
    var inputFile: String = ""
    @Option(name: [.customLong("output")], help: "Output file path.")
    var outputFilePath: String? = nil
    @Option(name: [.customLong("config_n_best")], help: "The parameter n (n best parameter) for internal viterbi search.")
    var configNBest: Int = 10
    @Flag(name: [.customLong("stable")], help: "Report only stable properties; timestamps and values will not be reported.")
    var stable: Bool = false
    @Option(name: [.customLong("zenz")], help: "gguf format model weight for zenz.")
    var zenzWeightPath: String = ""
    @Option(name: [.customLong("config_zenzai_inference_limit")], help: "inference limit for zenzai.")
    var configZenzaiInferenceLimit: Int = .max
    static var configuration = CommandConfiguration(commandName: "evaluate", abstract: "Evaluate quality of Conversion for input data.")

    /// Parses the input TSV file into evaluation items.
    /// Each non-empty, non-`#`-prefixed line must contain at least two tab-separated
    /// fields: the query followed by one or more accepted answers.
    /// - Throws: A `ValidationError` for malformed lines (previously this called
    ///   `fatalError`, which skipped the normal CLI error path), or any file-read error.
    private func parseInputFile() throws -> [InputItem] {
        let url = URL(fileURLWithPath: self.inputFile)
        let lines = (try String(contentsOf: url)).split(separator: "\n", omittingEmptySubsequences: false)
        return try lines.enumerated().compactMap { (index, line) -> InputItem? in
            // Skip blank lines and comment lines.
            if line.isEmpty || line.hasPrefix("#") {
                return nil
            }
            let items = line.split(separator: "\t").map(String.init)
            if items.count < 2 {
                // Surface a proper CLI error instead of crashing the process.
                throw ValidationError("Failed to parse input file of line #\(index) in \(url.absoluteString)")
            }
            return .init(query: items[0], answers: Array(items[1...]))
        }
    }

    /// Runs every query through the converter, keeping only candidates whose
    /// reading exactly matches the query, and prints/writes the JSON report.
    @MainActor mutating func run() throws {
        let inputItems = try parseInputFile()
        let requestOptions = requestOptions()
        let converter = KanaKanjiConverter()
        let start = Date()
        var resultItems: [EvaluateItem] = []
        resultItems.reserveCapacity(inputItems.count)
        for item in inputItems {
            var composingText = ComposingText()
            composingText.insertAtCursorPosition(item.query, inputStyle: .direct)
            let result = converter.requestCandidates(composingText, options: requestOptions)
            // Keep only candidates whose full ruby equals the query rendered as katakana,
            // i.e. candidates that consume the whole input exactly.
            let mainResults = result.mainResults.filter {
                $0.data.reduce(into: "", { $0.append(contentsOf: $1.ruby) }) == item.query.toKatakana()
            }
            resultItems.append(
                EvaluateItem(
                    query: item.query,
                    answers: item.answers,
                    outputs: mainResults.prefix(self.configNBest).map {
                        EvaluateItemOutput(text: $0.text, score: Double($0.value))
                    }
                )
            )
            // Explicitly reset converter state between queries so runs are independent.
            converter.stopComposition()
        }
        let end = Date()
        var result = EvaluateResult(n_best: self.configNBest, execution_time: end.timeIntervalSince(start), items: resultItems)
        if stable {
            // Zero out / coarsen everything non-deterministic so outputs are diffable.
            result.execution_time = 0
            result.timestamp = 0
            result.items.mutatingForeach {
                $0.entropy = Double(Int($0.entropy * 10)) / 10
                $0.outputs.mutatingForeach {
                    $0.score = Double(Int($0.score))
                }
            }
        }
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        let json = try encoder.encode(result)
        if let outputFilePath {
            try json.write(to: URL(fileURLWithPath: outputFilePath))
        } else {
            let string = String(data: json, encoding: .utf8)!
            print(string)
        }
    }

    /// Builds the conversion options: learning disabled, predictions off, and
    /// zenzai enabled iff a model weight path was supplied.
    func requestOptions() -> ConvertRequestOptions {
        var option: ConvertRequestOptions = .withDefaultDictionary(
            N_best: self.configNBest,
            requireJapanesePrediction: false,
            requireEnglishPrediction: false,
            keyboardLanguage: .ja_JP,
            typographyLetterCandidate: false,
            unicodeCandidate: true,
            englishCandidateInRoman2KanaInput: true,
            fullWidthRomanCandidate: false,
            halfWidthKanaCandidate: false,
            learningType: .nothing,
            maxMemoryCount: 0,
            shouldResetMemory: false,
            memoryDirectoryURL: URL(fileURLWithPath: ""),
            sharedContainerURL: URL(fileURLWithPath: ""),
            // Use fileURLWithPath: URL(string:)! fails (crashes) for plain filesystem
            // paths containing spaces or other non-URL characters.
            zenzaiMode: self.zenzWeightPath.isEmpty ? .off : .on(weight: URL(fileURLWithPath: self.zenzWeightPath), inferenceLimit: self.configZenzaiInferenceLimit),
            metadata: .init(versionString: "anco for debugging")
        )
        // NOTE(review): the scraped source was truncated here ("= ."); restoring the
        // exact-match query mode (Japanese-named enum case stripped by the scraper)
        // — TODO confirm against upstream.
        option.requestQuery = .完全一致
        return option
    }
}
/// One parsed line of the input TSV file.
private struct InputItem {
/// The hiragana query to convert (first TSV field).
var query: String
/// The accepted conversion answers (remaining TSV fields).
var answers: [String]
}
/// The aggregated result of one evaluation run, serialized to JSON.
struct EvaluateResult: Codable {
    internal init(n_best: Int, timestamp: TimeInterval = Date().timeIntervalSince1970, execution_time: TimeInterval, items: [Subcommands.EvaluateItem]) {
        self.n_best = n_best
        self.timestamp = timestamp
        self.execution_time = execution_time
        self.items = items
        // Histogram: max_rank value -> how many queries achieved that rank.
        let rankCounts = items.reduce(into: [Int: Int]()) { counts, item in
            counts[item.max_rank, default: 0] += 1
        }
        self.stat = EvaluateStat(query_count: items.count, ranks: rankCounts)
    }
    /// The `N_best` parameter used for the run.
    var n_best: Int
    /// Unix timestamp of the run.
    var timestamp = Date().timeIntervalSince1970
    /// Wall-clock duration of the run in seconds.
    var execution_time: TimeInterval
    /// Rank statistics aggregated over all items.
    var stat: EvaluateStat
    /// Per-query evaluation results.
    var items: [EvaluateItem]
}
/// Aggregate rank statistics for an evaluation run.
struct EvaluateStat: Codable {
// Total number of evaluated queries.
var query_count: Int
// Maps a max_rank value (-1 = no answer found) to the number of queries with that rank.
var ranks: [Int: Int]
}
/// The evaluation result for a single query.
struct EvaluateItem: Codable {
    init(query: String, answers: [String], outputs: [Subcommands.EvaluateItemOutput]) {
        self.query = query
        self.answers = answers
        self.outputs = outputs
        // Entropy of the softmax distribution over the candidate scores.
        // Guard the empty case: the original computed 0/0 here, producing NaN,
        // which makes JSONEncoder throw (.invalidValue) and Int(NaN) trap in
        // --stable mode. An empty candidate list carries zero entropy.
        if outputs.isEmpty {
            self.entropy = 0
        } else {
            // Subtract the mean score before exponentiating for numerical stability.
            let mean = outputs.reduce(into: 0) { $0 += $1.score } / Double(outputs.count)
            let expValues = outputs.map { exp($0.score - mean) }
            let sumOfExpValues = expValues.reduce(into: 0, +=)
            // Reuse expValues instead of recomputing exp for each probability.
            let probs = expValues.map { $0 / sumOfExpValues }
            self.entropy = -probs.reduce(into: 0) { $0 += $1 * log($1) }
        }
        // 0-based rank of the first candidate matching any answer; -1 when none match.
        self.max_rank = outputs.firstIndex {
            answers.contains($0.text)
        } ?? -1
    }
    /// The evaluated query.
    var query: String
    /// The accepted answers for the query.
    var answers: [String]
    /// The top-N conversion candidates produced for the query.
    var outputs: [EvaluateItemOutput]
    /// Softmax entropy of the candidate scores (0 when there are no candidates).
    var entropy: Double
    /// Rank of the first correct candidate, or -1 if no candidate was correct.
    var max_rank: Int
}
/// A single conversion candidate and its model score.
struct EvaluateItemOutput: Codable {
// The candidate's surface text.
var text: String
// The candidate's score (higher is better; coarsened to an integer in --stable mode).
var score: Double
}
}