Files
AzooKeyKanaKanjiConverter/Sources/CliTool/Subcommands/EvaluateCommand.swift
Miwa / Ensan c4aa3eee76 [cli] 複数の不具合を修正 (#91)
* stabilize json output

* enable entropy calculation for smaller values

* chmod +x
2024-05-09 00:47:07 +09:00

189 lines
7.1 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import KanaKanjiConverterModuleWithDefaultDictionary
import ArgumentParser
import Foundation
extension Subcommands {
/// `evaluate` subcommand: converts every query of a TSV file with
/// `KanaKanjiConverter` and reports the candidates, their scores and rank
/// statistics as JSON (to a file or to standard output).
struct Evaluate: ParsableCommand {
    /// Path to the TSV input; each data line is `query<TAB>answer1<TAB>answer2...`.
    @Argument(help: "ひらがな\\t正解1\\t正解2\\t...形式のTSVファイルへのパス")
    var inputFile: String = ""

    /// Destination of the JSON report; when `nil` the report is printed.
    @Option(name: [.customLong("output")], help: "Output file path.")
    var outputFilePath: String? = nil

    /// N-best parameter passed to the converter; also the maximum number of
    /// candidates recorded per query.
    @Option(name: [.customLong("config_n_best")], help: "The parameter n (n best parameter) for internal viterbi search.")
    var configNBest: Int = 10

    /// When set, volatile values are normalized before output: execution time
    /// and timestamp become 0, entropy is truncated to one decimal place and
    /// scores are truncated to integers.
    @Flag(name: [.customLong("stable")], help: "Report only stable properties; timestamps and values will not be reported.")
    var stable: Bool = false

    static var configuration = CommandConfiguration(commandName: "evaluate", abstract: "Evaluate quality of Conversion for input data.")

    /// Error thrown when a data line of the input file has no answer column.
    struct MalformedLineError: LocalizedError {
        /// 1-based number of the offending line.
        var lineNumber: Int
        /// Absolute URL string of the input file.
        var location: String

        var errorDescription: String? {
            "Failed to parse input file of line #\(lineNumber) in \(location)"
        }
    }

    /// Reads `inputFile` and turns every data line into an `InputItem`.
    ///
    /// Empty lines and lines starting with `#` are skipped. Both LF and CRLF
    /// line endings are accepted.
    ///
    /// - Returns: One item per data line, in file order.
    /// - Throws: Any error raised while reading the file, or
    ///   `MalformedLineError` when a data line has fewer than two
    ///   tab-separated fields.
    func parseInputFile() throws -> [InputItem] {
        let url = URL(fileURLWithPath: self.inputFile)
        let contents = try String(contentsOf: url)
        // "\r\n" is a single `Character` in Swift, so it has to be matched
        // explicitly; splitting on "\n" alone never splits a CRLF file.
        let lines = contents.split(omittingEmptySubsequences: false) { $0 == "\n" || $0 == "\r\n" }
        var items: [InputItem] = []
        for (index, line) in lines.enumerated() {
            if line.isEmpty || line.hasPrefix("#") {
                continue
            }
            let fields = line.split(separator: "\t").map(String.init)
            guard fields.count >= 2 else {
                // Empty lines are kept by the split above, so `index + 1` is
                // the real line number in the file.
                throw MalformedLineError(lineNumber: index + 1, location: url.absoluteString)
            }
            items.append(InputItem(query: fields[0], answers: Array(fields[1...])))
        }
        return items
    }

    /// Converts every input query, collects the results and emits the JSON report.
    ///
    /// - Throws: Errors from reading the input, encoding the report, or
    ///   writing the output file.
    @MainActor mutating func run() throws {
        let inputItems = try parseInputFile()
        let converter = KanaKanjiConverter()
        // The options depend only on the command-line arguments, so build them once.
        let options = requestOptions()
        let start = Date()
        var resultItems: [EvaluateItem] = []
        for item in inputItems {
            var composingText = ComposingText()
            composingText.insertAtCursorPosition(item.query, inputStyle: .direct)
            let result = converter.requestCandidates(composingText, options: options)
            // Keep only candidates whose concatenated ruby equals the katakana
            // form of the query.
            let katakanaQuery = item.query.toKatakana()
            let mainResults = result.mainResults.filter { candidate in
                candidate.data.reduce(into: "") { $0.append(contentsOf: $1.ruby) } == katakanaQuery
            }
            resultItems.append(
                EvaluateItem(
                    query: item.query,
                    answers: item.answers,
                    outputs: mainResults.prefix(self.configNBest).map {
                        EvaluateItemOutput(text: $0.text, score: Double($0.value))
                    }
                )
            )
        }
        let end = Date()
        var result = EvaluateResult(n_best: self.configNBest, execution_time: end.timeIntervalSince(start), items: resultItems)
        if stable {
            // Remove or coarsen every value that may differ between runs.
            result.execution_time = 0
            result.timestamp = 0
            result.items.mutatingForeach {
                $0.entropy = Double(Int($0.entropy * 10)) / 10
                $0.outputs.mutatingForeach {
                    $0.score = Double(Int($0.score))
                }
            }
        }
        let encoder = JSONEncoder()
        // Sorted keys keep the key order of the report deterministic.
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        let json = try encoder.encode(result)
        if let outputFilePath {
            try json.write(to: URL(fileURLWithPath: outputFilePath))
        } else {
            // Non-failing UTF-8 decoding; no force unwrap needed.
            print(String(decoding: json, as: UTF8.self))
        }
    }

    /// Builds the conversion options used for every query: the default
    /// dictionary, `configNBest` as N-best, predictions and learning disabled.
    func requestOptions() -> ConvertRequestOptions {
        var option: ConvertRequestOptions = .withDefaultDictionary(
            N_best: self.configNBest,
            requireJapanesePrediction: false,
            requireEnglishPrediction: false,
            keyboardLanguage: .ja_JP,
            typographyLetterCandidate: false,
            unicodeCandidate: true,
            englishCandidateInRoman2KanaInput: true,
            fullWidthRomanCandidate: false,
            halfWidthKanaCandidate: false,
            learningType: .nothing,
            maxMemoryCount: 0,
            shouldResetMemory: false,
            memoryDirectoryURL: URL(fileURLWithPath: ""),
            sharedContainerURL: URL(fileURLWithPath: ""),
            metadata: .init(versionString: "anco for debugging")
        )
        // NOTE(review): the enum case name was missing in the reviewed text
        // (`option.requestQuery = .`). Restored as the exact-match query;
        // confirm against `ConvertRequestOptions.RequestQuery`.
        option.requestQuery = .完全一致
        return option
    }
}
/// One data line of the evaluation input file.
struct InputItem {
    /// First tab-separated field of the line: the text given to the converter.
    var query: String

    /// Remaining tab-separated fields: the texts accepted as correct results.
    var answers: [String]
}
/// Top-level JSON report produced by the `evaluate` subcommand.
struct EvaluateResult: Codable {
    /// Creates a report and derives `stat` from `items`.
    ///
    /// - Parameters:
    ///   - n_best: N-best parameter the evaluation was run with.
    ///   - timestamp: Seconds since 1970 identifying the run; defaults to the current time.
    ///   - execution_time: Elapsed time of the conversion loop, in seconds.
    ///   - items: Per-query evaluation results.
    init(n_best: Int, timestamp: TimeInterval = Date().timeIntervalSince1970, execution_time: TimeInterval, items: [Subcommands.EvaluateItem]) {
        self.n_best = n_best
        self.timestamp = timestamp
        self.execution_time = execution_time
        self.items = items
        // Histogram: `max_rank` value -> number of queries with that rank.
        self.stat = EvaluateStat(
            query_count: items.count,
            ranks: items.reduce(into: [:]) { histogram, item in
                histogram[item.max_rank, default: 0] += 1
            }
        )
    }

    /// `N_Best` parameter used for the run.
    var n_best: Int

    /// Time of the run, in seconds since 1970.
    var timestamp: TimeInterval

    /// Elapsed time of the conversion loop, in seconds.
    var execution_time: TimeInterval

    /// Aggregate statistics computed from `items` at initialization.
    var stat: EvaluateStat

    /// Per-query evaluation results.
    var items: [EvaluateItem]
}
/// Aggregate statistics over all evaluated queries.
struct EvaluateStat: Codable {
    /// Number of evaluated queries.
    var query_count: Int

    /// Histogram mapping a rank value to the number of queries with that rank.
    var ranks: [Int: Int]
}
/// Evaluation result for a single query.
struct EvaluateItem: Codable {
    /// Stores the inputs and derives `entropy` and `max_rank` from them.
    ///
    /// - Parameters:
    ///   - query: Text that was converted.
    ///   - answers: Texts accepted as correct conversions.
    ///   - outputs: Candidates returned for the query, best first.
    init(query: String, answers: [String], outputs: [Subcommands.EvaluateItemOutput]) {
        self.query = query
        self.answers = answers
        self.outputs = outputs
        self.entropy = Self.entropy(ofScores: outputs.map(\.score))
        self.max_rank = outputs.firstIndex { answers.contains($0.text) } ?? -1
    }

    /// Shannon entropy (natural log) of the softmax distribution over `scores`.
    ///
    /// Scores are shifted by their maximum before exponentiation, so every
    /// exponent is `<= 0` and `exp` cannot overflow. The softmax is invariant
    /// to the shift, so the result is mathematically unchanged. Probabilities
    /// that underflow to zero contribute nothing (the limit of `p * log(p)`),
    /// instead of producing NaN from `0 * log(0)`.
    ///
    /// - Returns: The entropy; `0` when `scores` is empty.
    private static func entropy(ofScores scores: [Double]) -> Double {
        guard let maxScore = scores.max() else {
            return 0
        }
        let weights = scores.map { exp($0 - maxScore) }
        let total = weights.reduce(0, +)
        let sumOfPLogP = weights.reduce(into: 0.0) { sum, weight in
            let probability = weight / total
            if probability > 0 {
                sum += probability * log(probability)
            }
        }
        // `0 - x` rather than `-x` so a zero entropy is `0`, not `-0`.
        return 0 - sumOfPLogP
    }

    /// Text that was converted.
    var query: String

    /// Texts accepted as correct conversions.
    var answers: [String]

    /// Candidates returned for the query, best first.
    var outputs: [EvaluateItemOutput]

    /// Entropy of the softmax distribution over the output scores.
    var entropy: Double

    /// Index in `outputs` of the first candidate contained in `answers`;
    /// -1 when no candidate matches.
    var max_rank: Int
}
/// One conversion candidate recorded for a query.
struct EvaluateItemOutput: Codable {
    /// Converted text of the candidate.
    var text: String

    /// Score reported by the converter for this candidate.
    var score: Double
}
}