mirror of
https://github.com/mii443/AzooKeyKanaKanjiConverter.git
synced 2025-08-22 15:05:26 +00:00
cli: add evaluate command (#90)
This commit is contained in:
21
Docs/cli.md
21
Docs/cli.md
@ -28,6 +28,27 @@ your@pc Desktop % anco にほんごにゅうりょく --disable_prediction -n 10
|
||||
|
||||
`anco run`コマンドを利用して変換を行うことが出来ます。
|
||||
|
||||
## 評価API
|
||||
|
||||
`anco evaluate`コマンドを利用して変換器の評価を行うことが出来ます。
|
||||
|
||||
以下のようなフォーマットの`.tsv`ファイルを用意します。
|
||||
```tsv
|
||||
しかくとさんかく 四角と三角
|
||||
かんたんなさんすう 簡単な算数
|
||||
しけんにでないえいたんご 試験に出ない英単語
|
||||
しごととごらくとべんきょう 仕事と娯楽と勉強
|
||||
しかいをつとめる 司会を務める
|
||||
```
|
||||
|
||||
これを入力し、変換器を評価します。
|
||||
|
||||
```bash
|
||||
$ anco evaluate ./evaluation.tsv --config_n_best 1
|
||||
```
|
||||
|
||||
出力はJSONフォーマットです。出力内容の安定が必要な場合`--stable`を指定することで比較的安定した出力を得られます。ただしスコアやエントロピーは辞書バージョンに依存します。
|
||||
|
||||
## 辞書リーダ
|
||||
|
||||
`anco dict`コマンドを利用して辞書データを解析することが出来ます。
|
||||
|
@ -5,7 +5,7 @@ import ArgumentParser
|
||||
public struct Anco: ParsableCommand {
|
||||
public static var configuration = CommandConfiguration(
|
||||
abstract: "Anco is A(zooKey) Kana-Ka(n)ji (co)nverter",
|
||||
subcommands: [Subcommands.Run.self, Subcommands.Dict.self],
|
||||
subcommands: [Subcommands.Run.self, Subcommands.Dict.self, Subcommands.Evaluate.self],
|
||||
defaultSubcommand: Subcommands.Run.self
|
||||
)
|
||||
|
||||
|
186
Sources/CliTool/Subcommands/EvaluateCommand.swift
Normal file
186
Sources/CliTool/Subcommands/EvaluateCommand.swift
Normal file
@ -0,0 +1,186 @@
|
||||
import KanaKanjiConverterModuleWithDefaultDictionary
|
||||
import ArgumentParser
|
||||
import Foundation
|
||||
|
||||
extension Subcommands {
|
||||
/// `anco evaluate` subcommand.
///
/// Reads a TSV file of `query<TAB>answer1<TAB>answer2...` rows, requests conversion
/// candidates for every query, and emits the collected results as JSON, either to
/// the file given by `--output` or to standard output.
struct Evaluate: ParsableCommand {
    @Argument(help: "ひらがな\\t正解1\\t正解2\\t...形式のTSVファイルへのパス")
    var inputFile: String = ""

    @Option(name: [.customLong("output")], help: "Output file path.")
    var outputFilePath: String? = nil
    @Option(name: [.customLong("config_n_best")], help: "The parameter n (n best parameter) for internal viterbi search.")
    var configNBest: Int = 10
    @Flag(name: [.customLong("stable")], help: "Report only stable properties; timestamps and values will not be reported.")
    var stable: Bool = false

    static var configuration = CommandConfiguration(commandName: "evaluate", abstract: "Evaluate quality of Conversion for input data.")

    /// Parses `inputFile` into evaluation items.
    ///
    /// Empty lines and lines starting with `#` are skipped. Every other line must
    /// contain at least two tab-separated fields: the query followed by one or more
    /// accepted answers.
    ///
    /// - Returns: One `InputItem` per data line, in file order.
    /// - Throws: Any error raised while reading the file, or a `ValidationError`
    ///   naming the 1-based line number of the first malformed line.
    func parseInputFile() throws -> [InputItem] {
        let url = URL(fileURLWithPath: self.inputFile)
        // "\r\n" is a single `Character` in Swift, so splitting on "\n" alone would
        // never break CRLF-terminated files; `isNewline` covers LF, CRLF and CR.
        // Empty subsequences are kept so that `index` stays aligned with the file's lines.
        let lines = try String(contentsOf: url)
            .split(omittingEmptySubsequences: false, whereSeparator: \.isNewline)
        return try lines.enumerated().compactMap { (index, line) -> InputItem? in
            if line.isEmpty || line.hasPrefix("#") {
                return nil
            }
            let items = line.split(separator: "\t").map(String.init)
            guard items.count >= 2 else {
                // Report a recoverable error instead of crashing the process, using
                // the human-facing (1-based) line number.
                throw ValidationError("Failed to parse input file of line #\(index + 1) in \(url.absoluteString)")
            }
            return .init(query: items[0], answers: Array(items[1...]))
        }
    }

    /// Runs the evaluation and writes the JSON report.
    ///
    /// - Throws: Errors from reading/parsing the input file, JSON encoding, or
    ///   writing the output file.
    @MainActor mutating func run() throws {
        let inputItems = try parseInputFile()

        let converter = KanaKanjiConverter()
        // The options depend only on the command-line flags, so build them once.
        let options = requestOptions()
        let start = Date()
        var resultItems: [EvaluateItem] = []
        resultItems.reserveCapacity(inputItems.count)
        for item in inputItems {
            var composingText = ComposingText()
            composingText.insertAtCursorPosition(item.query, inputStyle: .direct)
            let result = converter.requestCandidates(composingText, options: options)
            // Keep only candidates whose concatenated ruby equals the katakana form
            // of the query (presumably: candidates covering the whole input; verify
            // against the converter's documentation).
            let katakanaQuery = item.query.toKatakana()
            let mainResults = result.mainResults.filter { candidate in
                candidate.data.reduce(into: "") { ruby, element in
                    ruby.append(contentsOf: element.ruby)
                } == katakanaQuery
            }
            resultItems.append(
                EvaluateItem(
                    query: item.query,
                    answers: item.answers,
                    outputs: mainResults.prefix(self.configNBest).map {
                        EvaluateItemOutput(text: $0.text, score: Double($0.value))
                    }
                )
            )
        }
        let end = Date()
        var result = EvaluateResult(n_best: self.configNBest, execution_time: end.timeIntervalSince(start), items: resultItems)
        if stable {
            // Zero out run-dependent fields and coarsen the numeric ones: entropy is
            // truncated to one decimal place, scores to whole numbers.
            result.execution_time = 0
            result.timestamp = 0
            result.items.mutatingForeach {
                $0.entropy = Double(Int($0.entropy * 10)) / 10
                $0.outputs.mutatingForeach {
                    $0.score = Double(Int($0.score))
                }
            }
        }
        let json = try JSONEncoder().encode(result)

        if let outputFilePath {
            try json.write(to: URL(fileURLWithPath: outputFilePath))
        } else {
            // JSONEncoder output is UTF-8, so the non-failing decoding initializer
            // replaces the force-unwrapped `String(data:encoding:)`.
            print(String(decoding: json, as: UTF8.self))
        }
    }

    /// Builds the conversion options used for every query.
    ///
    /// Prediction and learning are switched off, `N_best` is taken from
    /// `--config_n_best`, and `requestQuery` is set to `.完全一致`.
    func requestOptions() -> ConvertRequestOptions {
        var option: ConvertRequestOptions = .withDefaultDictionary(
            N_best: self.configNBest,
            requireJapanesePrediction: false,
            requireEnglishPrediction: false,
            keyboardLanguage: .ja_JP,
            typographyLetterCandidate: false,
            unicodeCandidate: true,
            englishCandidateInRoman2KanaInput: true,
            fullWidthRomanCandidate: false,
            halfWidthKanaCandidate: false,
            learningType: .nothing,
            maxMemoryCount: 0,
            shouldResetMemory: false,
            memoryDirectoryURL: URL(fileURLWithPath: ""),
            sharedContainerURL: URL(fileURLWithPath: ""),
            metadata: .init(versionString: "anco for debugging")
        )
        option.requestQuery = .完全一致
        return option
    }
}
|
||||
|
||||
/// One row of the evaluation TSV file.
struct InputItem {
    /// Input query (the first field of the row).
    var query: String

    /// Accepted answers, in priority order (the remaining fields of the row).
    var answers: [String]
}
|
||||
|
||||
/// Top-level report encoded as the JSON output of the evaluation.
struct EvaluateResult: Codable {
    /// Creates a report and derives `stat` from `items`.
    ///
    /// - Parameters:
    ///   - n_best: The N-best setting the evaluation ran with.
    ///   - timestamp: Seconds since 1970; defaults to the current time.
    ///   - execution_time: Elapsed time of the evaluation.
    ///   - items: Per-query results.
    internal init(n_best: Int, timestamp: TimeInterval = Date().timeIntervalSince1970, execution_time: TimeInterval, items: [Subcommands.EvaluateItem]) {
        self.n_best = n_best
        self.timestamp = timestamp
        self.execution_time = execution_time
        self.items = items

        // Histogram: how many queries ended up with each `max_rank`.
        let rankCounts = items.reduce(into: [Int: Int]()) { counts, item in
            counts[item.max_rank, default: 0] += 1
        }
        self.stat = EvaluateStat(query_count: items.count, ranks: rankCounts)
    }

    /// The `N_best` setting used for the queries.
    var n_best: Int

    /// Timestamp of the report (seconds since 1970).
    var timestamp: TimeInterval = Date().timeIntervalSince1970

    /// Elapsed execution time.
    var execution_time: TimeInterval

    /// Aggregate statistics.
    var stat: EvaluateStat

    /// Queries and their results.
    var items: [EvaluateItem]
}
|
||||
|
||||
/// Aggregate statistics over all evaluated queries.
struct EvaluateStat: Codable {
    /// Number of evaluated queries.
    var query_count: Int
    /// Number of queries per `max_rank` value (key: rank, value: query count).
    var ranks: [Int: Int]
}
|
||||
|
||||
/// Result for a single query: expected answers, converter outputs, and derived metrics.
struct EvaluateItem: Codable {
    /// Creates an item and derives `entropy` and `max_rank` from `outputs`.
    ///
    /// - Parameters:
    ///   - query: The input query.
    ///   - answers: Accepted answers for the query.
    ///   - outputs: Converter outputs in ranked order.
    init(query: String, answers: [String], outputs: [Subcommands.EvaluateItemOutput]) {
        self.query = query
        self.answers = answers
        self.outputs = outputs
        self.entropy = Self.softmaxEntropy(of: outputs.map(\.score))
        // Zero-based index of the first output that matches any answer; -1 if none does.
        self.max_rank = outputs.firstIndex { answers.contains($0.text) } ?? -1
    }

    /// Entropy (natural log) of the softmax distribution over `scores`.
    ///
    /// The scores are shifted by their maximum before exponentiation. Without the
    /// shift, very negative scores underflow to 0, the normalizer becomes 0, and
    /// the result is NaN, which `JSONEncoder` refuses to encode. Terms whose
    /// probability underflows to 0 are skipped, since `0 * log(0)` is NaN while
    /// their contribution to the entropy is 0 in the limit.
    ///
    /// - Returns: The entropy, or 0 when `scores` is empty or its maximum is not finite.
    private static func softmaxEntropy(of scores: [Double]) -> Double {
        guard let maxScore = scores.max(), maxScore.isFinite else {
            return 0
        }
        let weights = scores.map { exp($0 - maxScore) }
        // The maximum score contributes exp(0) == 1, so the total is never 0.
        let total = weights.reduce(0, +)
        let weightedLogSum = weights.reduce(into: 0.0) { sum, weight in
            let probability = weight / total
            if probability > 0 {
                sum += probability * log(probability)
            }
        }
        return -weightedLogSum
    }

    /// Input query.
    var query: String

    /// Accepted answers (unordered).
    var answers: [String]

    /// Converter outputs.
    var outputs: [EvaluateItemOutput]

    /// Entropy of the output distribution.
    var entropy: Double

    /// Best (lowest) rank among outputs judged correct; -1 means none was found.
    var max_rank: Int
}
|
||||
|
||||
/// A single converter output for a query.
struct EvaluateItemOutput: Codable {
    /// Converted text of the candidate.
    var text: String
    /// Score assigned to the candidate.
    var score: Double
}
|
||||
}
|
Reference in New Issue
Block a user