[Tools] 辞書データ解析用のサブコマンドをancoに追加 (#85)

* base value should be updated here

* feature: implement cli tool for reading louds data

* Update document
This commit is contained in:
Miwa / Ensan
2024-04-29 19:26:28 +09:00
committed by GitHub
parent f17958af47
commit 4f2750ef3e
8 changed files with 195 additions and 8 deletions

View File

@ -4,13 +4,13 @@
`anco`を利用するには、最初にinstallが必要です。 `anco`を利用するには、最初にinstallが必要です。
``` ```bash
sh install_cli.sh sudo sh install_cli.sh
``` ```
例えば以下のように利用できます。 例えば以下のように利用できます。
``` ```bash
your@pc Desktop % anco にほんごにゅうりょく --disable_prediction -n 10 your@pc Desktop % anco にほんごにゅうりょく --disable_prediction -n 10
日本語入力 日本語入力
にほんご入力 にほんご入力
@ -23,3 +23,40 @@ your@pc Desktop % anco にほんごにゅうりょく --disable_prediction -n 10
にほんご にほんご
2本後 2本後
``` ```
## 変換API
`anco run`コマンドを利用して変換を行うことが出来ます。
## 辞書リーダ
`anco dict`コマンドを利用して辞書データを解析することが出来ます。
```bash
your@pc Desktop % anco dict read ア -d ./Sources/KanaKanjiConverterModuleWithDefaultDictionary/azooKey_dictionary_storage/Dictionary/
=== Summary for target ア ===
- directory: ./Sources/KanaKanjiConverterModuleWithDefaultDictionary/azooKey_dictionary_storage/Dictionary/
- target: ア
- memory?: false
- count of entry: 24189
- time for execute: 0.0378040075302124
```
`--ruby`および`--word`オプションを利用して、正規表現でフィルターをかけることが出来ます。
```bash
your@pc Desktop % anco dict read ア -d ./Sources/KanaKanjiConverterModuleWithDefaultDictionary/azooKey_dictionary_storage/Dictionary/ --word ".*全"
=== Summary for target ア ===
- directory: ./Sources/KanaKanjiConverterModuleWithDefaultDictionary/azooKey_dictionary_storage/Dictionary/
- target: ア
- memory?: false
- count of entry: 24189
- time for execute: 0.07062792778015137
=== Found Entries ===
- count of found entry: 3
Ruby: アキラ Word: 全 Value: -11.7107 CID: (1291, 1291) MID: 424
Ruby: アンゼン Word: 安全 Value: -7.241 CID: (1287, 1287) MID: 169
Ruby: アンシンアンゼン Word: 安心安全 Value: -11.7638 CID: (1283, 1287) MID: 17
```
`--sort`オプションを使うとエントリーの並び替えが可能です。`value`、`ruby`、`word`のいずれかを指定でき、デフォルトは`ruby`です。

View File

@ -5,7 +5,7 @@ import ArgumentParser
public struct Anco: ParsableCommand { public struct Anco: ParsableCommand {
public static var configuration = CommandConfiguration( public static var configuration = CommandConfiguration(
abstract: "Anco is A(zooKey) Kana-Ka(n)ji (co)nverter", abstract: "Anco is A(zooKey) Kana-Ka(n)ji (co)nverter",
subcommands: [Subcommands.Run.self], subcommands: [Subcommands.Run.self, Subcommands.Dict.self],
defaultSubcommand: Subcommands.Run.self defaultSubcommand: Subcommands.Run.self
) )

View File

@ -0,0 +1,14 @@
//
// DefaultStringInterpolation+CommandLineUtils.swift
//
//
// Created by miwa on 2024/04/29.
//
import Foundation
extension DefaultStringInterpolation {
    /// Supports `"\(bold: text)"`: emits `text` wrapped in the ANSI escape
    /// sequences `ESC[1m` (before) and `ESC[m` (after).
    /// - Parameter value: The text to wrap.
    mutating func appendInterpolation(bold value: String) {
        let boldOn = "\u{1B}[1m"
        let reset = "\u{1B}[m"
        appendLiteral(boldOn)
        appendLiteral(value)
        appendLiteral(reset)
    }
}

View File

@ -0,0 +1,13 @@
import Foundation
import KanaKanjiConverterModuleWithDefaultDictionary
import ArgumentParser
extension Subcommands {
    /// The `dict` command. It declares no `run()` of its own here; it only
    /// registers `Read` as its subcommand.
    struct Dict: ParsableCommand {
        static var configuration: CommandConfiguration = .init(
            commandName: "dict",
            abstract: "Show dict information",
            subcommands: [Read.self]
        )
    }
}

View File

@ -0,0 +1,123 @@
import Foundation
import KanaKanjiConverterModule
import ArgumentParser
extension Subcommands.Dict {
    /// The `read` subcommand: loads the LOUDS data identified by `target`
    /// from `dictionaryDirectory`, prints a summary, and, when a ruby and/or
    /// word regex filter is given, lists the matching entries in sorted order.
    struct Read: ParsableCommand {
        /// Key used to order the entries listed under "Found Entries".
        enum SortOrder: String, Codable, ExpressibleByArgument {
            case value
            case ruby
            case word

            init?(argument: String) {
                self.init(rawValue: argument)
            }
        }

        @Argument(help: "辞書データのfilename")
        var target: String = ""

        @Option(name: [.customLong("dictionary_dir"), .customShort("d")], help: "The directory for dictionary data.")
        var dictionaryDirectory: String = "./"

        @Option(name: [.customLong("ruby")], help: "Regex for entry ruby filter")
        var rubyFilter: String = ""

        @Option(name: [.customLong("word")], help: "Regex for entry word filter")
        var wordFilter: String = ""

        @Option(name: [.customLong("sort")], help: "Sort order")
        var sortOrder: SortOrder = .ruby

        static var configuration = CommandConfiguration(
            commandName: "read",
            abstract: "Read dictionary data and extract informations"
        )

        /// Loads the target dictionary, prints the summary and, if any filter
        /// is set, the filtered and sorted entries.
        /// - Throws: Any error raised while compiling `rubyFilter` or
        ///   `wordFilter` into a `Regex`.
        @MainActor mutating func run() throws {
            guard #available(macOS 13, *) else {
                // The body below uses Swift `Regex`, which needs macOS 13+.
                // Tell the user why nothing happens instead of exiting silently.
                print("anco dict read requires macOS 13 or later")
                return
            }
            let start = Date()
            let isMemory = self.target == "memory"
            // The same options are used for loading the LOUDS data and for the store.
            let options = self.requestOptions()
            guard let louds = LOUDS.load(self.target, option: options) else {
                print(
                    """
                    \(bold: "=== Summary for target \(self.target) ===")
                    - directory: \(self.dictionaryDirectory)
                    - target: \(self.target)
                    - memory?: \(isMemory)
                    - result: LOUDS data was not found
                    - time for execute: \(Date().timeIntervalSince(start))
                    """
                )
                return
            }
            // Empty prefix with unbounded depth: presumably selects every node
            // of the LOUDS data, i.e. all entries of this file. TODO confirm.
            let nodeIndices = louds.prefixNodeIndices(chars: [], maxDepth: .max)
            let store = DicdataStore(convertRequestOptions: options)
            let result = store.getDicdataFromLoudstxt3(identifier: self.target, indices: nodeIndices)

            var filteredResult = result
            var hasFilter = false
            if !rubyFilter.isEmpty {
                let filter = try Regex(rubyFilter)
                hasFilter = true
                // The pattern must match the whole ruby, not just a substring.
                filteredResult = filteredResult.filter {
                    $0.ruby.wholeMatch(of: filter) != nil
                }
            }
            if !wordFilter.isEmpty {
                let filter = try Regex(wordFilter)
                hasFilter = true
                // The pattern must match the whole word, not just a substring.
                filteredResult = filteredResult.filter {
                    $0.word.wholeMatch(of: filter) != nil
                }
            }

            print(
                """
                \(bold: "=== Summary for target \(self.target) ===")
                - directory: \(self.dictionaryDirectory)
                - target: \(self.target)
                - memory?: \(isMemory)
                - count of entry: \(result.count)
                - time for execute: \(Date().timeIntervalSince(start))
                """
            )

            // Entries are listed only when at least one filter was supplied.
            if hasFilter {
                let sortFunction: (DicdataElement, DicdataElement) -> Bool = switch self.sortOrder {
                // Shorter ruby first, ties broken lexicographically. Comparing
                // (count, ruby) tuples is a strict weak ordering, which
                // `sorted(by:)` requires; OR-ing the two comparisons was not.
                case .ruby: { ($0.ruby.count, $0.ruby) < ($1.ruby.count, $1.ruby) }
                case .value: { $0.value() < $1.value() }
                case .word: { $0.word < $1.word }
                }
                print("\(bold: "=== Found Entries ===")")
                print("- count of found entry: \(filteredResult.count)")
                for entry in filteredResult.sorted(by: sortFunction) {
                    print("\(bold: "Ruby:") \(entry.ruby) \(bold: "Word:") \(entry.word) \(bold: "Value:") \(entry.value()) \(bold: "CID:") \((entry.lcid, entry.rcid)) \(bold: "MID:") \(entry.mid)")
                }
            }
        }

        /// Builds the conversion options used to read the dictionary:
        /// predictions are not requested, learning is `.nothing`, and the
        /// dictionary, memory and shared-container URLs all point at
        /// `dictionaryDirectory`.
        func requestOptions() -> ConvertRequestOptions {
            .init(
                N_best: 0,
                requireJapanesePrediction: false,
                requireEnglishPrediction: false,
                keyboardLanguage: .ja_JP,
                typographyLetterCandidate: false,
                unicodeCandidate: true,
                englishCandidateInRoman2KanaInput: true,
                fullWidthRomanCandidate: false,
                halfWidthKanaCandidate: false,
                learningType: .nothing,
                maxMemoryCount: 0,
                dictionaryResourceURL: URL(fileURLWithPath: self.dictionaryDirectory),
                memoryDirectoryURL: URL(fileURLWithPath: self.dictionaryDirectory),
                sharedContainerURL: URL(fileURLWithPath: self.dictionaryDirectory),
                metadata: .init(appVersionString: "anco")
            )
        }
    }
}

View File

@ -189,7 +189,7 @@ public final class DicdataStore {
return louds.prefixNodeIndices(chars: charIDs, maxDepth: depth) return louds.prefixNodeIndices(chars: charIDs, maxDepth: depth)
} }
func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>) -> [DicdataElement] { package func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>) -> [DicdataElement] {
debug("getDicdataFromLoudstxt3", identifier, indices) debug("getDicdataFromLoudstxt3", identifier, indices)
// split = 2048 // split = 2048
let dict = [Int: [Int]].init(grouping: indices, by: {$0 >> 11}) let dict = [Int: [Int]].init(grouping: indices, by: {$0 >> 11})

View File

@ -9,7 +9,7 @@
import Foundation import Foundation
/// LOUDS /// LOUDS
struct LOUDS: Sendable { package struct LOUDS: Sendable {
private typealias Unit = UInt64 private typealias Unit = UInt64
private static let unit = 64 private static let unit = 64
private static let uExp = 6 private static let uExp = 6
@ -182,7 +182,7 @@ struct LOUDS: Sendable {
/// - Parameter chars: CharID /// - Parameter chars: CharID
/// - Parameter maxDepth: /// - Parameter maxDepth:
/// - Returns: loudstxt3 /// - Returns: loudstxt3
@inlinable func prefixNodeIndices(chars: [UInt8], maxDepth: Int) -> [Int] { @inlinable package func prefixNodeIndices(chars: [UInt8], maxDepth: Int) -> [Int] {
guard let nodeIndex = self.searchNodeIndex(chars: chars) else { guard let nodeIndex = self.searchNodeIndex(chars: chars) else {
return [] return []
} }

View File

@ -54,7 +54,7 @@ extension LOUDS {
/// LOUDS /// LOUDS
/// - Parameter identifier: /// - Parameter identifier:
/// - Returns: LOUDS`nil` /// - Returns: LOUDS`nil`
static func load(_ identifier: String, option: ConvertRequestOptions) -> LOUDS? { package static func load(_ identifier: String, option: ConvertRequestOptions) -> LOUDS? {
let (charsURL, loudsURL) = getLOUDSURL(identifier, option: option) let (charsURL, loudsURL) = getLOUDSURL(identifier, option: option)
let nodeIndex2ID: [UInt8] let nodeIndex2ID: [UInt8]
do { do {