mirror of
https://github.com/mii443/AzooKeyKanaKanjiConverter.git
synced 2025-08-22 15:05:26 +00:00
[Experimental] ConvertGraphを実装し、その上での完全一致変換を実装 (#47)
* ConvertGraphを実装し、その上での完全一致変換を実装 * 名前空間を汚染していたので修正 * Implementation completed (without test) * move directory to use default dictionary * fix implementations to enable conversion * add test cases * Backward searchで発見された候補を明示的に削除 * fix tests * simplify
This commit is contained in:
@ -73,6 +73,6 @@ public struct DicdataElement: Equatable, Hashable, Sendable {
|
||||
|
||||
extension DicdataElement: CustomDebugStringConvertible {
    /// One-line debug summary of the entry: ruby, word, the (lcid, rcid) pair, mid,
    /// and the value shown as `baseValue+adjust=value()`.
    public var debugDescription: String {
        // The literal ends with ")" so that it balances the "(" that opens it.
        "(ruby: \(self.ruby), word: \(self.word), cid: (\(self.lcid), \(self.rcid)), mid: \(self.mid), value: \(self.baseValue)+\(self.adjust)=\(self.value()))"
    }
}
|
||||
|
@ -102,6 +102,10 @@ public final class DicdataStore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Looks up the ID registered for `character` in `charsID`.
/// - Parameter character: The character to translate.
/// - Returns: The registered ID, or `UInt8.max` when `charsID` has no entry for it.
func character2charId(_ character: Character) -> UInt8 {
    guard let registeredId = self.charsID[character] else {
        return .max
    }
    return registeredId
}
|
||||
|
||||
private func reloadMemory() {
|
||||
self.loudses.removeValue(forKey: "memory")
|
||||
self.importedLoudses.remove("memory")
|
||||
@ -143,7 +147,7 @@ public final class DicdataStore {
|
||||
return Self.getPenalty(data: data) < -d
|
||||
}
|
||||
|
||||
private func loadLOUDS(identifier: String) -> LOUDS? {
|
||||
func loadLOUDS(identifier: String) -> LOUDS? {
|
||||
if importedLoudses.contains(identifier) {
|
||||
return self.loudses[identifier]
|
||||
}
|
||||
@ -213,7 +217,7 @@ public final class DicdataStore {
|
||||
var stringToInfo = inputData.getRangesWithTypos(fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight)
|
||||
|
||||
// MARK: 検索対象を列挙していく。
|
||||
let stringSet = stringToInfo.keys.map {($0, $0.map {self.charsID[$0, default: .max]})}
|
||||
let stringSet = stringToInfo.keys.map {($0, $0.map(self.character2charId))}
|
||||
let (minCharIDsCount, maxCharIDsCount) = stringSet.lazy.map {$0.1.count}.minAndMax() ?? (0, -1)
|
||||
// 先頭の文字: そこで検索したい文字列の集合
|
||||
let group = [Character: [([Character], [UInt8])]].init(grouping: stringSet, by: {$0.0.first!})
|
||||
@ -318,7 +322,7 @@ public final class DicdataStore {
|
||||
// MARK: 検索によって得たindicesから辞書データを実際に取り出していく
|
||||
// 先頭の文字: そこで検索したい文字列の集合
|
||||
let strings = string2penalty.keys.map {
|
||||
(key: $0, charIDs: $0.map {self.charsID[$0, default: .max]})
|
||||
(key: $0, charIDs: $0.map(self.character2charId))
|
||||
}
|
||||
let group = [Character: [(key: [Character], charIDs: [UInt8])]].init(grouping: strings, by: {$0.key.first!})
|
||||
|
||||
@ -433,7 +437,7 @@ public final class DicdataStore {
|
||||
} else if count == 2 {
|
||||
var result: [DicdataElement] = []
|
||||
let first = String(key.first!)
|
||||
let charIDs = key.map {self.charsID[$0, default: .max]}
|
||||
let charIDs = key.map(self.character2charId)
|
||||
// 最大700件に絞ることによって低速化を回避する。
|
||||
let prefixIndices = self.prefixMatchLOUDS(identifier: first, charIDs: charIDs, depth: 5).prefix(700)
|
||||
result.append(
|
||||
@ -451,7 +455,7 @@ public final class DicdataStore {
|
||||
} else {
|
||||
var result: [DicdataElement] = []
|
||||
let first = String(key.first!)
|
||||
let charIDs = key.map {self.charsID[$0, default: .max]}
|
||||
let charIDs = key.map(self.character2charId)
|
||||
// 最大700件に絞ることによって低速化を回避する。
|
||||
let prefixIndices = self.prefixMatchLOUDS(identifier: first, charIDs: charIDs).prefix(700)
|
||||
result.append(
|
||||
|
@ -127,24 +127,24 @@ extension LOUDS {
|
||||
}
|
||||
|
||||
/// Variant that keeps every result paired with the index it was read for.
///
/// Reads the loudstxt3 file for `identifier`. The file starts with a `UInt16` entry count
/// followed by that many `UInt32` start offsets; each requested entry is the byte range
/// from its own offset up to the next offset (or the end of the file for the last entry).
/// - Parameters:
///   - identifier: Passed to `getLoudstxt3URL` to locate the file.
///   - indices: Entry indices inside this file.
///   - option: Request options forwarded to `getLoudstxt3URL`.
/// - Returns: `(loudsNodeIndex, dicdata)` pairs in the order of `indices`. Empty when the file
///   cannot be read or its header is truncated; indices outside the offset table are skipped.
static func getDataForLoudstxt3(_ identifier: String, indices: [Int], option: ConvertRequestOptions) -> [(loudsNodeIndex: Int, dicdata: [DicdataElement])] {
    let binary: Data
    do {
        let url = getLoudstxt3URL(identifier, option: option)
        binary = try Data(contentsOf: url)
    } catch {
        debug("getDataForLoudstxt3: \(error)")
        return []
    }

    // The entry count occupies the first two bytes; refuse anything shorter instead of trapping.
    guard binary.count >= 2 else {
        debug("getDataForLoudstxt3: file for \(identifier) is too short to contain a header")
        return []
    }
    let lc = binary[0..<2].toArray(of: UInt16.self)[0]
    let header_endIndex: UInt32 = 2 + UInt32(lc) * UInt32(MemoryLayout<UInt32>.size)
    guard Int(header_endIndex) <= binary.count else {
        debug("getDataForLoudstxt3: offset table of \(identifier) is truncated")
        return []
    }
    let ui32array = binary[2..<header_endIndex].toArray(of: UInt32.self)

    var result: [(loudsNodeIndex: Int, dicdata: [DicdataElement])] = []
    for index in indices {
        guard ui32array.indices.contains(index) else {
            debug("getDataForLoudstxt3: index \(index) is outside the offset table of \(identifier)")
            continue
        }
        let startIndex = Int(ui32array[index])
        // The last entry has no following offset and therefore runs to the end of the file.
        let endIndex = index + 1 < ui32array.count ? Int(ui32array[index + 1]) : binary.endIndex
        guard startIndex <= endIndex, endIndex <= binary.endIndex else {
            debug("getDataForLoudstxt3: invalid byte range for index \(index) of \(identifier)")
            continue
        }
        result.append((index, parseBinary(binary: binary[startIndex ..< endIndex])))
    }
    return result
}
|
||||
|
@ -1,213 +0,0 @@
|
||||
//
|
||||
// LookupGraphTests.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/23.
|
||||
//
|
||||
|
||||
import XCTest
|
||||
import Foundation
|
||||
@testable import KanaKanjiConverterModule
|
||||
|
||||
|
||||
/// Graph of character IDs built from an `InputGraph`, sharing its `InputGraphStructure`.
struct LookupGraph {
    /// A single graph node: the character ID plus the ranges and correction copied from the input node.
    struct Node: Equatable {
        var charId: UInt8
        var loudsNodeIndices: Set<Int> = []
        var displayedTextRange: InputGraphStructure.Range
        var inputElementsRange: InputGraphStructure.Range
        var correction: InputGraph.Correction = .none
    }

    /// Node storage. A freshly created graph contains only the root node, at index 0.
    var nodes: [Node] = [
        Node(charId: 0x00, displayedTextRange: .endIndex(0), inputElementsRange: .endIndex(0))
    ]

    /// Connectivity information used to resolve previous/next nodes.
    var structure: InputGraphStructure = InputGraphStructure()

    /// The node stored at index 0.
    var root: Node {
        return self.nodes[0]
    }

    /// Indices of the nodes that follow `node`, resolved from the end indices of its two ranges.
    func nextIndices(for node: Node) -> IndexSet {
        let displayedTextEnd = node.displayedTextRange.endIndex
        let inputElementsEnd = node.inputElementsRange.endIndex
        return self.structure.nextIndices(
            displayedTextEndIndex: displayedTextEnd,
            inputElementsEndIndex: inputElementsEnd
        )
    }

    /// The nodes that follow `node`.
    func next(for node: Node) -> [Node] {
        var followers: [Node] = []
        for nodeIndex in self.nextIndices(for: node) {
            followers.append(self.nodes[nodeIndex])
        }
        return followers
    }

    /// Indices of the nodes that precede `node`, resolved from the start indices of its two ranges.
    func prevIndices(for node: Node) -> IndexSet {
        let displayedTextStart = node.displayedTextRange.startIndex
        let inputElementsStart = node.inputElementsRange.startIndex
        return self.structure.prevIndices(
            displayedTextStartIndex: displayedTextStart,
            inputElementsStartIndex: inputElementsStart
        )
    }

    /// The nodes that precede `node`.
    func prev(for node: Node) -> [Node] {
        var predecessors: [Node] = []
        for nodeIndex in self.prevIndices(for: node) {
            predecessors.append(self.nodes[nodeIndex])
        }
        return predecessors
    }

    /// Removes the node at `index` from the structure. Index 0 is the root and must not be removed.
    mutating func remove(at index: Int) {
        assert(index != 0, "Node at index 0 is root and must not be removed.")
        self.structure.remove(at: index)
    }

    /// Registers `node` in `nodes` and in the structure, using the node's own ranges.
    mutating func insert(_ node: Node) {
        let displayedTextRange = node.displayedTextRange
        let inputElementsRange = node.inputElementsRange
        self.structure.insert(node, nodes: &self.nodes, displayedTextRange: displayedTextRange, inputElementsRange: inputElementsRange)
    }

    /// Builds a lookup graph with one node per node of `input`, in the same order.
    /// - Parameters:
    ///   - input: The source graph; its structure is reused as is.
    ///   - character2CharId: Translates each node's character into its character ID.
    static func build(input: InputGraph, character2CharId: (Character) -> UInt8) -> Self {
        var lookupNodes: [Node] = []
        for inputNode in input.nodes {
            lookupNodes.append(
                Node(
                    charId: character2CharId(inputNode.character),
                    displayedTextRange: inputNode.displayedTextRange,
                    inputElementsRange: inputNode.inputElementsRange,
                    correction: inputNode.correction
                )
            )
        }
        return Self(nodes: lookupNodes, structure: input.structure)
    }
}
|
||||
|
||||
|
||||
extension LOUDS {
    /// Walks `inputGraph` depth-first from its root and, for every path of character IDs that
    /// `searchCharNodeIndex` can follow, records the LOUDS node index that was reached.
    /// - Parameter inputGraph: The graph whose node character IDs are searched for.
    /// - Returns: The set of reached LOUDS node indices (index 1, the search origin, is always
    ///   included), and a dictionary from each reached LOUDS node index to the graph node
    ///   indices that led to it.
    func byfixNodeIndices(_ inputGraph: LookupGraph) -> (IndexSet, [Int: Set<Int>]) {
        typealias SearchItem = (
            node: LookupGraph.Node,
            nodeIndex: Int,
            lastLoudsNodeIndex: Int
        )
        var reachedLoudsNodeIndices = IndexSet(integer: 1)
        // Correspondence between LOUDS nodes and LookupGraph nodes.
        var graphNodeIndicesByLoudsNodeIndex: [Int: Set<Int>] = [:]
        var pending: [SearchItem] = inputGraph.nextIndices(for: inputGraph.root).map { (inputGraph.nodes[$0], $0, 1) }
        while let item = pending.popLast() {
            // A path ends as soon as its next character has no matching LOUDS child.
            guard let loudsNodeIndex = self.searchCharNodeIndex(from: item.lastLoudsNodeIndex, char: item.node.charId) else {
                continue
            }
            graphNodeIndicesByLoudsNodeIndex[loudsNodeIndex, default: []].insert(item.nodeIndex)
            reachedLoudsNodeIndices.insert(loudsNodeIndex)
            // Continue the search from every graph node that follows the matched one.
            for followingIndex in inputGraph.nextIndices(for: item.node) {
                pending.append((inputGraph.nodes[followingIndex], followingIndex, loudsNodeIndex))
            }
        }
        return (reachedLoudsNodeIndices, graphNodeIndicesByLoudsNodeIndex)
    }
}
|
||||
|
||||
extension DicdataStore {
    /// Loads the dictionary entries stored for the given LOUDS node indices.
    ///
    /// Entries are split into files of 2048 nodes each: `index >> 11` selects the file
    /// (`identifier` followed by that number) and `index & 2047` the slot inside it.
    /// - Parameters:
    ///   - identifier: Prefix of the loudstxt3 file names.
    ///   - indices: LOUDS node indices to load.
    ///   - option: Request options forwarded to `LOUDS.getDataForLoudstxt3`.
    /// - Returns: The loaded entries keyed by the same LOUDS node index that was requested,
    ///   so that slots with equal in-file positions in different files stay separate.
    func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>, option: ConvertRequestOptions) -> [Int: [DicdataElement]] {
        // split = 2048
        let indicesByFile = [Int: [Int]].init(grouping: indices, by: {$0 >> 11})
        var data: [Int: [DicdataElement]] = [:]
        for (fileNumber, fileIndices) in indicesByFile {
            let loaded = LOUDS.getDataForLoudstxt3(identifier + "\(fileNumber)", indices: fileIndices.map {$0 & 2047}, option: option)
            for (localIndex, dicdata) in loaded {
                // Rebuild the requested index from the file number and the in-file slot.
                data[(fileNumber << 11) | localIndex] = dicdata
            }
        }
        return data
    }
}
|
||||
|
||||
final class LookupGraphTests: XCTestCase {
    static var resourceURL = Bundle.module.resourceURL!.standardizedFileURL.appendingPathComponent("DictionaryMock", isDirectory: true)

    /// Default request options with the dictionary pointed at the bundled `DictionaryMock` directory.
    func requestOptions() -> ConvertRequestOptions {
        var options: ConvertRequestOptions = .default
        options.dictionaryResourceURL = Self.resourceURL
        return options
    }

    /// Reads `louds/charID.chid` and maps each character to its position in the file.
    /// Returns an empty dictionary (after printing a message) when the file cannot be read.
    func loadCharIDs() -> [Character: UInt8] {
        do {
            let string = try String(contentsOf: Self.resourceURL.appendingPathComponent("louds/charID.chid", isDirectory: false), encoding: String.Encoding.utf8)
            return [Character: UInt8](uniqueKeysWithValues: string.enumerated().map {($0.element, UInt8($0.offset))})
        } catch {
            print("ファイルが見つかりませんでした")
            return [:]
        }
    }

    /// Builds the input and lookup graphs for `input`, loads the dictionary entries of every
    /// LOUDS node reached in the "シ" dictionary, and asserts that each of `expectedWords`
    /// appears among the loaded entries.
    private func assertLookup(
        of input: [ComposingText.InputElement],
        finds expectedWords: [String],
        louds: LOUDS,
        charIDs: [Character: UInt8],
        dicdataStore: DicdataStore,
        file: StaticString = #filePath,
        line: UInt = #line
    ) {
        let inputGraph = InputGraph.build(input: input)
        let lookupGraph = LookupGraph.build(input: inputGraph, character2CharId: {charIDs[$0.toKatakana()] ?? 0x00})
        // Only the reached LOUDS node indices are needed here; the node correspondence is ignored.
        let (loudsNodeIndices, _) = louds.byfixNodeIndices(lookupGraph)
        let dicdataWithIndex: [Int: [DicdataElement]] = dicdataStore.getDicdataFromLoudstxt3(identifier: "シ", indices: loudsNodeIndices, option: requestOptions())
        let dicdata = dicdataWithIndex.values.flatMapSet { $0 }
        for word in expectedWords {
            XCTAssertTrue(dicdata.contains {$0.word == word}, "expected word \(word) was not found", file: file, line: line)
        }
    }

    func testByfixNodeIndices() throws {
        let dicdataStore = DicdataStore(requestOptions: requestOptions())
        let charIDs = loadCharIDs()
        // Fails the test right here when the dictionary cannot be loaded.
        let louds = try XCTUnwrap(LOUDS.load("シ", option: requestOptions()))

        assertLookup(
            of: [
                .init(character: "し", inputStyle: .direct),
                .init(character: "か", inputStyle: .direct),
                .init(character: "い", inputStyle: .direct),
            ],
            finds: [
                // シ
                "死",
                // シカ
                "鹿", "歯科",
                // シガ
                "滋賀",
                // シカイ
                "司会", "視界", "死界",
                // シガイ
                "市外", "市街", "死骸",
            ],
            louds: louds,
            charIDs: charIDs,
            dicdataStore: dicdataStore
        )

        // ts -> ta
        assertLookup(
            of: [
                .init(character: "s", inputStyle: .roman2kana),
                .init(character: "i", inputStyle: .roman2kana),
                .init(character: "t", inputStyle: .roman2kana),
                .init(character: "s", inputStyle: .roman2kana),
                .init(character: "i", inputStyle: .roman2kana),
            ],
            finds: [
                // シ
                "死",
                // [シツ]ィ
                "質", "室",
                // シタ
                "下", "舌",
                // シタイ
                "死体", "肢体",
            ],
            louds: louds,
            charIDs: charIDs,
            dicdataStore: dicdataStore
        )

        // Check that candidates for 「しっ」 exist.
        assertLookup(
            of: [
                .init(character: "s", inputStyle: .roman2kana),
                .init(character: "i", inputStyle: .roman2kana),
                .init(character: "t", inputStyle: .roman2kana),
                .init(character: "t", inputStyle: .roman2kana),
                .init(character: "a", inputStyle: .roman2kana),
                .init(character: "i", inputStyle: .roman2kana),
            ],
            finds: [
                // シ
                "死",
                // シッ
                "知っ", "しっ",
                // シッタ
                "叱咤",
                // シッタイ
                "失態",
            ],
            louds: louds,
            charIDs: charIDs,
            dicdataStore: dicdataStore
        )
    }
}
|
@ -1,20 +0,0 @@
|
||||
//
|
||||
// extension Kana2Kanji+InputGraph.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/23.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
@testable import KanaKanjiConverterModule
|
||||
|
||||
|
||||
extension Kana2Kanji {
|
||||
/// Placeholder entry point taking an `InputGraph` and an N-best count.
/// The body contains only comments for the two planned steps and performs no work.
func kana2lattice_all(_ inputData: InputGraph, N_best: Int) {
|
||||
// Dictionary lookup
|
||||
|
||||
// Conversion
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +0,0 @@
|
||||
//
|
||||
// extension LOUDS+InputGraph.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/22.
|
||||
//
|
@ -0,0 +1,211 @@
|
||||
//
|
||||
// ConvertGraph.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/23.
|
||||
//
|
||||
|
||||
import XCTest
|
||||
import Foundation
|
||||
@testable import KanaKanjiConverterModule
|
||||
|
||||
/// Graph whose nodes hold lattice nodes, built on top of a `LookupGraph` and sharing its structure.
struct ConvertGraph: InputGraphProtocol {
    /// A graph node: the lattice nodes attached to it plus the ranges and correction of the lookup node.
    struct Node: InputGraphNodeProtocol {
        var latticeNodes: [LatticeNode]
        var displayedTextRange: InputGraphStructure.Range
        var inputElementsRange: InputGraphStructure.Range
        var correction: InputGraph.Correction = .none
    }

    /// Node storage. A freshly created graph contains only the root node, which has no lattice nodes.
    var nodes: [Node] = [
        Node(latticeNodes: [], displayedTextRange: .endIndex(0), inputElementsRange: .endIndex(0))
    ]

    /// Connectivity information used to resolve previous/next nodes.
    var structure: InputGraphStructure = InputGraphStructure()

    /// Builds a convert graph with one node per node of `input`, in the same order.
    /// - Parameters:
    ///   - input: The lookup graph; its structure is reused as is.
    ///   - nodeIndex2LatticeNode: Lattice nodes per lookup-graph node index; a missing index yields no lattice nodes.
    static func build(input: LookupGraph, nodeIndex2LatticeNode: [Int: [LatticeNode]]) -> Self {
        var convertNodes: [Node] = []
        for (nodeIndex, lookupNode) in input.nodes.enumerated() {
            let latticeNodes = nodeIndex2LatticeNode[nodeIndex] ?? []
            convertNodes.append(
                Node(
                    latticeNodes: latticeNodes,
                    displayedTextRange: lookupNode.displayedTextRange,
                    inputElementsRange: lookupNode.inputElementsRange,
                    correction: lookupNode.correction
                )
            )
        }
        return Self(nodes: convertNodes, structure: input.structure)
    }
}
|
||||
extension ConvertGraph {
    /// Lattice node; the conversion computation is carried out on these.
    final class LatticeNode: CustomStringConvertible {
        /// Dictionary entry held by this node.
        public let data: DicdataElement
        /// Nodes that come before this node; as many as `N_best` are kept.
        var prevs: [RegisteredNode] = []
        /// Score corresponding to each element of `prevs`.
        var values: [PValue] = []
        /// Range within inputData.input.
        var displayedTextRange: InputGraphStructure.Range
        var inputElementsRange: InputGraphStructure.Range

        /// Node corresponding to `EOS`. Every access creates a new instance with unknown ranges.
        static var EOSNode: LatticeNode {
            return LatticeNode(data: DicdataElement.EOSData, displayedTextRange: .unknown, inputElementsRange: .unknown)
        }

        /// Creates a node for `data`; `values` starts as the single value of `data`.
        init(data: DicdataElement, displayedTextRange: InputGraphStructure.Range, inputElementsRange: InputGraphStructure.Range, prevs: [RegisteredNode] = []) {
            self.data = data
            self.displayedTextRange = displayedTextRange
            self.inputElementsRange = inputElementsRange
            self.prevs = prevs
            self.values = [data.value()]
        }

        /// Creates a `RegisteredNode` reflecting the information held by this `LatticeNode`.
        /// A `LatticeNode` can have several previous nodes, while a `RegisteredNode` has only one,
        /// so `index` selects which element of `prevs` becomes the predecessor.
        func getRegisteredNode(_ index: Int, value: PValue) -> RegisteredNode {
            // FIXME: provisional implementation
            let predecessor = self.prevs[index]
            return RegisteredNode(
                data: self.data,
                registered: predecessor,
                totalValue: value,
                displayedTextRange: self.displayedTextRange,
                inputElementsRange: self.inputElementsRange
            )
        }

        var description: String {
            return "LatticeNode(data: \(data), ...)"
        }
    }

    /// A node with exactly one (optional) predecessor and the accumulated cost up to it.
    struct RegisteredNode: RegisteredNodeProtocol {
        /// Dictionary entry held by this node.
        let data: DicdataElement
        /// Data of the immediately preceding node.
        let prev: (any RegisteredNodeProtocol)?
        /// Cost from the start point up to this node.
        let totalValue: PValue
        /// Range within inputData.input.
        var displayedTextRange: InputGraphStructure.Range
        var inputElementsRange: InputGraphStructure.Range

        init(data: DicdataElement, registered: RegisteredNode?, totalValue: PValue, displayedTextRange: InputGraphStructure.Range, inputElementsRange: InputGraphStructure.Range) {
            self.data = data
            self.totalValue = totalValue
            self.prev = registered
            self.displayedTextRange = displayedTextRange
            self.inputElementsRange = inputElementsRange
        }

        /// Creates the start node.
        /// - Returns: A node holding `DicdataElement.BOSData`, no predecessor and a total value of 0.
        static func BOSNode() -> RegisteredNode {
            return RegisteredNode(
                data: DicdataElement.BOSData,
                registered: nil,
                totalValue: 0,
                displayedTextRange: .endIndex(0),
                inputElementsRange: .endIndex(0)
            )
        }
    }
}
|
||||
|
||||
/// Abstraction through an existential type so that the `struct` `RegisteredNode` can hold
/// another `RegisteredNode` recursively.
|
||||
/// - Note: A comparison against an `indirect enum` has not been done yet.
|
||||
protocol RegisteredNodeProtocol {
|
||||
/// Dictionary entry held by the node.
var data: DicdataElement {get}
|
||||
/// The preceding node, if any.
var prev: (any RegisteredNodeProtocol)? {get}
|
||||
/// Total value associated with the node.
var totalValue: PValue {get}
|
||||
/// Range within inputData.input.
|
||||
var displayedTextRange: InputGraphStructure.Range {get}
|
||||
var inputElementsRange: InputGraphStructure.Range {get}
|
||||
}
|
||||
|
||||
extension ConvertGraph {
    /// Propagates scores through the lattice nodes of the whole graph and returns the EOS node.
    ///
    /// A graph node is handled only once every node reported by `structure.prevIndices` for it
    /// is processed or marked invalid; a node without any previous node is marked invalid.
    /// For each lattice node with at least one predecessor, the scores are pushed into the
    /// `prevs` of the lattice nodes of the following graph nodes (at most `option.N_best`
    /// predecessors each, ordered by descending total value). Lattice nodes that have no
    /// following graph node, or whose input range ends at the end of the input, are registered
    /// as predecessors of the returned node.
    /// - Parameters:
    ///   - option: Supplies `N_best`.
    ///   - dicdataStore: Supplies `shouldBeRemoved(data:)` and `getCCValue(_:_:)`.
    /// - Returns: A new EOS lattice node whose `prevs` hold the registered end candidates.
    func convertAll(option: borrowing ConvertRequestOptions, dicdataStore: DicdataStore) -> LatticeNode {
        let result: LatticeNode = LatticeNode.EOSNode
        result.displayedTextRange = .startIndex(self.structure.displayedTextEndIndexToNodeIndices.endIndex)
        result.inputElementsRange = .startIndex(self.structure.inputElementsEndIndexToNodeIndices.endIndex)
        // Reversed so that `popLast()` hands out the graph nodes in ascending index order.
        var processStack = Array(self.nodes.enumerated().reversed())
        var processedIndices: IndexSet = [0] // root
        var invalidIndices: IndexSet = []
        while let (i, graphNode) = processStack.popLast() {
            // Skip nodes that are already processed (or known to be invalid).
            guard !processedIndices.contains(i), !invalidIndices.contains(i) else {
                continue
            }
            // Check that every previous node is already processed.
            let prevIndices = self.structure.prevIndices(displayedTextStartIndex: graphNode.displayedTextRange.startIndex, inputElementsStartIndex: graphNode.inputElementsRange.startIndex)
            guard !prevIndices.isEmpty else {
                invalidIndices.insert(i)
                continue
            }
            var unprocessedPrevs: [(Int, Node)] = []
            for prevIndex in prevIndices {
                if !processedIndices.contains(prevIndex) && !invalidIndices.contains(prevIndex) {
                    unprocessedPrevs.append((prevIndex, self.nodes[prevIndex]))
                }
            }
            // When unprocessed previous nodes remain, push this node back followed by them,
            // so that they are popped first and this node is retried afterwards.
            guard unprocessedPrevs.isEmpty else {
                processStack.append((i, graphNode))
                processStack.append(contentsOf: unprocessedPrevs)
                continue
            }
            processedIndices.insert(i)
            // Perform the actual processing.
            for node in graphNode.latticeNodes {
                if node.prevs.isEmpty {
                    continue
                }
                if dicdataStore.shouldBeRemoved(data: node.data) {
                    continue
                }
                // Occurrence value of the word itself.
                let wValue: PValue = node.data.value()
                // NOTE(review): index 0 is in `processedIndices` from the start, so the guard at
                // the top of the loop never lets `i == 0` reach this point and the first branch
                // looks unreachable. Confirm whether the connection value to the start node is
                // meant to be added for nodes that directly follow the root.
                if i == 0 {
                    // Update values.
                    node.values = node.prevs.map {$0.totalValue + wValue + dicdataStore.getCCValue($0.data.rcid, node.data.lcid)}
                } else {
                    // Update values.
                    node.values = node.prevs.map {$0.totalValue + wValue}
                }
                // Look up the graph nodes that follow this lattice node.
                let nextIndices = self.structure.nextIndices(
                    displayedTextEndIndex: node.displayedTextRange.endIndex,
                    inputElementsEndIndex: node.inputElementsRange.endIndex
                )
                // Register into the result when nothing follows or the end of the input is reached.
                if nextIndices.isEmpty || self.structure.inputElementsStartIndexToNodeIndices.endIndex == node.inputElementsRange.endIndex {
                    for index in node.prevs.indices {
                        let newnode: RegisteredNode = node.getRegisteredNode(index, value: node.values[index])
                        result.prevs.append(newnode)
                    }
                } else {
                    for nextIndex in nextIndices {
                        // For every possible next node connected to this node.
                        for nextnode in self.nodes[nextIndex].latticeNodes {
                            // Filtering here lets such nodes end up with empty `prevs`,
                            // which the `prevs.isEmpty` check above rejects later.
                            if dicdataStore.shouldBeRemoved(data: nextnode.data) {
                                continue
                            }
                            // Class connection value between the two entries.
                            let ccValue: PValue = dicdataStore.getCCValue(node.data.rcid, nextnode.data.lcid)
                            // For every predecessor held by this node.
                            for (index, value) in node.values.enumerated() {
                                let newValue: PValue = ccValue + value
                                // Position at which the new predecessor has to be inserted.
                                let lastindex: Int = (nextnode.prevs.lastIndex(where: {$0.totalValue >= newValue}) ?? -1) + 1
                                if lastindex == option.N_best {
                                    continue
                                }
                                let newnode: RegisteredNode = node.getRegisteredNode(index, value: newValue)
                                // Drop the last element when the count would exceed N_best.
                                if nextnode.prevs.count >= option.N_best {
                                    nextnode.prevs.removeLast()
                                }
                                // Removing before inserting is faster (insert is O(N)).
                                nextnode.prevs.insert(newnode, at: lastindex)
                            }
                        }
                    }
                }
            }
        }
        return result
    }
}
|
@ -44,18 +44,39 @@ enum CorrectPrefixTree {
|
||||
|
||||
/// Correction table for roman2kana input.
/// The outer key is the first typed character and the inner key the second one; each inner
/// entry is a terminal carrying the replacement string(s).
// NOTE(review): presumably the inner keys are keys adjacent to the intended vowel on a QWERTY
// layout — confirm against the consumer of this tree.
static let roman2kana: Node = {
    Node([
        "g": Node([
            "s": .terminal(["ga"]),
            "q": .terminal(["ga"]),
            "d": .terminal(["ge"]),
            "r": .terminal(["ge"]),
            "w": .terminal(["ge"]),
            "k": .terminal(["gi"]),
            "l": .terminal(["go"]),
            "p": .terminal(["go"]),
            "j": .terminal(["gu"]),
        ]),
        "m": Node([
            "s": .terminal(["ma"]),
            "q": .terminal(["ma"]),
            "d": .terminal(["me"]),
            "r": .terminal(["me"]),
            "w": .terminal(["me"]),
            "k": .terminal(["mi"]),
            "l": .terminal(["mo"]),
            "p": .terminal(["mo"]),
            "j": .terminal(["mu"]),
        ]),
        "t": Node([
            "s": .terminal(["ta"]),
            "q": .terminal(["ta"]),
            "d": .terminal(["te"]),
            "r": .terminal(["te"]),
            "w": .terminal(["te"]),
            "k": .terminal(["ti"]),
            "l": .terminal(["to"]),
            "p": .terminal(["to"]),
            "j": .terminal(["tu"]),
        ]),
    ])
}()
|
||||
static let direct: Node = {
|
@ -70,7 +70,8 @@ struct InputGraphStructure {
|
||||
return indexSet
|
||||
}
|
||||
|
||||
mutating func insert<T>(_ node: T, nodes: inout [T], displayedTextRange: Range, inputElementsRange: Range) {
|
||||
/// 戻り値は`index`
|
||||
mutating func insert<T>(_ node: T, nodes: inout [T], displayedTextRange: Range, inputElementsRange: Range) -> Int {
|
||||
// 可能ならdeadNodeIndicesを再利用する
|
||||
let index: Int
|
||||
if let deadIndex = self.deadNodeIndices.popLast() {
|
||||
@ -104,6 +105,7 @@ struct InputGraphStructure {
|
||||
}
|
||||
self.inputElementsEndIndexToNodeIndices[endIndex].insert(index)
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
mutating func remove(at index: Int) {
|
||||
@ -125,7 +127,7 @@ struct InputGraphStructure {
|
||||
}
|
||||
}
|
||||
|
||||
struct InputGraph {
|
||||
struct InputGraph: InputGraphProtocol {
|
||||
struct InputStyle: Identifiable {
|
||||
init(from deprecatedInputStyle: KanaKanjiConverterModule.InputStyle) {
|
||||
switch deprecatedInputStyle {
|
||||
@ -211,7 +213,7 @@ struct InputGraph {
|
||||
}
|
||||
}
|
||||
|
||||
struct Node: Equatable, CustomStringConvertible {
|
||||
struct Node: InputGraphNodeProtocol, Equatable, CustomStringConvertible {
|
||||
var character: Character
|
||||
var displayedTextRange: InputGraphStructure.Range
|
||||
var inputElementsRange: InputGraphStructure.Range
|
||||
@ -222,7 +224,7 @@ struct InputGraph {
|
||||
let de = displayedTextRange.endIndex?.description ?? "?"
|
||||
let `is` = inputElementsRange.startIndex?.description ?? "?"
|
||||
let ie = inputElementsRange.endIndex?.description ?? "?"
|
||||
return "Node(\"\(character)\", d(\(ds)..<\(de)), i(\(`is`)..<\(ie)), isTypo: \(correction.isTypo)"
|
||||
return "Node(\"\(character)\", d(\(ds)..<\(de)), i(\(`is`)..<\(ie)), isTypo: \(correction.isTypo))"
|
||||
}
|
||||
}
|
||||
|
||||
@ -233,41 +235,6 @@ struct InputGraph {
|
||||
|
||||
var structure: InputGraphStructure = InputGraphStructure()
|
||||
|
||||
/// The node stored at index 0.
var root: Node {
    return self.nodes[0]
}

/// Indices of the nodes that follow `node`, resolved from the end indices of its two ranges.
func nextIndices(for node: Node) -> IndexSet {
    let displayedTextEnd = node.displayedTextRange.endIndex
    let inputElementsEnd = node.inputElementsRange.endIndex
    return self.structure.nextIndices(
        displayedTextEndIndex: displayedTextEnd,
        inputElementsEndIndex: inputElementsEnd
    )
}

/// The nodes that follow `node`.
func next(for node: Node) -> [Node] {
    var followers: [Node] = []
    for nodeIndex in self.nextIndices(for: node) {
        followers.append(self.nodes[nodeIndex])
    }
    return followers
}

/// Indices of the nodes that precede `node`, resolved from the start indices of its two ranges.
func prevIndices(for node: Node) -> IndexSet {
    let displayedTextStart = node.displayedTextRange.startIndex
    let inputElementsStart = node.inputElementsRange.startIndex
    return self.structure.prevIndices(
        displayedTextStartIndex: displayedTextStart,
        inputElementsStartIndex: inputElementsStart
    )
}

/// The nodes that precede `node`.
func prev(for node: Node) -> [Node] {
    var predecessors: [Node] = []
    for nodeIndex in self.prevIndices(for: node) {
        predecessors.append(self.nodes[nodeIndex])
    }
    return predecessors
}

/// Removes the node at `index` from the structure. Index 0 is the root and must not be removed.
mutating func remove(at index: Int) {
    assert(index != 0, "Node at index 0 is root and must not be removed.")
    self.structure.remove(at: index)
}

/// Registers `node` in `nodes` and in the structure, using the node's own ranges.
mutating func insert(_ node: Node) {
    let displayedTextRange = node.displayedTextRange
    let inputElementsRange = node.inputElementsRange
    self.structure.insert(node, nodes: &self.nodes, displayedTextRange: displayedTextRange, inputElementsRange: inputElementsRange)
}
|
||||
|
||||
static func build(input: [ComposingText.InputElement]) -> Self {
|
||||
var inputGraph = Self()
|
||||
// アルゴリズム
|
||||
@ -328,8 +295,15 @@ struct InputGraph {
|
||||
// たとえば「itta」を打つとき、ittまでの処理で[い][っ][t]が生成されている
|
||||
// そこでaを処理するタイミングで、前方の[t]に遡って[a]を追加し、これを[ta]にする処理を行う必要がある
|
||||
// TODO: まだtypoの処理が不十分
|
||||
typealias Match = (displayedTextStartIndex: Int?, inputElementsStartIndex: Int?, inputElementsEndIndex: Int, value: String, correction: Correction)
|
||||
typealias BackSearchMatch = (endNode: ReplacePrefixTree.Node, route: [Character], inputStyleId: InputStyle.ID, correction: Correction, longestMatch: Match)
|
||||
typealias Match = (
|
||||
displayedTextStartIndex: Int?,
|
||||
inputElementsStartIndex: Int?,
|
||||
inputElementsEndIndex: Int,
|
||||
backwardRoute: [Int],
|
||||
value: String,
|
||||
correction: Correction
|
||||
)
|
||||
typealias BackSearchMatch = (endNode: ReplacePrefixTree.Node, route: [Int], inputStyleId: InputStyle.ID, correction: Correction, longestMatch: Match)
|
||||
var backSearchMatch: [BackSearchMatch] = []
|
||||
do {
|
||||
if let characterNodes = ReplacePrefixTree.characterNodes[.init(from: item.inputStyle)],
|
||||
@ -377,14 +351,13 @@ struct InputGraph {
|
||||
guard let pNode = endNode.parent else { continue }
|
||||
let inputElementsStartIndex = if cRoute.isEmpty { index } else { inputGraph.nodes[cRoute.first!].inputElementsRange.startIndex }
|
||||
let displayedTextStartIndex = cRoute.first.flatMap { inputGraph.nodes[$0].displayedTextRange.startIndex }
|
||||
let characterRoute = cRoute.map{inputGraph.nodes[$0].character}
|
||||
backSearchMatch.append(
|
||||
(
|
||||
pNode,
|
||||
characterRoute,
|
||||
cRoute,
|
||||
cInputStyleId,
|
||||
cCorrection,
|
||||
(displayedTextStartIndex, inputElementsStartIndex, index, "", cCorrection)
|
||||
(displayedTextStartIndex, inputElementsStartIndex, index, cRoute, "", cCorrection)
|
||||
)
|
||||
)
|
||||
}
|
||||
@ -395,39 +368,38 @@ struct InputGraph {
|
||||
typealias SearchItem = (
|
||||
node: ReplacePrefixTree.Node,
|
||||
nextIndex: Int,
|
||||
route: [Character],
|
||||
inputStyleId: InputStyle.ID,
|
||||
longestMatch: Match
|
||||
)
|
||||
var stack: [SearchItem] = []
|
||||
for match in backSearchMatch {
|
||||
stack.append((match.endNode, index, match.route, match.inputStyleId, match.longestMatch))
|
||||
stack.append((match.endNode, index, match.inputStyleId, match.longestMatch))
|
||||
}
|
||||
if stack.isEmpty {
|
||||
stack.append((replacePrefixTree, index, [], .all, (nil, index, index, value: "", correction: .none)))
|
||||
stack.append((replacePrefixTree, index, .all, (nil, index, index, backwardRoute: [], value: "", correction: .none)))
|
||||
}
|
||||
var matches: [Match] = []
|
||||
while let (cNode, cIndex, cRoute, cInputStyleId, cLongestMatch) = stack.popLast() {
|
||||
while let (cNode, cIndex, cInputStyleId, cLongestMatch) = stack.popLast() {
|
||||
let continuous = cIndex < input.endIndex && cInputStyleId.isCompatible(with: .init(from: input[cIndex].inputStyle))
|
||||
if continuous, let nNode = cNode.find(key: input[cIndex].character) {
|
||||
if let value = nNode.value {
|
||||
// valueがある場合longestMatchを更新
|
||||
stack.append((nNode, cIndex + 1, cRoute + [input[cIndex].character], .init(from: input[cIndex].inputStyle), (cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + 1, value, cLongestMatch.correction)))
|
||||
} else if cRoute.isEmpty {
|
||||
stack.append((nNode, cIndex + 1, .init(from: input[cIndex].inputStyle), (cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + 1, cLongestMatch.backwardRoute, value, cLongestMatch.correction)))
|
||||
} else if (cIndex == index && cLongestMatch.backwardRoute.isEmpty) {
|
||||
// valueがなくても、1文字だけの場合はlongestMatchを更新
|
||||
stack.append((nNode, cIndex + 1, cRoute + [input[cIndex].character], .init(from: input[cIndex].inputStyle), (cLongestMatch.displayedTextStartIndex, cIndex, cIndex + 1, String(input[cIndex].character), .none)))
|
||||
stack.append((nNode, cIndex + 1, .init(from: input[cIndex].inputStyle), (cLongestMatch.displayedTextStartIndex, cIndex, cIndex + 1, cLongestMatch.backwardRoute, String(input[cIndex].character), .none)))
|
||||
} else {
|
||||
// それ以外の場合は普通に先に進む
|
||||
stack.append((nNode, cIndex + 1, cRoute + [input[cIndex].character], .init(from: input[cIndex].inputStyle), cLongestMatch))
|
||||
stack.append((nNode, cIndex + 1, .init(from: input[cIndex].inputStyle), cLongestMatch))
|
||||
}
|
||||
} else {
|
||||
if cLongestMatch.inputElementsStartIndex != cLongestMatch.inputElementsEndIndex {
|
||||
if cLongestMatch.inputElementsStartIndex != cLongestMatch.inputElementsEndIndex && !cLongestMatch.value.isEmpty {
|
||||
// longestMatch候補があれば、現在地点で打ち切ってmatchを確定する
|
||||
matches.append(cLongestMatch)
|
||||
} else if cRoute.isEmpty {
|
||||
} else if (cIndex == index && cLongestMatch.backwardRoute.isEmpty) {
|
||||
// 1文字目がrootに存在しない場合、character自体をmatchに登録する
|
||||
// これは置換ルールとして正規表現で.->\1が存在していると考えれば良い
|
||||
matches.append((nil, index, index + 1, value: String(input[cIndex].character), correction: .none))
|
||||
matches.append((nil, index, index + 1, [], value: String(input[cIndex].character), correction: .none))
|
||||
}
|
||||
}
|
||||
// 誤字訂正を追加する
|
||||
@ -454,9 +426,8 @@ struct InputGraph {
|
||||
(
|
||||
.init(),
|
||||
cIndex + item.inputCount,
|
||||
cRoute + Array(item.replace),
|
||||
.init(from: input[cIndex].inputStyle),
|
||||
(cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + item.inputCount, item.replace, .typo)
|
||||
(cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + item.inputCount, cLongestMatch.backwardRoute, item.replace, .typo)
|
||||
)
|
||||
)
|
||||
}
|
||||
@ -467,9 +438,8 @@ struct InputGraph {
|
||||
(
|
||||
node,
|
||||
cIndex + item.inputCount,
|
||||
cRoute + Array(item.replace),
|
||||
.init(from: input[cIndex].inputStyle),
|
||||
(cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + item.inputCount, value, .typo)
|
||||
(cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + item.inputCount, cLongestMatch.backwardRoute, value, .typo)
|
||||
)
|
||||
)
|
||||
} else {
|
||||
@ -477,9 +447,8 @@ struct InputGraph {
|
||||
(
|
||||
node,
|
||||
cIndex + item.inputCount,
|
||||
cRoute + Array(item.replace),
|
||||
.init(from: input[cIndex].inputStyle),
|
||||
(cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + item.inputCount, cLongestMatch.value, .typo)
|
||||
(cLongestMatch.displayedTextStartIndex, cLongestMatch.inputElementsStartIndex, cIndex + item.inputCount, cLongestMatch.backwardRoute, cLongestMatch.value, .typo)
|
||||
)
|
||||
)
|
||||
}
|
||||
@ -487,17 +456,25 @@ struct InputGraph {
|
||||
}
|
||||
}
|
||||
// matchをinsertする
|
||||
for match in matches {
|
||||
var removedNodeIndices: Set<Int> = []
|
||||
for match in matches.sorted(by: { $0.backwardRoute.count > $1.backwardRoute.count }) {
|
||||
let displayedTextStartIndex = if let d = match.displayedTextStartIndex {
|
||||
d
|
||||
} else if let beforeNodeIndex = inputGraph.structure.inputElementsEndIndexToNodeIndices[index].first,
|
||||
let d = inputGraph.nodes[beforeNodeIndex].displayedTextRange.endIndex {
|
||||
d
|
||||
} else {
|
||||
Int?.none
|
||||
}
|
||||
} else if let beforeNodeIndex = inputGraph.structure.inputElementsEndIndexToNodeIndices[index].first {
|
||||
inputGraph.nodes[beforeNodeIndex].displayedTextRange.endIndex
|
||||
} else {
|
||||
Int?.none
|
||||
}
|
||||
guard let displayedTextStartIndex else { continue }
|
||||
|
||||
for backNodeIndex in match.backwardRoute {
|
||||
if removedNodeIndices.contains(backNodeIndex) {
|
||||
continue
|
||||
}
|
||||
inputGraph.structure.remove(at: backNodeIndex)
|
||||
removedNodeIndices.insert(backNodeIndex)
|
||||
}
|
||||
|
||||
let characters = Array(match.value)
|
||||
for (i, c) in zip(characters.indices, characters) {
|
||||
let inputElementRange: InputGraphStructure.Range = if i == characters.startIndex && i+1 == characters.endIndex {
|
||||
@ -527,7 +504,6 @@ struct InputGraph {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return consume inputGraph
|
||||
return inputGraph
|
||||
}
|
||||
}
|
@ -0,0 +1,59 @@
|
||||
//
|
||||
// InputGraphProtocol.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/23.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
/// Requirements for a node stored in an input-graph-like structure.
///
/// A node records which span of the displayed text and which span of the
/// input elements it covers. Either bound of a span may be unknown, which is
/// why both are expressed as `InputGraphStructure.Range`.
protocol InputGraphNodeProtocol {
    /// Span covered by this node in the displayed text.
    var displayedTextRange: InputGraphStructure.Range { get set }

    /// Span covered by this node in the sequence of input elements.
    var inputElementsRange: InputGraphStructure.Range { get set }
}
|
||||
|
||||
/// A graph whose nodes live in a flat array and whose connectivity is
/// answered by an `InputGraphStructure`.
protocol InputGraphProtocol {
    /// Concrete node type stored in the graph.
    associatedtype Node: InputGraphNodeProtocol

    /// Node storage; nodes are referred to by their index in this array.
    var nodes: [Node] { get set }

    /// Index structure used to find neighbours and to insert/remove nodes.
    var structure: InputGraphStructure { get set }
}
|
||||
|
||||
extension InputGraphProtocol {
    /// The root node, stored at index 0 of `nodes`.
    var root: Node {
        self.nodes[0]
    }

    /// Indices (into `nodes`) of the nodes that `structure` reports as
    /// following `node`, looked up by the end indices of `node`'s ranges.
    func nextIndices(for node: Node) -> IndexSet {
        self.structure.nextIndices(
            displayedTextEndIndex: node.displayedTextRange.endIndex,
            inputElementsEndIndex: node.inputElementsRange.endIndex
        )
    }

    /// The nodes that follow `node`, in the order of `nextIndices(for:)`.
    func next(for node: Node) -> [Node] {
        self.nextIndices(for: node).map { self.nodes[$0] }
    }

    /// Indices (into `nodes`) of the nodes that `structure` reports as
    /// preceding `node`, looked up by the start indices of `node`'s ranges.
    func prevIndices(for node: Node) -> IndexSet {
        self.structure.prevIndices(
            displayedTextStartIndex: node.displayedTextRange.startIndex,
            inputElementsStartIndex: node.inputElementsRange.startIndex
        )
    }

    /// The nodes that precede `node`, in the order of `prevIndices(for:)`.
    func prev(for node: Node) -> [Node] {
        self.prevIndices(for: node).map { self.nodes[$0] }
    }

    /// Removes the node at `index` from `structure`.
    ///
    /// Only `structure` is updated here; the `nodes` array is not touched by
    /// this method.
    /// - Parameter index: Index of the node to remove. Must not be 0 (the root).
    mutating func remove(at index: Int) {
        assert(index != 0, "Node at index 0 is root and must not be removed.")
        self.structure.remove(at: index)
    }

    /// Inserts `node` into the graph through `structure`, using the node's own
    /// ranges as its position.
    /// - Parameter node: The node to insert.
    mutating func insert(_ node: Node) {
        // `structure` and `nodes` are both protocol requirements on `self`, so
        // they cannot be accessed mutably at the same time; go through a local.
        // The storage is moved out of `self` first: if `self.nodes` kept
        // referencing the same buffer, the mutation below would trigger a full
        // copy-on-write copy of the array on every insertion.
        var nodes = self.nodes
        self.nodes = []
        _ = self.structure.insert(node, nodes: &nodes, displayedTextRange: node.displayedTextRange, inputElementsRange: node.inputElementsRange)
        self.nodes = consume nodes
    }
}
|
@ -46,6 +46,7 @@ final class InputGraphTests: XCTestCase {
|
||||
.init(character: "a", inputStyle: .roman2kana),
|
||||
])
|
||||
XCTAssertEqual(graph.nodes.count, 3) // Root nodes
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "あ"}))
|
||||
}
|
||||
do {
|
||||
let graph = InputGraph.build(input: [
|
||||
@ -123,10 +124,31 @@ final class InputGraphTests: XCTestCase {
|
||||
do {
|
||||
// tt→っt
|
||||
let graph = InputGraph.build(input: [
|
||||
.init(character: "i", inputStyle: .roman2kana),
|
||||
.init(character: "t", inputStyle: .roman2kana),
|
||||
.init(character: "t", inputStyle: .roman2kana),
|
||||
.init(character: "a", inputStyle: .roman2kana),
|
||||
])
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "t"}))
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "あ"}))
|
||||
XCTAssertEqual(
|
||||
graph.nodes.first(where: {$0.character == "っ"}),
|
||||
.init(character: "っ", displayedTextRange: .range(1, 2), inputElementsRange: .startIndex(1), correction: .none)
|
||||
)
|
||||
XCTAssertEqual(
|
||||
graph.nodes.first(where: {$0.character == "た"}),
|
||||
.init(character: "た", displayedTextRange: .range(2, 3), inputElementsRange: .endIndex(4), correction: .none)
|
||||
)
|
||||
}
|
||||
do {
|
||||
// tt→っt
|
||||
let graph = InputGraph.build(input: [
|
||||
.init(character: "t", inputStyle: .roman2kana),
|
||||
.init(character: "t", inputStyle: .roman2kana),
|
||||
.init(character: "a", inputStyle: .roman2kana),
|
||||
])
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "t"}))
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "あ"}))
|
||||
XCTAssertEqual(
|
||||
graph.nodes.first(where: {$0.character == "っ"}),
|
||||
.init(character: "っ", displayedTextRange: .range(0, 1), inputElementsRange: .startIndex(0), correction: .none)
|
||||
@ -143,6 +165,8 @@ final class InputGraphTests: XCTestCase {
|
||||
.init(character: "t", inputStyle: .roman2kana),
|
||||
.init(character: "a", inputStyle: .roman2kana),
|
||||
])
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "t"}))
|
||||
XCTAssertNil(graph.nodes.first(where: {$0.character == "あ"}))
|
||||
XCTAssertEqual(
|
||||
graph.nodes.first(where: {$0.character == "ん"}),
|
||||
.init(character: "ん", displayedTextRange: .range(0, 1), inputElementsRange: .startIndex(0), correction: .none)
|
@ -0,0 +1,228 @@
|
||||
//
|
||||
// LookupGraphTests.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/23.
|
||||
//
|
||||
|
||||
import XCTest
|
||||
import Foundation
|
||||
@testable import KanaKanjiConverterModule
|
||||
|
||||
/// Graph used for dictionary lookup. It mirrors an `InputGraph`, with every
/// node additionally carrying the character ID used to search the dictionary.
struct LookupGraph: InputGraphProtocol {
    /// A lookup node: the character, its ID, and the spans it covers.
    struct Node: Equatable, InputGraphNodeProtocol {
        /// Character represented by this node.
        var character: Character
        /// ID of `character`, as produced by the `character2CharId` mapping given to `build`.
        var charId: UInt8
        /// LOUDS node indices associated with this node; empty by default.
        var loudsNodeIndices: Set<Int> = []
        /// Span covered in the displayed text.
        var displayedTextRange: InputGraphStructure.Range
        /// Span covered in the input elements.
        var inputElementsRange: InputGraphStructure.Range
        /// Correction kind copied from the source `InputGraph` node.
        var correction: InputGraph.Correction = .none
    }

    /// Node storage. A default-constructed graph contains only the root node.
    var nodes: [Node] = [
        // Root node.
        Node(character: "\0", charId: 0x00, displayedTextRange: .endIndex(0), inputElementsRange: .endIndex(0))
    ]

    /// Connectivity index shared with the source `InputGraph`.
    var structure: InputGraphStructure = InputGraphStructure()

    /// Builds a lookup graph from `input`.
    ///
    /// Nodes are converted one-to-one and in order, so node indices stay valid
    /// against `input.structure`, which is reused as-is.
    /// - Parameters:
    ///   - input: The graph to convert.
    ///   - character2CharId: Mapping from a node's character to its ID.
    /// - Returns: The lookup graph with the same structure as `input`.
    static func build(input: InputGraph, character2CharId: (Character) -> UInt8) -> Self {
        var converted: [Node] = []
        converted.reserveCapacity(input.nodes.count)
        for source in input.nodes {
            converted.append(
                Node(
                    character: source.character,
                    charId: character2CharId(source.character),
                    displayedTextRange: source.displayedTextRange,
                    inputElementsRange: source.inputElementsRange,
                    correction: source.correction
                )
            )
        }
        return Self(nodes: converted, structure: input.structure)
    }
}
|
||||
|
||||
|
||||
extension LOUDS {
    /// Walks `lookupGraph` depth-first from `startGraphNodeIndex` while following
    /// the matching characters in this LOUDS, and collects every LOUDS node reached.
    ///
    /// - Parameters:
    ///   - lookupGraph: Graph whose nodes supply the character IDs to follow.
    ///   - startGraphNodeIndex: Index of the graph node the walk starts from.
    /// - Returns: A pair of
    ///   - the set of LOUDS node indices reached (index 1, the walk's starting
    ///     LOUDS node, is always included), and
    ///   - for each reached LOUDS node, the end indices of the graph nodes that
    ///     led to it (one entry per path that reached it).
    func byfixNodeIndices(_ lookupGraph: LookupGraph, startGraphNodeIndex: Int = 0) -> (IndexSet, [Int: [(displayedTextEndIndex: Int?, inputElementsEndIndex: Int?)]]) {
        typealias EndIndices = (displayedTextEndIndex: Int?, inputElementsEndIndex: Int?)
        typealias SearchItem = (
            node: LookupGraph.Node,
            lastLoudsNodeIndex: Int
        )

        // A step is rejected only when both end indices are known and the
        // next one does not strictly increase.
        func advances(from current: Int?, to next: Int?) -> Bool {
            guard let current, let next else {
                return true
            }
            return current < next
        }

        var reachedLoudsNodes = IndexSet(integer: 1)
        // Correspondence between LOUDS nodes and the LookupGraph nodes that reached them.
        var endIndicesByLoudsNode: [Int: [EndIndices]] = [:]
        var pending: [SearchItem] = [(lookupGraph.nodes[startGraphNodeIndex], 1)]

        while let item = pending.popLast() {
            let graphNode = item.node
            // Paths whose character has no match from the previous LOUDS node end here.
            guard let matchedLoudsNode = self.searchCharNodeIndex(from: item.lastLoudsNodeIndex, char: graphNode.charId) else {
                continue
            }
            endIndicesByLoudsNode[matchedLoudsNode, default: []].append(
                (graphNode.displayedTextRange.endIndex, graphNode.inputElementsRange.endIndex)
            )
            reachedLoudsNodes.insert(matchedLoudsNode)

            // Queue the following graph nodes; end indices must be monotonically increasing.
            for followingIndex in lookupGraph.nextIndices(for: graphNode) {
                let following = lookupGraph.nodes[followingIndex]
                let displayedTextAdvances = advances(
                    from: graphNode.displayedTextRange.endIndex,
                    to: following.displayedTextRange.endIndex
                )
                let inputElementsAdvance = advances(
                    from: graphNode.inputElementsRange.endIndex,
                    to: following.inputElementsRange.endIndex
                )
                if displayedTextAdvances && inputElementsAdvance {
                    pending.append((following, matchedLoudsNode))
                }
            }
        }
        return (reachedLoudsNodes, endIndicesByLoudsNode)
    }
}
|
||||
|
||||
extension DicdataStore {
    /// Builds a `ConvertGraph` by looking up dictionary entries for every node of
    /// `inputGraph` that is reachable from the root.
    ///
    /// For each reachable node, the LOUDS identified by the node's katakana
    /// character is loaded, the lookup graph is walked from that node, and one
    /// lattice node is created per (dictionary entry, end position) pair.
    /// Nodes whose LOUDS cannot be loaded are skipped and not expanded further.
    /// - Parameters:
    ///   - inputGraph: The input graph; consumed by this call.
    ///   - option: Options forwarded to the dictionary loader.
    /// - Returns: The graph built by `ConvertGraph.build` from the lookup graph
    ///   and the collected lattice nodes.
    func buildConvertGraph(inputGraph: consuming InputGraph, option: ConvertRequestOptions) -> ConvertGraph {
        let lookupGraph = LookupGraph.build(input: consume inputGraph, character2CharId: { self.character2charId($0.toKatakana()) } )
        var stack: [Int] = Array(lookupGraph.nextIndices(for: lookupGraph.root))
        // A node can be reached through several paths. Its result depends only on
        // the node itself, so each node is processed once instead of once per path.
        var processedGraphNodeIndices: Set<Int> = []
        var graphNodeIndex2LatticeNodes: [Int: [ConvertGraph.LatticeNode]] = [:]
        while let graphNodeIndex = stack.popLast() {
            guard processedGraphNodeIndices.insert(graphNodeIndex).inserted else {
                continue
            }
            let graphNode = lookupGraph.nodes[graphNodeIndex]
            let identifier = String(graphNode.character.toKatakana())
            guard let louds = self.loadLOUDS(identifier: identifier) else {
                continue
            }
            let (loudsNodeIndices, loudsNodeIndex2GraphEndIndices) = louds.byfixNodeIndices(lookupGraph, startGraphNodeIndex: graphNodeIndex)
            let dicdataWithIndex: [(loudsNodeIndex: Int, dicdata: [DicdataElement])] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: loudsNodeIndices, option: option)
            // Nodes starting at position 0 are connected to the BOS node.
            let startsAtBeginning = graphNode.displayedTextRange.startIndex == 0 || graphNode.inputElementsRange.startIndex == 0
            var latticeNodes: [ConvertGraph.LatticeNode] = []
            for (loudsNodeIndex, dicdata) in dicdataWithIndex {
                for endIndex in loudsNodeIndex2GraphEndIndices[loudsNodeIndex, default: []] {
                    // The start comes from the node the walk began at, the end
                    // from the node at which the dictionary entry was reached.
                    let displayedTextRange: InputGraphStructure.Range = switch (graphNode.displayedTextRange.startIndex, endIndex.displayedTextEndIndex) {
                    case let (s?, e?): .range(s, e)
                    case (let s?, nil): .startIndex(s)
                    case (nil, let e?): .endIndex(e)
                    case (nil, nil): .unknown
                    }
                    let inputElementsRange: InputGraphStructure.Range = switch (graphNode.inputElementsRange.startIndex, endIndex.inputElementsEndIndex) {
                    case let (s?, e?): .range(s, e)
                    case (let s?, nil): .startIndex(s)
                    case (nil, let e?): .endIndex(e)
                    case (nil, nil): .unknown
                    }
                    if startsAtBeginning {
                        latticeNodes.append(contentsOf: dicdata.map {
                            .init(data: $0, displayedTextRange: displayedTextRange, inputElementsRange: inputElementsRange, prevs: [.BOSNode()])
                        })
                    } else {
                        latticeNodes.append(contentsOf: dicdata.map {
                            .init(data: $0, displayedTextRange: displayedTextRange, inputElementsRange: inputElementsRange)
                        })
                    }
                }
            }
            graphNodeIndex2LatticeNodes[graphNodeIndex] = latticeNodes
            stack.append(contentsOf: lookupGraph.nextIndices(for: graphNode))
        }
        return ConvertGraph.build(input: consume lookupGraph, nodeIndex2LatticeNode: graphNodeIndex2LatticeNodes)
    }

    /// Loads the dictionary entries stored at the given LOUDS node indices.
    ///
    /// The data is split into shards of 2048 entries: the shard number is
    /// `index >> 11` and the position inside the shard is `index & 2047`.
    /// - Parameters:
    ///   - identifier: Identifier of the dictionary; the shard number is appended to it.
    ///   - indices: Global LOUDS node indices to load.
    ///   - option: Options forwarded to the loader.
    /// - Returns: The loaded entries, each tagged with its global LOUDS node
    ///   index. The order across shards is unspecified.
    func getDicdataFromLoudstxt3(identifier: String, indices: some Sequence<Int>, option: ConvertRequestOptions) -> [(loudsNodeIndex: Int, dicdata: [DicdataElement])] {
        // split = 2048
        let shards = [Int: [Int]].init(grouping: indices, by: {$0 >> 11})
        var data: [(loudsNodeIndex: Int, dicdata: [DicdataElement])] = []
        for (shard, globalIndices) in shards {
            // FIXME: use local option
            let shardData: [(loudsNodeIndex: Int, dicdata: [DicdataElement])] = LOUDS.getDataForLoudstxt3(identifier + "\(shard)", indices: globalIndices.map {$0 & 2047}, option: option)
            // The loader only sees shard-local indices, while callers key their
            // tables by global LOUDS node indices, so the shard number is put back.
            // NOTE(review): assumes the loader echoes the shard-local index it was
            // given in `loudsNodeIndex` — confirm against LOUDS.getDataForLoudstxt3.
            data.append(contentsOf: shardData.map {
                (loudsNodeIndex: (shard << 11) | $0.loudsNodeIndex, dicdata: $0.dicdata)
            })
        }
        return data
    }
}
|
||||
|
||||
/// Tests for walking a `LookupGraph` against the default dictionary's LOUDS.
final class LookupGraphTests: XCTestCase {
    /// Options pointing at the default dictionary, with learning and prediction disabled.
    func requestOptions() -> ConvertRequestOptions {
        .withDefaultDictionary(requireJapanesePrediction: false, requireEnglishPrediction: false, keyboardLanguage: .ja_JP, learningType: .nothing, memoryDirectoryURL: URL(fileURLWithPath: ""), sharedContainerURL: URL(fileURLWithPath: ""), metadata: .init(appVersionString: "Test"))
    }

    /// Checks that `byfixNodeIndices` reaches the expected dictionary entries
    /// under "シ" for direct input, typo-corrected roman input and sokuon input.
    func testByfixNodeIndices() throws {
        let dicdataStore = DicdataStore(convertRequestOptions: requestOptions())
        let character2CharId: (Character) -> UInt8 = { dicdataStore.character2charId($0.toKatakana()) }
        let louds = LOUDS.load("シ", option: requestOptions())
        XCTAssertNotNil(louds)
        guard let louds else { return }

        // Runs the lookup starting from the "し" node right after the root and
        // returns the surface forms of all dictionary entries found.
        func wordsFound(in inputGraph: InputGraph) -> Set<String> {
            let lookupGraph = LookupGraph.build(input: inputGraph, character2CharId: character2CharId)
            let startNodeIndex = lookupGraph.nextIndices(for: lookupGraph.root).first(where: { lookupGraph.nodes[$0].character == "し" })
            XCTAssertNotNil(startNodeIndex)
            let (loudsNodeIndices, _) = louds.byfixNodeIndices(lookupGraph, startGraphNodeIndex: startNodeIndex ?? 0)
            let dicdataWithIndex = dicdataStore.getDicdataFromLoudstxt3(identifier: "シ", indices: loudsNodeIndices, option: self.requestOptions())
            return Set(dicdataWithIndex.flatMap { $0.dicdata }.map(\.word))
        }

        // Direct kana input.
        let directWords = wordsFound(in: InputGraph.build(input: [
            .init(character: "し", inputStyle: .direct),
            .init(character: "か", inputStyle: .direct),
            .init(character: "い", inputStyle: .direct),
        ]))
        // Reading: シ
        XCTAssertTrue(directWords.contains("死"))
        // Reading: シカ
        XCTAssertTrue(directWords.contains("鹿"))
        XCTAssertTrue(directWords.contains("歯科"))
        // Reading: シガ
        XCTAssertTrue(directWords.contains("滋賀"))
        // Reading: シカイ
        XCTAssertTrue(directWords.contains("司会"))
        XCTAssertTrue(directWords.contains("視界"))
        XCTAssertTrue(directWords.contains("死界"))
        // Reading: シガイ
        XCTAssertTrue(directWords.contains("市外"))
        XCTAssertTrue(directWords.contains("市街"))
        XCTAssertTrue(directWords.contains("死骸"))

        // Roman input with the typo correction ts -> ta.
        let typoWords = wordsFound(in: InputGraph.build(input: [
            .init(character: "s", inputStyle: .roman2kana),
            .init(character: "i", inputStyle: .roman2kana),
            .init(character: "t", inputStyle: .roman2kana),
            .init(character: "s", inputStyle: .roman2kana),
            .init(character: "i", inputStyle: .roman2kana),
        ]))
        // Reading: シ
        XCTAssertTrue(typoWords.contains("死"))
        // Reading: [シツ]ィ
        XCTAssertTrue(typoWords.contains("質"))
        XCTAssertTrue(typoWords.contains("室"))
        // Reading: シタ
        XCTAssertTrue(typoWords.contains("下"))
        XCTAssertTrue(typoWords.contains("舌"))
        // Reading: シタイ
        XCTAssertTrue(typoWords.contains("死体"))
        XCTAssertTrue(typoWords.contains("肢体"))

        // Check that candidates for 「しっ」 exist.
        let sokuonWords = wordsFound(in: InputGraph.build(input: [
            .init(character: "s", inputStyle: .roman2kana),
            .init(character: "i", inputStyle: .roman2kana),
            .init(character: "t", inputStyle: .roman2kana),
            .init(character: "t", inputStyle: .roman2kana),
            .init(character: "a", inputStyle: .roman2kana),
            .init(character: "i", inputStyle: .roman2kana),
        ]))
        // Reading: シ
        XCTAssertTrue(sokuonWords.contains("死"))
        // Reading: シッ
        XCTAssertTrue(sokuonWords.contains("知っ"))
        XCTAssertTrue(sokuonWords.contains("しっ"))
        // Reading: シッタ
        XCTAssertTrue(sokuonWords.contains("叱咤"))
        // Reading: シッタイ
        XCTAssertTrue(sokuonWords.contains("失態"))
    }
}
|
@ -0,0 +1,95 @@
|
||||
//
|
||||
// extension Kana2Kanji+InputGraph.swift
|
||||
//
|
||||
//
|
||||
// Created by miwa on 2024/02/23.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
@testable import KanaKanjiConverterModule
|
||||
|
||||
import XCTest
|
||||
|
||||
extension Kana2Kanji {
    /// Experimental conversion pipeline: input graph -> convert graph -> conversion.
    ///
    /// Progress is printed to standard output at each stage, prefixed with the
    /// path of this file.
    /// - Parameters:
    ///   - inputData: The composing text whose input elements are converted.
    ///   - option: Options forwarded to dictionary lookup and conversion.
    /// - Returns: The lattice node returned by `ConvertGraph.convertAll`.
    func _experimental_all(_ inputData: ComposingText, option: ConvertRequestOptions) -> ConvertGraph.LatticeNode {
        let store = self.dicdataStore

        // Stage 1: build the input graph.
        print(#file, "start")
        let inputGraph = InputGraph.build(input: inputData.input)

        // Stage 2: build the convert graph through dictionary lookup.
        print(#file, "lookup", inputGraph)
        let convertGraph = store.buildConvertGraph(inputGraph: consume inputGraph, option: option)

        // Stage 3: run the conversion over the whole graph.
        print(#file, "convert", convertGraph)
        return convertGraph.convertAll(option: option, dicdataStore: store)
    }
}
|
||||
|
||||
private extension ConvertGraph.LatticeNode {
    /// Returns, for each entry of `prevs`, the text obtained by following the
    /// `prev` chain back from that entry and joining all words — earliest
    /// first — followed by this node's own word.
    func joinedPrevs() -> [String] {
        self.prevs.map { prev -> String in
            // Start with "<prev><self>" and keep prepending earlier words.
            var text = prev.data.word + self.data.word
            var cursor: (any RegisteredNodeProtocol) = prev
            while let earlier = cursor.prev {
                text = earlier.data.word + text
                cursor = earlier
            }
            return text
        }
    }
}
|
||||
|
||||
/// End-to-end tests of the experimental graph-based conversion.
final class ExperimentalConversionTests: XCTestCase {
    /// Options pointing at the default dictionary, with learning and prediction disabled.
    func requestOptions() -> ConvertRequestOptions {
        .withDefaultDictionary(requireJapanesePrediction: false, requireEnglishPrediction: false, keyboardLanguage: .ja_JP, learningType: .nothing, memoryDirectoryURL: URL(fileURLWithPath: ""), sharedContainerURL: URL(fileURLWithPath: ""), metadata: .init(appVersionString: "Test"))
    }

    /// Checks that the expected full-sentence candidate is produced for several
    /// direct and roman inputs, including sokuon, "ん" and typo-corrected input.
    func testConversion() throws {
        let dicdataStore = DicdataStore(requestOptions: requestOptions())
        let kana2kanji = Kana2Kanji(dicdataStore: dicdataStore)

        // Composes a fresh text with `compose`, converts it, and returns the
        // joined candidate strings.
        func candidates(_ compose: (inout ComposingText) -> Void) -> [String] {
            var composingText = ComposingText()
            compose(&composingText)
            let result = kana2kanji._experimental_all(composingText, option: self.requestOptions())
            return result.joinedPrevs()
        }

        let miraiDirect = candidates { $0.insertAtCursorPosition("みらいえいが", inputStyle: .direct) }
        XCTAssertTrue(miraiDirect.contains("未来映画"))

        let miraiRoman = candidates { $0.insertAtCursorPosition("miraieiga", inputStyle: .roman2kana) }
        XCTAssertTrue(miraiRoman.contains("未来映画"))

        let sitta = candidates { $0.insertAtCursorPosition("sitta", inputStyle: .roman2kana) }
        XCTAssertTrue(sitta.contains("知った"))

        let unda = candidates { $0.insertAtCursorPosition("unda", inputStyle: .roman2kana) }
        XCTAssertTrue(unda.contains("産んだ"))

        let ixtsuta = candidates { $0.insertAtCursorPosition("ixtsuta", inputStyle: .roman2kana) }
        XCTAssertTrue(ixtsuta.contains("言った"))

        let its = candidates { $0.insertAtCursorPosition("its", inputStyle: .roman2kana) }
        XCTAssertTrue(its.contains("いた"))

        let itsi = candidates { $0.insertAtCursorPosition("itsi", inputStyle: .roman2kana) }
        print(itsi)
        XCTAssertTrue(itsi.contains("痛い"))
    }
}
|
Reference in New Issue
Block a user