fix: now FullInputProcessing.swift natively works with new index system

This commit is contained in:
ensan-hcl
2025-07-14 21:17:33 +09:00
parent 7374b18eae
commit 16363be738
9 changed files with 103 additions and 70 deletions

View File

@ -32,17 +32,17 @@ extension Kana2Kanji {
let inputCount: Int = inputData.input.count
let surfaceCount = inputData.convertTarget.count
let result: LatticeNode = LatticeNode.EOSNode
let i2sMap = inputData.inputIndexToSurfaceIndexMap()
let latticeIndices = Lattice.indices(inputCount: inputCount, surfaceCount: surfaceCount, inputIndexToSurfaceIndexMap: i2sMap)
let rawNodes = latticeIndices.map { (iIndex, sIndex) in
let surfaceRange: (startIndex: Int, endIndexRange: Range<Int>?)? = if let sIndex {
let i2sMap = LatticeDualIndexMap(inputData)
let latticeIndices = Lattice.indices(inputCount: inputCount, surfaceCount: surfaceCount, map: i2sMap)
let rawNodes = latticeIndices.map { index in
let surfaceRange: (startIndex: Int, endIndexRange: Range<Int>?)? = if let sIndex = index.surfaceIndex {
(sIndex, nil)
} else {
nil
}
return dicdataStore.getLOUDSDataInRange(
inputData: inputData,
from: iIndex,
from: index.inputIndex,
surfaceRange: surfaceRange,
needTypoCorrection: needTypoCorrection
)
@ -72,11 +72,7 @@ extension Kana2Kanji {
node.values = node.prevs.map {$0.totalValue + wValue}
}
// index
let nextIndex: (inputIndex: Int?, surfaceIndex: Int?) = switch node.range.endIndex {
case .input(let index): (index, i2sMap[index])
case .surface(let index): (i2sMap.filter { $0.value == index}.first?.key, index)
}
print(nextIndex, node.data.word, node.data.ruby)
let nextIndex = i2sMap.dualIndex(for: node.range.endIndex)
// count
if nextIndex.inputIndex == inputCount && nextIndex.surfaceIndex == surfaceCount {
self.updateResultNode(with: node, resultNode: result)

View File

@ -36,7 +36,9 @@ struct Kana2Kanji {
let lastMid = data.clauses.last!.clause.mid
let composingCount: ComposingCount = data.clauses.reduce(into: .inputCount(0)) {
$0 = .composite($0, $1.clause.range.count)
for range in $1.clause.ranges {
$0 = .composite($0, range.count)
}
}
return Candidate(
text: text,

View File

@ -12,6 +12,54 @@ struct LatticeNodeArray: Sequence {
}
}
/// Translates between the two index spaces used by the lattice: positions in the
/// input sequence and positions in the surface string (the convert target).
struct LatticeDualIndexMap {
    /// Surface index for every input index that has one, as reported by
    /// `ComposingText.inputIndexToSurfaceIndexMap()`.
    private(set) var inputIndexToSurfaceIndexMap: [Int: Int]

    /// Captures the input-to-surface index mapping of `composingText`.
    init(_ composingText: ComposingText) {
        self.inputIndexToSurfaceIndexMap = composingText.inputIndexToSurfaceIndexMap()
    }

    /// A lattice position known in the input index space, the surface index space, or both.
    enum DualIndex: Sendable, Equatable, Hashable {
        /// Position that is only known as an input index.
        case inputIndex(Int)
        /// Position that is only known as a surface index.
        case surfaceIndex(Int)
        /// Position that is known in both index spaces.
        case bothIndex(inputIndex: Int, surfaceIndex: Int)

        /// The input-side index, or `nil` for a surface-only position.
        var inputIndex: Int? {
            switch self {
            case .inputIndex(let index), .bothIndex(let index, _):
                index
            case .surfaceIndex:
                nil
            }
        }

        /// The surface-side index, or `nil` for an input-only position.
        var surfaceIndex: Int? {
            switch self {
            case .inputIndex:
                nil
            case .surfaceIndex(let index), .bothIndex(_, let index):
                index
            }
        }
    }

    /// Resolves a single-space lattice index into a `DualIndex`.
    ///
    /// - Parameter latticeIndex: An index expressed in either the input or the surface space.
    /// - Returns: `.bothIndex` when the map relates the given index to the other space;
    ///   otherwise the one-sided case that matches `latticeIndex`.
    func dualIndex(for latticeIndex: Lattice.LatticeIndex) -> DualIndex {
        switch latticeIndex {
        case .input(let iIndex):
            guard let sIndex = self.inputIndexToSurfaceIndexMap[iIndex] else {
                return .inputIndex(iIndex)
            }
            return .bothIndex(inputIndex: iIndex, surfaceIndex: sIndex)
        case .surface(let sIndex):
            // Reverse lookup: stop at the first matching entry instead of building a
            // filtered copy of the whole dictionary on every call.
            // NOTE(review): if several input indices can map to the same surface index,
            // the entry returned depends on dictionary iteration order — confirm the
            // mapping is one-to-one.
            guard let iIndex = self.inputIndexToSurfaceIndexMap.first(where: { $0.value == sIndex })?.key else {
                return .surfaceIndex(sIndex)
            }
            return .bothIndex(inputIndex: iIndex, surfaceIndex: sIndex)
        }
    }
}
struct Lattice: Sequence {
typealias Element = LatticeNodeArray
@ -44,22 +92,22 @@ struct Lattice: Sequence {
private var inputIndexedNodes: [[LatticeNode]]
private var surfaceIndexedNodes: [[LatticeNode]]
static func indices(inputCount: Int, surfaceCount: Int, inputIndexToSurfaceIndexMap: [Int: Int]) -> [(inputIndex: Int?, surfaceIndex: Int?)] {
var indices: [(inputIndex: Int?, surfaceIndex: Int?)] = []
static func indices(inputCount: Int, surfaceCount: Int, map: LatticeDualIndexMap) -> [LatticeDualIndexMap.DualIndex] {
var indices: [LatticeDualIndexMap.DualIndex] = []
var sIndexPointer = 0
for i in 0 ..< inputCount {
if let sIndex = inputIndexToSurfaceIndexMap[i] {
if let sIndex = map.inputIndexToSurfaceIndexMap[i] {
for j in sIndexPointer ..< sIndex {
indices.append((nil, j))
indices.append(.surfaceIndex(j))
}
indices.append((i, sIndex))
indices.append(.bothIndex(inputIndex: i, surfaceIndex: sIndex))
sIndexPointer = sIndex + 1
} else {
indices.append((i, nil))
indices.append(.inputIndex(i))
}
}
for j in sIndexPointer ..< surfaceCount {
indices.append((nil, j))
indices.append(.surfaceIndex(j))
}
return indices
}
@ -124,7 +172,7 @@ struct Lattice: Sequence {
}
}
subscript(index index: (inputIndex: Int?, surfaceIndex: Int?)) -> LatticeNodeArray {
subscript(index index: LatticeDualIndexMap.DualIndex) -> LatticeNodeArray {
get {
let iNodes: [LatticeNode] = if let iIndex = index.inputIndex { self.inputIndexedNodes[iIndex] } else { [] }
let sNodes: [LatticeNode] = if let sIndex = index.surfaceIndex { self.surfaceIndexedNodes[sIndex] } else { [] }
@ -132,7 +180,7 @@ struct Lattice: Sequence {
}
}
func indexedNodes(indices: [(inputIndex: Int?, surfaceIndex: Int?)]) -> some Sequence<(isHead: Bool, nodes: LatticeNodeArray)> {
func indexedNodes(indices: [LatticeDualIndexMap.DualIndex]) -> some Sequence<(isHead: Bool, nodes: LatticeNodeArray)> {
indices.lazy.map { index in
return (index.inputIndex == 0 && index.surfaceIndex == 0, self[index: index])
}
@ -221,25 +269,6 @@ struct Lattice: Sequence {
}
}
func merged(with other: Self) -> Self? {
return switch (self, other) {
case (let .surface(l, ml), let .surface(mr, r)):
if ml == mr {
.surface(from: l, to: r)
} else {
nil
}
case (let .input(l, ml), let .input(mr, r)):
if ml == mr {
.input(from: l, to: r)
} else {
nil
}
case (.surface, .input), (.input, .surface):
nil
}
}
func offseted(inputOffset: Int, surfaceOffset: Int) -> Self {
switch self {
case .surface(from: let from, to: let to):

View File

@ -22,12 +22,15 @@ extension Kana2Kanji {
/// - note:
///
func getPredictionCandidates(composingText: ComposingText, prepart: CandidateData, lastClause: ClauseDataUnit, N_best: Int) -> [Candidate] {
debug("getPredictionCandidates", composingText, lastClause.range, lastClause.text)
let lastRuby = switch lastClause.range {
case let .input(left, right):
ComposingText.getConvertTarget(for: composingText.input[left..<right]).toKatakana()
case let .surface(left, right):
String(composingText.convertTarget.dropFirst(left).prefix(right - left))
debug("getPredictionCandidates", composingText, lastClause.ranges, lastClause.text)
let lastRuby = lastClause.ranges.reduce(into: "") {
let ruby = switch $1 {
case let .input(left, right):
ComposingText.getConvertTarget(for: composingText.input[left..<right]).toKatakana()
case let .surface(left, right):
String(composingText.convertTarget.dropFirst(left).prefix(right - left))
}
$0.append(ruby)
}
let lastRubyCount = lastRuby.count
let datas: [DicdataElement]

View File

@ -59,7 +59,7 @@ extension RegisteredNodeProtocol {
guard let prev else {
let unit = ClauseDataUnit()
unit.mid = self.data.mid
unit.range = self.range
unit.ranges = [self.range]
return CandidateData(clauses: [(clause: unit, value: .zero)], data: [])
}
var lastcandidate = prev.getCandidateData() // registerd
@ -75,11 +75,7 @@ extension RegisteredNodeProtocol {
if lastClause.text.isEmpty || !DicdataStore.isClause(prev.data.rcid, self.data.lcid) {
//
lastClause.text.append(self.data.word)
if let newRange = lastClause.range.merged(with: self.range) {
lastClause.range = newRange
} else {
fatalError("このケースは想定していません。")
}
lastClause.ranges.append(self.range)
//
if (lastClause.mid == 500 && self.data.mid != 500) || DicdataStore.includeMMValueCalculation(self.data) {
lastClause.mid = self.data.mid
@ -92,7 +88,7 @@ extension RegisteredNodeProtocol {
else {
let unit = ClauseDataUnit()
unit.text = self.data.word
unit.range = self.range
unit.ranges.append(self.range)
if DicdataStore.includeMMValueCalculation(self.data) {
unit.mid = self.data.mid
}

View File

@ -17,32 +17,28 @@ final class ClauseDataUnit {
/// The text of the unit.
var text: String = ""
/// The range of the unit in input text.
var range: Lattice.LatticeRange = .zero
var ranges: [Lattice.LatticeRange] = []
/// Merge the given unit to this unit.
/// - Parameter:
/// - unit: The unit to merge.
func merge(with unit: ClauseDataUnit) {
self.text.append(unit.text)
if let newRange = self.range.merged(with: unit.range) {
self.range = newRange
} else {
fatalError("このケースは想定していません。")
}
self.ranges.append(contentsOf: unit.ranges)
self.nextLcid = unit.nextLcid
}
}
extension ClauseDataUnit: Equatable {
static func == (lhs: ClauseDataUnit, rhs: ClauseDataUnit) -> Bool {
lhs.mid == rhs.mid && lhs.nextLcid == rhs.nextLcid && lhs.text == rhs.text && lhs.range == rhs.range
lhs.mid == rhs.mid && lhs.nextLcid == rhs.nextLcid && lhs.text == rhs.text && lhs.ranges == rhs.ranges
}
}
#if DEBUG
extension ClauseDataUnit: CustomDebugStringConvertible {
var debugDescription: String {
"ClauseDataUnit(mid: \(mid), nextLcid: \(nextLcid), text: \(text), range: \(range))"
"ClauseDataUnit(mid: \(mid), nextLcid: \(nextLcid), text: \(text), ranges: \(ranges))"
}
}
#endif
@ -78,7 +74,18 @@ public enum ComposingCount: Equatable, Sendable {
case surfaceCount(Int)
///
indirect case composite(Self, Self)
indirect case composite(lhs: Self, rhs: Self)
/// Combines two counts into one.
///
/// Two `.inputCount` values or two `.surfaceCount` values are collapsed into a single
/// case holding their sum; every other pairing is kept as a `.composite(lhs:rhs:)` node.
static func composite(_ lhs: Self, _ rhs: Self) -> Self {
    // Same-kind input counts fold into one input count.
    if case .inputCount(let left) = lhs, case .inputCount(let right) = rhs {
        return .inputCount(left + right)
    }
    // Same-kind surface counts fold into one surface count.
    if case .surfaceCount(let left) = lhs, case .surfaceCount(let right) = rhs {
        return .surfaceCount(left + right)
    }
    // Mixed or already-composite operands cannot be summed; keep both sides.
    return .composite(lhs: lhs, rhs: rhs)
}
}
///

View File

@ -472,7 +472,7 @@ import EfficientNGram
return Candidate(
text: first.clause.text,
value: first.value,
composingCount: first.clause.range.count,
composingCount: first.clause.ranges.reduce(into: .inputCount(0)) { $0 = .composite($0, $1.count) },
lastMid: first.clause.mid,
data: Array(candidateData.data[0...count])
)

View File

@ -14,19 +14,19 @@ final class ClauseDataUnitTests: XCTestCase {
do {
let unit1 = ClauseDataUnit()
unit1.text = "僕が"
unit1.range = .input(from: 0, to: 3)
unit1.ranges = [.input(from: 0, to: 3)]
unit1.mid = 0
unit1.nextLcid = 0
let unit2 = ClauseDataUnit()
unit2.text = "走る"
unit2.range = .input(from: 3, to: 6)
unit2.ranges = [.input(from: 3, to: 6)]
unit2.mid = 1
unit2.nextLcid = 1
unit1.merge(with: unit2)
XCTAssertEqual(unit1.text, "僕が走る")
XCTAssertEqual(unit1.range, .input(from: 0, to: 6))
XCTAssertEqual(unit1.ranges, [.input(from: 0, to: 3), .input(from: 3, to: 6)])
XCTAssertEqual(unit1.nextLcid, 1)
XCTAssertEqual(unit1.mid, 0)
}
@ -34,19 +34,19 @@ final class ClauseDataUnitTests: XCTestCase {
do {
let unit1 = ClauseDataUnit()
unit1.text = "君は"
unit1.range = .input(from: 0, to: 3)
unit1.ranges = [.input(from: 0, to: 3)]
unit1.mid = 0
unit1.nextLcid = 0
let unit2 = ClauseDataUnit()
unit2.text = "笑った"
unit2.range = .input(from: 3, to: 7)
unit2.ranges = [.input(from: 3, to: 7)]
unit2.mid = 3
unit2.nextLcid = 3
unit1.merge(with: unit2)
XCTAssertEqual(unit1.text, "君は笑った")
XCTAssertEqual(unit1.range, .input(from: 0, to: 7))
XCTAssertEqual(unit1.ranges, [.input(from: 0, to: 3), .input(from: 3, to: 7)])
XCTAssertEqual(unit1.nextLcid, 3)
XCTAssertEqual(unit1.mid, 0)
}

View File

@ -58,13 +58,13 @@ final class RegisteredNodeTests: XCTestCase {
let clause1 = ClauseDataUnit()
clause1.text = "我輩は"
clause1.nextLcid = CIDData..cid
clause1.range = .input(from: 0, to: 5)
clause1.ranges = [.input(from: 0, to: 5)]
clause1.mid = 1
let clause2 = ClauseDataUnit()
clause2.text = "猫です"
clause2.nextLcid = CIDData.EOS.cid
clause2.range = .input(from: 5, to: 9)
clause2.ranges = [.input(from: 5, to: 9)]
clause2.mid = 3
let expectedResult: CandidateData = CandidateData(