wip: the test is not passing yet, but commit/push it anyway

so that work can continue in another environment
This commit is contained in:
Miwa / Ensan
2025-07-14 00:42:53 +09:00
parent ee17b238a2
commit 02fcdd4dc1
6 changed files with 310 additions and 87 deletions

View File

@ -31,10 +31,35 @@ extension Kana2Kanji {
let inputCount: Int = inputData.input.count
let surfaceCount = inputData.convertTarget.count
let result: LatticeNode = LatticeNode.EOSNode
let i2sMap = inputData.inputIndexToSurfaceIndexMap()
var rawNodes = (.zero ..< inputCount).map {
let surfaceRange: (startIndex: Int, endIndexRange: Range<Int>?)? = if let sIndex = i2sMap[$0] {
(sIndex, nil)
} else {
nil
}
return dicdataStore.getLOUDSDataInRange(
inputData: inputData,
from: $0,
surfaceRange: surfaceRange,
needTypoCorrection: needTypoCorrection
)
}
for sIndex in 0 ..< inputData.convertTarget.count where !i2sMap.values.contains(sIndex) {
// For every sIndex that no inputIndex maps to, add rawNodes looked up by surface position only
rawNodes.append(
dicdataStore.getLOUDSDataInRange(
inputData: inputData,
from: nil,
surfaceRange: (sIndex, nil),
needTypoCorrection: needTypoCorrection
)
)
}
let lattice: Lattice = Lattice(
inputCount: inputCount,
surfaceCount: surfaceCount,
rawNodes: (.zero ..< inputCount).map {dicdataStore.getLOUDSDataInRange(inputData: inputData, from: $0, needTypoCorrection: needTypoCorrection)}
rawNodes: rawNodes
)
// inodes
for (i, nodeArray) in lattice.indexedNodes() {
@ -55,8 +80,12 @@ extension Kana2Kanji {
// values
node.values = node.prevs.map {$0.totalValue + wValue}
}
//
let nextIndex = node.range.endIndex
// Convert the end index: an input index that maps to a surface index is replaced by that surface index
let nextIndex: Lattice.LatticeIndex = switch node.range.endIndex {
case .input(let index): if let sIndex = i2sMap[index] { .surface(sIndex) } else { node.range.endIndex }
case .surface: node.range.endIndex
}
print(nextIndex, node.data.word, node.data.ruby, lattice[index: nextIndex].count)
// count
if nextIndex == .input(inputCount) || nextIndex == .surface(surfaceCount) {
self.updateResultNode(with: node, resultNode: result)

View File

@ -1,8 +1,19 @@
import Algorithms
import SwiftUtils
/// The nodes stored at one lattice position, kept in two separate buckets:
/// those indexed by input position and those indexed by surface position.
///
/// Iterating the value visits every input-indexed node first, followed by
/// every surface-indexed node.
struct LatticeNodeArray: Sequence {
    typealias Element = LatticeNode

    /// Nodes whose position is expressed as an input index.
    var inputIndexedNodes: [LatticeNode]
    /// Nodes whose position is expressed as a surface index.
    var surfaceIndexedNodes: [LatticeNode]

    /// Returns an iterator over the input-indexed nodes followed by the surface-indexed nodes.
    func makeIterator() -> Chain2Sequence<[LatticeNode], [LatticeNode]>.Iterator {
        let inputThenSurface = chain(inputIndexedNodes, surfaceIndexedNodes)
        return inputThenSurface.makeIterator()
    }
}
struct Lattice: Sequence {
typealias Element = [LatticeNode]
typealias Element = LatticeNodeArray
init() {
self.inputIndexedNodes = []
@ -15,11 +26,12 @@ struct Lattice: Sequence {
for nodes in rawNodes {
guard let first = nodes.first else { continue }
print(nodes.mapSet { $0.range.startIndex }, nodes.count)
switch first.range.startIndex {
case .surface(let i):
self.surfaceIndexedNodes[i] = nodes
self.surfaceIndexedNodes[i].append(contentsOf: nodes)
case .input(let i):
self.inputIndexedNodes[i] = nodes
self.inputIndexedNodes[i].append(contentsOf: nodes)
}
}
}
@ -97,15 +109,42 @@ struct Lattice: Sequence {
.chained(self.surfaceIndexedNodes.enumerated().lazy.map { (.surface($0.offset), $0.element) })
}
func makeIterator() -> Chain2Sequence<[[LatticeNode]], [[LatticeNode]]>.Iterator {
self.inputIndexedNodes.chained(self.surfaceIndexedNodes).makeIterator()
/// Walks a `Lattice` one position at a time.
///
/// All surface-indexed positions are produced first, then all input-indexed
/// positions. Each element carries the nodes of exactly one position; the
/// bucket for the other index kind is left empty.
struct Iterator: IteratorProtocol {
    typealias Element = LatticeNodeArray

    /// The lattice being traversed (captured by value at creation time).
    let lattice: Lattice
    /// Cursor and end position for the surface pass, then for the input pass.
    var indices: (currentSurfaceIndex: Int, surfaceEndIndex: Int, currentInputIndex: Int, inputEndIndex: Int)

    init(lattice: Lattice) {
        self.lattice = lattice
        self.indices = (
            currentSurfaceIndex: 0,
            surfaceEndIndex: lattice.surfaceIndexedNodes.endIndex,
            currentInputIndex: 0,
            inputEndIndex: lattice.inputIndexedNodes.endIndex
        )
    }

    /// Returns the nodes at the next position, or `nil` once both passes are finished.
    mutating func next() -> LatticeNodeArray? {
        // Surface pass: runs to completion before any input position is visited.
        if indices.currentSurfaceIndex < indices.surfaceEndIndex {
            let position = indices.currentSurfaceIndex
            indices.currentSurfaceIndex = position + 1
            return LatticeNodeArray(
                inputIndexedNodes: [],
                surfaceIndexedNodes: lattice.surfaceIndexedNodes[position]
            )
        }
        // Input pass.
        if indices.currentInputIndex < indices.inputEndIndex {
            let position = indices.currentInputIndex
            indices.currentInputIndex = position + 1
            return LatticeNodeArray(
                inputIndexedNodes: lattice.inputIndexedNodes[position],
                surfaceIndexedNodes: []
            )
        }
        return nil
    }
}
func makeIterator() -> Iterator {
Iterator(lattice: self)
}
var isEmpty: Bool {
self.inputIndexedNodes.isEmpty && self.surfaceIndexedNodes.isEmpty
}
enum LatticeIndex: Sendable, Equatable {
enum LatticeIndex: Sendable, Equatable, Hashable {
case surface(Int)
case input(Int)
@ -114,7 +153,7 @@ struct Lattice: Sequence {
}
}
enum LatticeRange: Sendable, Equatable {
enum LatticeRange: Sendable, Equatable, Hashable {
static var zero: Self {
.input(from: 0, to: 0)
}
@ -149,7 +188,7 @@ struct Lattice: Sequence {
}
func merged(with other: Self) -> Self? {
switch (self, other) {
return switch (self, other) {
case (let .surface(l, ml), let .surface(mr, r)):
if ml == mr {
.surface(from: l, to: r)

View File

@ -40,6 +40,6 @@ public final class LatticeNode {
/// - Returns:
/// - Note: `EOS`API
func getCandidateData() -> [CandidateData] {
self.prevs.map {$0.getCandidateData()}
return self.prevs.map {$0.getCandidateData()}
}
}

View File

@ -242,20 +242,93 @@ public final class DicdataStore {
return [louds.searchNodeIndex(chars: charIDs)].compactMap {$0}
}
/// Combines up to two sources of lookup keys behind a single `next()`:
/// a `SurfaceGenerator` over the surface characters and a
/// `TypoCorrectionGenerator` over the input elements.
///
/// Either source may be absent. On every call the surface source is asked
/// first; the typo-correction source is consulted only when the surface
/// source yields nothing.
private struct UnifiedGenerator {
    /// Enumerates the substrings of `surface` that start at `range.leftIndex`
    /// and end at each index in `range.rightIndexRange`, shortest first.
    struct SurfaceGenerator {
        var surface: [Character] = []
        var range: TypoCorrectionGenerator.ProcessRange
        /// Index of the last character of the substring produced by the next `next()` call.
        var currentIndex: Int

        init(surface: [Character], range: TypoCorrectionGenerator.ProcessRange) {
            self.surface = surface
            self.range = range
            self.currentIndex = range.rightIndexRange.lowerBound
        }

        /// Narrows the range of end indices when `target` is a prefix of the
        /// surface starting at `range.leftIndex`.
        ///
        /// Both bounds of `range.rightIndexRange` are capped at
        /// `range.leftIndex + target.indices.upperBound`. Nothing changes when
        /// `target` is not such a prefix.
        mutating func setUnreachablePath<C: Collection<Character>>(target: C) where C.Indices == Range<Int> {
            guard self.surface[self.range.leftIndex...].hasPrefix(target) else {
                return
            }
            let existingRange = self.range.rightIndexRange
            let cap = self.range.leftIndex + target.indices.upperBound
            let cappedLower = min(existingRange.lowerBound, cap)
            let cappedUpper = min(existingRange.upperBound, cap)
            self.range.rightIndexRange = cappedLower ..< cappedUpper
        }

        /// Returns the next substring with its surface end index and a penalty of 0,
        /// or `nil` when `currentIndex` is outside `surface` or has reached the
        /// upper bound of `range.rightIndexRange`.
        mutating func next() -> ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? {
            let endPosition = self.currentIndex
            guard self.surface.indices.contains(endPosition),
                  endPosition < self.range.rightIndexRange.upperBound else {
                return nil
            }
            // Advance the cursor before handing back the substring ending at `endPosition`.
            self.currentIndex = endPosition + 1
            let characters = Array(self.surface[self.range.leftIndex ... endPosition])
            return (characters, (.surface(endPosition), 0))
        }
    }

    var typoCorrectionGenerator: TypoCorrectionGenerator? = nil
    var surfaceGenerator: SurfaceGenerator? = nil

    /// Installs the input-based generator, replacing any previous one.
    mutating func register(_ generator: TypoCorrectionGenerator) {
        self.typoCorrectionGenerator = generator
    }

    /// Installs the surface-based generator, replacing any previous one.
    mutating func register(_ generator: SurfaceGenerator) {
        self.surfaceGenerator = generator
    }

    /// Forwards `target` to both registered generators so each can prune its own search.
    mutating func setUnreachablePath<C: Collection<Character>>(target: C) where C.Indices == Range<Int> {
        self.typoCorrectionGenerator?.setUnreachablePath(target: target)
        self.surfaceGenerator?.setUnreachablePath(target: target)
    }

    /// Returns the next key from the surface generator if it has one, otherwise
    /// the next key from the typo-correction generator; `nil` when neither yields a value.
    mutating func next() -> ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? {
        if let fromSurface = self.surfaceGenerator?.next() {
            return fromSurface
        }
        if let fromInput = self.typoCorrectionGenerator?.next() {
            return fromInput
        }
        return nil
    }
}
func movingTowardPrefixSearch(
inputs: [ComposingText.InputElement],
leftIndex: Int,
rightIndexRange: Range<Int>,
composingText: ComposingText,
inputProcessRange: TypoCorrectionGenerator.ProcessRange?,
surfaceProcessRange: TypoCorrectionGenerator.ProcessRange?,
useMemory: Bool,
needTypoCorrection: Bool
) -> (
stringToInfo: [[Character]: (endIndex: Int, penalty: PValue)],
stringToInfo: [[Character]: (endIndex: Lattice.LatticeIndex, penalty: PValue)],
indices: [(key: String, indices: [Int])],
temporaryMemoryDicdata: [DicdataElement]
) {
var generator = TypoCorrectionGenerator(inputs: inputs, leftIndex: leftIndex, rightIndexRange: rightIndexRange, needTypoCorrection: needTypoCorrection)
var generator = UnifiedGenerator()
if let surfaceProcessRange {
let surfaceGenerator = UnifiedGenerator.SurfaceGenerator(
surface: Array(composingText.convertTarget.toKatakana()),
range: surfaceProcessRange
)
generator.register(surfaceGenerator)
}
if let inputProcessRange {
let typoCorrectionGenerator = TypoCorrectionGenerator(
inputs: composingText.input,
range: inputProcessRange,
needTypoCorrection: needTypoCorrection
)
generator.register(typoCorrectionGenerator)
}
var targetLOUDS: [String: LOUDS.MovingTowardPrefixSearchHelper] = [:]
var stringToInfo: [([Character], (endIndex: Int, penalty: PValue))] = []
var stringToInfo: [([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))] = []
//
var dynamicDicdata: [Int: [DicdataElement]] = [:]
//
@ -332,8 +405,25 @@ public final class DicdataStore {
}
let minCount = stringToInfo.map {$0.0.count}.min() ?? 0
return (
Dictionary(stringToInfo, uniquingKeysWith: {$0.penalty < $1.penalty ? $1 : $0}),
targetLOUDS.map { ($0.key, $0.value.indicesInDepth(depth: minCount - 1 ..< .max) )},
Dictionary(
stringToInfo,
uniquingKeysWith: { (lhs, rhs) in
if lhs.penalty < rhs.penalty {
return lhs
} else if lhs.penalty == rhs.penalty {
return switch (lhs.endIndex, rhs.endIndex) {
case (.input, .input), (.surface, .surface): lhs //
case (.surface, .input): lhs // surfaceIndex
case (.input, .surface): rhs // surfaceIndex
}
} else {
return rhs
}
}
),
targetLOUDS.map {
($0.key, $0.value.indicesInDepth(depth: minCount - 1 ..< .max))
},
dynamicDicdata.flatMap {
minCount < $0.key + 1 ? $0.value : []
}
@ -381,24 +471,64 @@ public final class DicdataStore {
/// - inputData:
/// - from:
/// - toIndexRange: `from ..< (toIndexRange)`
public func getLOUDSDataInRange(inputData: ComposingText, from fromIndex: Int, toIndexRange: Range<Int>? = nil, needTypoCorrection: Bool = true) -> [LatticeNode] {
let toIndexLeft = toIndexRange?.startIndex ?? fromIndex
let toIndexRight = min(toIndexRange?.endIndex ?? inputData.input.count, fromIndex + self.maxlength)
if fromIndex > toIndexLeft || toIndexLeft >= toIndexRight {
debug(#function, "index is wrong")
return []
public func getLOUDSDataInRange(
inputData: ComposingText,
from fromInputIndex: Int?,
toIndexRange: Range<Int>? = nil,
surfaceRange: (startIndex: Int, endIndexRange: Range<Int>?)? = nil,
needTypoCorrection: Bool = true
) -> [LatticeNode] {
let inputProcessRange: TypoCorrectionGenerator.ProcessRange?
// TODO: make `fromInputIndex` optional later.
if let fromInputIndex {
let toInputIndexLeft = toIndexRange?.startIndex ?? fromInputIndex
let toInputIndexRight = min(
toIndexRange?.endIndex ?? inputData.input.count,
fromInputIndex + self.maxlength
)
if fromInputIndex > toInputIndexLeft || toInputIndexLeft >= toInputIndexRight {
debug(#function, "index is wrong")
return []
}
inputProcessRange = .init(leftIndex: fromInputIndex, rightIndexRange: toInputIndexLeft ..< toInputIndexRight)
} else {
inputProcessRange = nil
}
let segments = (fromIndex ..< toIndexRight).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
let surfaceProcessRange: TypoCorrectionGenerator.ProcessRange?
if let surfaceRange {
let toSurfaceIndexLeft = surfaceRange.endIndexRange?.startIndex ?? surfaceRange.startIndex
let toSurfaceIndexRight = min(
surfaceRange.endIndexRange?.endIndex ?? inputData.convertTarget.count,
surfaceRange.startIndex + self.maxlength
)
if surfaceRange.startIndex > toSurfaceIndexLeft || toSurfaceIndexLeft >= toSurfaceIndexRight {
debug(#function, "index is wrong")
return []
}
surfaceProcessRange = .init(leftIndex: surfaceRange.startIndex, rightIndexRange: toSurfaceIndexLeft ..< toSurfaceIndexRight)
} else {
surfaceProcessRange = nil
}
if inputProcessRange == nil && surfaceProcessRange == nil {
debug(#function, "either of inputProcessRange and surfaceProcessRange must not be nil")
return []
}
// MARK:
var (stringToInfo, indices, dicdata) = self.movingTowardPrefixSearch(inputs: inputData.input, leftIndex: fromIndex, rightIndexRange: toIndexLeft ..< toIndexRight, useMemory: self.learningManager.enabled, needTypoCorrection: needTypoCorrection)
var (stringToInfo, indices, dicdata) = self.movingTowardPrefixSearch(
composingText: inputData,
inputProcessRange: inputProcessRange,
surfaceProcessRange: surfaceProcessRange,
useMemory: self.learningManager.enabled,
needTypoCorrection: needTypoCorrection
)
print(stringToInfo)
// MARK: indices
for (identifier, value) in indices {
let result: [DicdataElement] = self.getDicdataFromLoudstxt3(identifier: identifier, indices: value).compactMap { (data) -> DicdataElement? in
let rubyArray = Array(data.ruby)
let penalty = stringToInfo[rubyArray, default: (0, .zero)].penalty
let penalty = stringToInfo[rubyArray]?.penalty ?? 0
if penalty.isZero {
return data
}
@ -413,34 +543,40 @@ public final class DicdataStore {
dicdata.append(contentsOf: result)
}
for i in toIndexLeft ..< toIndexRight {
do {
let result = self.getWiseDicdata(convertTarget: segments[i - fromIndex], inputData: inputData, inputRange: fromIndex ..< i + 1)
for item in result {
stringToInfo[Array(item.ruby)] = (i, 0)
if let inputProcessRange {
let segments = (inputProcessRange.leftIndex ..< inputProcessRange.rightIndexRange.endIndex).reduce(into: []) { (segments: inout [String], rightIndex: Int) in
segments.append((segments.last ?? "") + String(inputData.input[rightIndex].character.toKatakana()))
}
for i in inputProcessRange.rightIndexRange {
do {
let result = self.getWiseDicdata(
convertTarget: segments[i - inputProcessRange.leftIndex],
inputData: inputData,
inputRange: inputProcessRange.leftIndex ..< i + 1
)
for item in result {
stringToInfo[Array(item.ruby)] = (.input(i), 0)
}
dicdata.append(contentsOf: result)
}
dicdata.append(contentsOf: result)
}
}
if fromIndex == .zero {
let result: [LatticeNode] = dicdata.compactMap {
guard let endIndex = stringToInfo[Array($0.ruby)]?.endIndex else {
return nil
}
let node = LatticeNode(data: $0, range: .input(from: fromIndex, to: endIndex + 1))
let needBOS = fromInputIndex == .zero
let result: [LatticeNode] = dicdata.compactMap {
guard let endIndex = stringToInfo[Array($0.ruby)]?.endIndex else {
return nil
}
let range: Lattice.LatticeRange = switch endIndex {
case .input(let endIndex): .input(from: fromInputIndex!, to: endIndex + 1)
case .surface(let endIndex): .surface(from: (surfaceRange?.startIndex)!, to: endIndex + 1)
}
let node = LatticeNode(data: $0, range: range)
if needBOS {
node.prevs.append(RegisteredNode.BOSNode())
return node
}
return result
} else {
let result: [LatticeNode] = dicdata.compactMap {
guard let endIndex = stringToInfo[Array($0.ruby)]?.endIndex else {
return nil
}
return LatticeNode(data: $0, range: .input(from: fromIndex, to: endIndex + 1))
}
return result
return node
}
return result
}
func getZeroHintPredictionDicdata(lastRcid: Int) -> [DicdataElement] {

View File

@ -1,13 +1,12 @@
import SwiftUtils
struct TypoCorrectionGenerator: Sendable {
init(inputs: [ComposingText.InputElement], leftIndex left: Int, rightIndexRange: Range<Int>, needTypoCorrection: Bool) {
init(inputs: [ComposingText.InputElement], range: ProcessRange, needTypoCorrection: Bool) {
self.maxPenalty = needTypoCorrection ? 3.5 * 3 : 0
self.inputs = inputs
self.left = left
self.rightIndexRange = rightIndexRange
self.range = range
let count = rightIndexRange.endIndex - left
let count = self.range.rightIndexRange.endIndex - range.leftIndex
self.count = count
self.nodes = (0..<count).map {(i: Int) in
Self.lengths.flatMap {(k: Int) -> [TypoCandidate] in
@ -15,7 +14,7 @@ struct TypoCorrectionGenerator: Sendable {
if count <= j {
return []
}
return Self.getTypo(inputs[left + i ... left + j], frozen: !needTypoCorrection)
return Self.getTypo(inputs[range.leftIndex + i ... range.leftIndex + j], frozen: !needTypoCorrection)
}
}
//
@ -23,7 +22,7 @@ struct TypoCorrectionGenerator: Sendable {
guard let firstElement = typoCandidate.inputElements.first else {
return nil
}
if ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: left) {
if ComposingText.isLeftSideValid(first: firstElement, of: inputs, from: range.leftIndex) {
var convertTargetElements = [ComposingText.ConvertTargetElement]()
for element in typoCandidate.inputElements {
ComposingText.updateConvertTargetElements(currentElements: &convertTargetElements, newElement: element)
@ -36,11 +35,15 @@ struct TypoCorrectionGenerator: Sendable {
let maxPenalty: PValue
let inputs: [ComposingText.InputElement]
let left: Int
let rightIndexRange: Range<Int>
let range: ProcessRange
let nodes: [[TypoCandidate]]
let count: Int
/// The span a generator works on: a fixed start position plus the set of
/// positions at which a produced candidate is allowed to end.
struct ProcessRange: Sendable, Equatable {
/// Index of the first element every candidate starts from.
var leftIndex: Int
/// Half-open range of indices at which a candidate's last element may sit.
var rightIndexRange: Range<Int>
}
var stack: [(convertTargetElements: [ComposingText.ConvertTargetElement], lastElement: ComposingText.InputElement, count: Int, penalty: PValue)]
/// `target`
@ -75,12 +78,12 @@ struct TypoCorrectionGenerator: Sendable {
}
}
mutating func next() -> ([Character], (endIndex: Int, penalty: PValue))? {
mutating func next() -> ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? {
while let (convertTargetElements, lastElement, count, penalty) = self.stack.popLast() {
var result: ([Character], (endIndex: Int, penalty: PValue))? = nil
if rightIndexRange.contains(count + left - 1) {
if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: count + left, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
result = (convertTarget, (count + left - 1, penalty))
var result: ([Character], (endIndex: Lattice.LatticeIndex, penalty: PValue))? = nil
if self.range.rightIndexRange.contains(count + self.range.leftIndex - 1) {
if let convertTarget = ComposingText.getConvertTargetIfRightSideIsValid(lastElement: lastElement, of: inputs, to: count + self.range.leftIndex, convertTargetElements: convertTargetElements)?.map({$0.toKatakana()}) {
result = (convertTarget, (.input(count + self.range.leftIndex - 1), penalty))
}
}
//
@ -94,7 +97,7 @@ struct TypoCorrectionGenerator: Sendable {
// (3)
if penalty >= maxPenalty {
var convertTargetElements = convertTargetElements
let correct = [inputs[left + count]].map {ComposingText.InputElement(character: $0.character.toKatakana(), inputStyle: $0.inputStyle)}
let correct = [inputs[self.range.leftIndex + count]].map {ComposingText.InputElement(character: $0.character.toKatakana(), inputStyle: $0.inputStyle)}
if count + correct.count > self.nodes.endIndex {
if let result {
return result

View File

@ -218,28 +218,44 @@ final class ComposingTextTests: XCTestCase {
}
func testIndexMap() throws {
var c = ComposingText()
sequentialInput(&c, sequence: "kyouhaiitenkida", inputStyle: .roman2kana)
let map = c.inputIndexToSurfaceIndexMap()
do {
var c = ComposingText()
sequentialInput(&c, sequence: "kyouhaiitenkida", inputStyle: .roman2kana)
let map = c.inputIndexToSurfaceIndexMap()
// Note: n""
XCTAssertEqual(c.input[10], .init(character: "", inputStyle: .direct))
// Note: n""
XCTAssertEqual(c.input[10], .init(character: "", inputStyle: .direct))
XCTAssertEqual(map[0], 0) // ""
XCTAssertEqual(map[1], nil) // k
XCTAssertEqual(map[2], nil) // y
XCTAssertEqual(map[3], 2) // o
XCTAssertEqual(map[4], 3) // u
XCTAssertEqual(map[5], nil) // h
XCTAssertEqual(map[6], 4) // a
XCTAssertEqual(map[7], 5) // i
XCTAssertEqual(map[8], 6) // i
XCTAssertEqual(map[9], nil) // t
XCTAssertEqual(map[10], 7) // e
XCTAssertEqual(map[11], 8) // n // nnil
XCTAssertEqual(map[12], nil) // k
XCTAssertEqual(map[13], 9) // i
XCTAssertEqual(map[14], nil) // d
XCTAssertEqual(map[15], 10) // a
XCTAssertEqual(map[0], 0) // ""
XCTAssertEqual(map[1], nil) // k
XCTAssertEqual(map[2], nil) // y
XCTAssertEqual(map[3], 2) // o
XCTAssertEqual(map[4], 3) // u
XCTAssertEqual(map[5], nil) // h
XCTAssertEqual(map[6], 4) // a
XCTAssertEqual(map[7], 5) // i
XCTAssertEqual(map[8], 6) // i
XCTAssertEqual(map[9], nil) // t
XCTAssertEqual(map[10], 7) // e
XCTAssertEqual(map[11], 8) // n // nnil
XCTAssertEqual(map[12], nil) // k
XCTAssertEqual(map[13], 9) // i
XCTAssertEqual(map[14], nil) // d
XCTAssertEqual(map[15], 10) // a
}
do {
var c = ComposingText()
sequentialInput(&c, sequence: "sakujoshori", inputStyle: .roman2kana)
let map = c.inputIndexToSurfaceIndexMap()
let reversedMap = (0 ..< c.convertTarget.count + 1).compactMap {
if map.values.contains($0) {
String(c.convertTarget.prefix($0))
} else {
nil
}
}
XCTAssertFalse(reversedMap.contains("さくじ"))
XCTAssertFalse(reversedMap.contains("さくじょし"))
}
}
}