[Fix] 学習データの更新ロジックを修正 (#86)

* base value should be updated here

* fix logic

* chore: 処理順の変更

* fix: だいぶ怪しい処理を修正

* debug: add debug print

* debug: add debug print

* debug: add debug print

* debug: add debug print

* debug: add debug print

* debug: add debug print

* debug: add debug print

* fix: use memory stride, not size

* implement: migration from wrong impl

* implement: migration from wrong impl

* cleanup: remove debug codes
This commit is contained in:
Miwa / Ensan
2024-04-30 23:06:36 +09:00
committed by GitHub
parent 4f2750ef3e
commit 73ed9c02d4
2 changed files with 38 additions and 32 deletions

View File

@@ -111,11 +111,13 @@ struct LongTermLearningMemory {
func makeBinary() -> Data {
var data = Data()
var metadata: [MetadataElement] = self.metadata.map { MetadataElement(day: $0.lastUsedDay, count: $0.count) }
// 1byte
var count = UInt8(self.metadata.count)
var count = UInt8(metadata.count)
data.append(contentsOf: Data(bytes: &count, count: MemoryLayout<UInt8>.size))
var metadata = self.metadata.map {MetadataElement(day: $0.lastUsedDay, count: $0.count)}
data.append(contentsOf: Data(bytes: &metadata, count: MemoryLayout<MetadataElement>.size * metadata.count))
for i in metadata.indices {
data.append(contentsOf: Data(bytes: &metadata[i], count: MemoryLayout<MetadataElement>.size))
}
return data
}
}
@@ -135,7 +137,7 @@ struct LongTermLearningMemory {
if self.ruby.isEmpty {
self.ruby = element.ruby
}
self.data.append((element.word, element.lcid, element.rcid, element.mid, element.baseValue))
self.data.append((element.word, element.lcid, element.rcid, element.mid, element.value()))
}
}
@@ -195,7 +197,7 @@ struct LongTermLearningMemory {
}
///
static func merge(tempTrie: TemporalLearningMemoryTrie, forgetTargets: [DicdataElement] = [], directoryURL: URL, maxMemoryCount: Int, char2UInt8: [Character: UInt8]) throws {
static func merge(tempTrie: consuming TemporalLearningMemoryTrie, forgetTargets: [DicdataElement] = [], directoryURL: URL, maxMemoryCount: Int, char2UInt8: [Character: UInt8]) throws {
// MARK: `.pause``merge``.2``merge`
if fileExist(pauseFileURL(directoryURL: directoryURL)) {
debug("LongTermLearningMemory merge collapsion detected, trying recovery...")
@@ -212,13 +214,13 @@ struct LongTermLearningMemory {
// MARK:
let startTime = Date()
let today = LearningManager.today
var newTrie = tempTrie
var newTrie = consume tempTrie
// Metadata file layout:
// dataCount(UInt32), count, data*count, count, data*count, ...
// MARK: `metadataFile`
let ltMetadata = (try? Data(contentsOf: metadataFileURL(asTemporaryFile: false, directoryURL: directoryURL))) ?? Data([.zero, .zero, .zero, .zero])
// The first 4 bytes hold the entry count
var metadataOffset = 0
// The first 4 bytes hold the entry count
let entryCount = ltMetadata[metadataOffset ..< metadataOffset + 4].toArray(of: UInt32.self)[0]
metadataOffset += 4
@@ -226,7 +228,6 @@ struct LongTermLearningMemory {
// loudstxt3
for loudstxtIndex in 0 ..< Int(entryCount) / txtFileSplit + 1 {
// loudstxt3
let loudstxtData: Data
do {
loudstxtData = try Data(contentsOf: loudsTxt3FileURL("\(loudstxtIndex)", asTemporaryFile: false, directoryURL: directoryURL))
@@ -234,6 +235,7 @@ struct LongTermLearningMemory {
debug("LongTermLearningMemory merge failed to read \(loudstxtIndex)", error)
continue
}
// loudstxt3
let count = Int(loudstxtData[0 ..< 2].toArray(of: UInt16.self)[0])
let indices = loudstxtData[2 ..< 2 + 4 * count].toArray(of: UInt32.self)
for i in 0 ..< count {
@@ -241,50 +243,59 @@ struct LongTermLearningMemory {
// 1byte
let itemCount = Int(ltMetadata[metadataOffset ..< metadataOffset + 1].toArray(of: UInt8.self)[0])
metadataOffset += 1
let metadata = ltMetadata[metadataOffset ..< metadataOffset + itemCount * 5].toArray(of: MetadataElement.self)
metadataOffset += itemCount * 5
let metadata = (0 ..< itemCount).map {
let range = metadataOffset + $0 * MemoryLayout<MetadataElement>.size ..< metadataOffset + ($0 + 1) * MemoryLayout<MetadataElement>.size
return ltMetadata[range].toArray(of: MetadataElement.self)[0]
}
metadataOffset += itemCount * MemoryLayout<MetadataElement>.size
// index
let startIndex = Int(indices[i])
let endIndex = i == (indices.endIndex - 1) ? loudstxtData.endIndex : Int(indices[i + 1])
let elements = LOUDS.parseBinary(binary: loudstxtData[startIndex ..< endIndex])
// trie
guard let ruby = elements.first?.ruby else {
guard let ruby = elements.first?.ruby,
let chars = LearningManager.keyToChars(ruby, char2UInt8: char2UInt8) else {
continue
}
var newDicdata: [DicdataElement] = []
var newMetadata: [MetadataElement] = []
assert(elements.count == metadata.count, "elements count and metadata count must be equal.")
for (dicdataElement, metadataElement) in zip(elements, metadata) {
// Skip entries that are forget targets
if forgetTargets.contains(dicdataElement) {
debug("LongTermLearningMemory merge stopped because it is a forget target", dicdataElement)
continue
}
if ruby != dicdataElement.ruby {
debug("LongTermLearningMemory merge stopped because dicdataElement has different ruby", dicdataElement, ruby)
continue
}
var metadataElement = metadataElement
if today < metadataElement.lastUpdatedDay || today < metadataElement.lastUsedDay {
// A recorded day is later than today, so reset the metadata to (today, count 1)
metadataElement = MetadataElement(day: today, count: 1)
}
guard today - metadataElement.lastUsedDay < 128 else {
// Drop entries that have not been used for 128 days or more
debug("LongTermLearningMemory merge stopped because metadata is strange", dicdataElement, metadataElement, today)
continue
}
var dicdataElement = dicdataElement
var metadataElement = metadataElement
guard today >= metadataElement.lastUpdatedDay else {
//
//
continue
}
// Halve the count once for every 32 days elapsed since the last update
while today - metadataElement.lastUpdatedDay > 32 {
metadataElement.count >>= 1
metadataElement.lastUpdatedDay += 32
}
// Not used for 128 days or more
if metadataElement.count == 0 || today - metadataElement.lastUsedDay >= 128 {
//
guard metadataElement.count > 0 else {
debug("LongTermLearningMemory merge stopped because count is zero", dicdataElement, metadataElement)
continue
}
dicdataElement.baseValue = valueForData(metadata: metadataElement, dicdata: dicdataElement)
newDicdata.append(dicdataElement)
newMetadata.append(metadataElement)
}
guard let chars = LearningManager.keyToChars(ruby, char2UInt8: char2UInt8) else {
continue
}
newTrie.append(dicdata: newDicdata, chars: chars, metadata: newMetadata)
}
//
@@ -353,8 +364,7 @@ struct LongTermLearningMemory {
while !currentNodes.isEmpty {
currentNodes.forEach {char, nodeIndex in
nodes2Characters.append(char)
let dicdataBlock = DataBlock(dicdata: trie.nodes[nodeIndex].dataIndices.map {trie.dicdata[$0]})
dicdata.append(dicdataBlock)
dicdata.append(DataBlock(dicdata: trie.nodes[nodeIndex].dataIndices.map {trie.dicdata[$0]}))
metadata.append(MetadataBlock(metadata: trie.nodes[nodeIndex].dataIndices.map {trie.metadata[$0]}))
bits += [Bool](repeating: true, count: trie.nodes[nodeIndex].children.count) + [false]
@@ -376,10 +386,9 @@ struct LongTermLearningMemory {
}
let metadataFileTemp = metadataFileURL(asTemporaryFile: true, directoryURL: directoryURL)
do {
var binary = Data()
binary += Data(bytes: [UInt32(metadata.count)], count: 4) // UInt32
let binary = Data(bytes: [UInt32(metadata.count)], count: 4) // UInt32
let result = metadata.reduce(into: binary) {
$0.append($1.makeBinary())
$0.append(contentsOf: $1.makeBinary())
}
try result.write(to: metadataFileTemp)
}
@@ -468,10 +477,7 @@ struct TemporalLearningMemoryTrie {
///
///
fileprivate mutating func append(dicdata: [DicdataElement], chars: [UInt8], metadata: [MetadataElement]) {
if dicdata.count != metadata.count {
debug("TemporalLearningMemoryTrie append: count of dicdata and metadata do not match")
return
}
assert(dicdata.count == metadata.count, "count of dicdata and metadata do not match")
var index = 0
for char in chars {
if let nextIndex = nodes[index].children[char] {

View File

@@ -90,7 +90,7 @@ extension LOUDS {
debug("getDataForLoudstxt3: failed to parse", dicdata)
return []
}
for (index, substring) in substrings[1...].enumerated() {
for (index, substring) in zip(dicdata.indices, substrings[1...]) {
guard let word = String(data: substring, encoding: .utf8) else {
debug("getDataForLoudstxt3: failed to parse", ruby)
continue