fix: 前方文脈に半角スペースが入ると変換が不安定になっていた問題を修正

This commit is contained in:
ensan-hcl
2025-03-08 15:52:13 +09:00
parent ce48df60ab
commit 99625951c7

View File

@@ -351,7 +351,7 @@ final class ZenzContext {
let outputTag = "\u{EE01}"
let contextTag = "\u{EE02}"
//
let prompt: String = switch versionDependentConfig {
var prompt: String = switch versionDependentConfig {
case .v1:
inputTag + input + outputTag
case .v2:
@@ -375,9 +375,11 @@ final class ZenzContext {
conditions.joined(separator: "") + inputTag + input + outputTag
}
}
//
prompt = self.preprocessText(text: prompt)
// Therefore, tokens = prompt_tokens + candidate_tokens is an appropriate operation.
let prompt_tokens = self.tokenize(text: prompt, add_bos: true, add_eos: false)
let candidate_tokens = self.tokenize(text: candidate.text, add_bos: false, add_eos: false)
let candidate_tokens = self.tokenize(text: self.preprocessText(text: candidate.text), add_bos: false, add_eos: false)
let tokens = prompt_tokens + candidate_tokens
let startOffset = prompt_tokens.count - 1
let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
@@ -513,10 +515,12 @@ final class ZenzContext {
batch.n_tokens += 1
}
private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
/// Normalizes `text` into the form expected by the zenz tokenizer.
///
/// - Parameter text: The string to normalize.
/// - Returns: `text` with every half-width space (U+0020) replaced by an ideographic
///   space (U+3000) and every newline (`\n`) removed.
private func preprocessText(text: String) -> String {
// Replace each half-width space with an ideographic space (U+3000) for the zenz tokenizer.
// Remove each newline (replace it with the empty string) for the zenz tokenizer.
// NOTE(review): the two statements below apply the same replacements. This looks like a diff
// rendering in which the `let` line is the pre-change version; confirm only one remains in the real file.
let text = text.replacingOccurrences(of: " ", with: "\u{3000}").replacingOccurrences(of: "\n", with: "")
return text.replacingOccurrences(of: " ", with: "\u{3000}").replacingOccurrences(of: "\n", with: "")
}
private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)