fix: プロフィール付き変換で半角コロンを使うように変更、トークナイザの苦手な文字を削除

This commit is contained in:
Miwa / Ensan
2024-09-14 23:14:02 +09:00
parent 269c47dd3c
commit ca8da67f73

View File

@ -255,13 +255,13 @@ class ZenzContext {
let lsContext = leftSideContext.suffix(40)
if let profile = mode.profile, !profile.isEmpty {
let pf = profile.suffix(25)
prompt = "\u{EE00}\(input)\u{EE02}プロフィール\(pf)・発言\(lsContext)\u{EE01}"
prompt = "\u{EE00}\(input)\u{EE02}プロフィール:\(pf)・発言:\(lsContext)\u{EE01}"
} else {
prompt = "\u{EE00}\(input)\u{EE02}\(lsContext)\u{EE01}"
}
} else if let profile = mode.profile, !profile.isEmpty {
let pf = profile.suffix(25)
prompt = "\u{EE00}\(input)\u{EE02}プロフィール\(pf)・発言\u{EE01}"
prompt = "\u{EE00}\(input)\u{EE02}プロフィール:\(pf)・発言:\u{EE01}"
} else {
prompt = "\u{EE00}\(input)\u{EE01}"
}
@ -381,6 +381,9 @@ class ZenzContext {
}
private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
// Replace ASCII spaces with ideographic spaces (U+3000) for the zenz tokenizer.
// Remove newlines entirely (replace them with an empty string, not a NUL character) for the zenz tokenizer.
let text = text.replacingOccurrences(of: " ", with: "\u{3000}").replacingOccurrences(of: "\n", with: "")
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)