// AzooKeyKanaKanjiConverter/Sources/KanaKanjiConverterModule/ConversionAlgorithms/Zenzai/Zenz/ZenzContext.swift
#if Zenzai
// llama is only imported when the Zenzai flag is set; otherwise llama-mock.swift provides stand-ins.
import llama
#endif
import SwiftUtils
import HeapModule
import Algorithms
import Foundation
import EfficientNGram
private let debugLogFile = FileManager.default.temporaryDirectory.appendingPathComponent("zenz_debug_\(ProcessInfo.processInfo.processIdentifier).log")
private func debugLog(_ items: Any..., separator: String = " ", terminator: String = "\n") {
    let message = items.map { "\($0)" }.joined(separator: separator) + terminator
    let timestamp = ISO8601DateFormatter().string(from: Date())
    let logEntry = "[\(timestamp)] \(message)"
    print(logEntry, terminator: "")
    debug(message)
    if let data = logEntry.data(using: .utf8) {
        if !FileManager.default.fileExists(atPath: debugLogFile.path) {
            FileManager.default.createFile(atPath: debugLogFile.path, contents: data)
            print("[DEBUG] Created log file at: \(debugLogFile.path)")
        } else {
            if let fileHandle = try? FileHandle(forWritingTo: debugLogFile) {
                fileHandle.seekToEndOfFile()
                fileHandle.write(data)
                fileHandle.closeFile()
            }
        }
    }
}
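/// A bounded heap that keeps at most `size` elements, evicting the current minimum
/// whenever a larger element is inserted. Used below to track n-best tokens without
/// sorting the whole vocabulary.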
struct FixedSizeHeap<Element: Comparable> {
    private var size: Int
    private var heap: Heap<Element>
    init(size: Int) {
        self.size = size
        self.heap = []
    }
    mutating func removeMax() {
        self.heap.removeMax()
    }
    mutating func removeMin() {
        self.heap.removeMin()
    }
    @discardableResult
    mutating func insertIfPossible(_ element: Element) -> Bool {
        if self.heap.count < self.size {
            self.heap.insert(element)
            return true
        } else if let min = self.heap.min, element > min {
            self.heap.replaceMin(with: element)
            return true
        } else {
            return false
        }
    }
    var unordered: [Element] {
        self.heap.unordered
    }
    var max: Element? {
        self.heap.max
    }
    var min: Element? {
        self.heap.min
    }
    var isEmpty: Bool {
        self.heap.isEmpty
    }
}
enum ZenzError: LocalizedError {
    case couldNotLoadModel(path: String)
    case couldNotLoadContext
    case couldNotLoadVocab
    var errorDescription: String? {
        switch self {
        case .couldNotLoadContext: return "failed to load context"
        case .couldNotLoadModel(path: let path): return "could not load model weight at \(path)"
        case .couldNotLoadVocab: return "failed to load vocab"
        }
    }
}
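/// Thin wrapper around a llama.cpp model/context/vocab triple used by the Zenzai
/// conversion engine. A minimal usage sketch (the .gguf path is a placeholder):
///
///     let zenz = try ZenzContext.createContext(path: "/path/to/zenz.gguf", ngl: 0)
///     let score = zenz.evaluate(text: "今日はいい天気")
///
/// All heavy lifting (tokenization, decoding, scoring) goes through the llama.cpp C API.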
final class ZenzContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
    private var vocab: OpaquePointer
    private var prevInput: [llama_token] = []
    private let n_len: Int32 = 512
    init(model: OpaquePointer, context: OpaquePointer, vocab: OpaquePointer) {
        self.model = model
        self.context = context
        self.vocab = vocab
    }
    deinit {
        llama_free(context)
        llama_model_free(model)
        llama_backend_free()
    }
    private static var ctx_params: llama_context_params {
        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
        debug("Using \(n_threads) threads")
        var ctx_params = llama_context_default_params()
        ctx_params.n_ctx = 512
        ctx_params.n_threads = Int32(n_threads)
        ctx_params.n_threads_batch = Int32(n_threads)
        ctx_params.n_batch = 512
        return ctx_params
    }
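    /// Loads a GGUF model from `path`, offloading `ngl` layers to the GPU, and builds
    /// the context and vocabulary handles. Throws a `ZenzError` if any step fails.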
    static func createContext(path: String, ngl: Int = 0) throws -> ZenzContext {
        debugLog("[ZenzContext] createContext called with path: \(path), ngl: \(ngl)")
        llama_backend_init()
        var model_params = llama_model_default_params()
        debugLog("[ZenzContext] Default model_params.n_gpu_layers: \(model_params.n_gpu_layers)")
        // IMPORTANT: Set fields in the same order as C struct definition
        // model_params.devices is already NULL by default
        model_params.n_gpu_layers = Int32(ngl)
        // model_params.split_mode = 0 // LLAMA_SPLIT_MODE_NONE (should be default)
        // model_params.main_gpu = 0 // Use first GPU (should be default)
        // model_params.tensor_split = nil // NULL (should be default)
        model_params.use_mmap = true
        if ngl > 0 {
            debugLog("[ZenzContext] Requesting \(ngl) layers to be offloaded to GPU")
        }
        debugLog("[ZenzContext] After setting, model_params.n_gpu_layers: \(model_params.n_gpu_layers)")
        debugLog("[ZenzContext] Loading model from: \(path)")
        // Log struct details
        withUnsafePointer(to: &model_params) { ptr in
            debugLog("[ZenzContext] model_params address: \(ptr)")
            withUnsafeBytes(of: model_params) { bytes in
                let hexString = bytes.map { String(format: "%02X", $0) }.joined(separator: " ")
                debugLog("[ZenzContext] model_params bytes (first 32): \(String(hexString.prefix(95)))...")
            }
        }
        debugLog("[ZenzContext] Calling llama_model_load_from_file...")
        let model = llama_model_load_from_file(path, model_params)
        guard let model else {
            debugLog("[ZenzContext] ERROR: Could not load model at \(path)")
            throw ZenzError.couldNotLoadModel(path: path)
        }
        debugLog("[ZenzContext] Model loaded successfully")
        let context = llama_init_from_model(model, ctx_params)
        guard let context else {
            debug("Could not load context!")
            throw ZenzError.couldNotLoadContext
        }
        let vocab = llama_model_get_vocab(model)
        guard let vocab else {
            debug("Could not load vocab!")
            throw ZenzError.couldNotLoadVocab
        }
        return ZenzContext(model: model, context: context, vocab: vocab)
    }
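    /// Discards the current llama context (and with it the KV cache) and creates a fresh one
    /// from the already-loaded model.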
    func reset_context() throws {
        llama_free(self.context)
        let context = llama_init_from_model(self.model, Self.ctx_params)
        guard let context else {
            debug("Could not load context!")
            throw ZenzError.couldNotLoadContext
        }
        self.context = context
    }
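    /// Decodes `tokens` through the model and returns the raw logits buffer owned by the
    /// llama context. Logits are only requested for positions at or after `logits_start_index`,
    /// so callers must index the buffer relative to that offset.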
    private func get_logits(tokens: [llama_token], logits_start_index: Int = 0) -> UnsafeMutablePointer<Float>? {
        // Manage the KV cache: drop everything after the prefix shared with the previous input.
        // Note: `prevInput` is never updated here, so the shared prefix is always empty and the full cache is removed.
        do {
            let commonTokens = self.prevInput.commonPrefix(with: tokens)
            llama_kv_cache_seq_rm(context, 0, llama_pos(commonTokens.count), -1)
        }
        var batch = llama_batch_init(512, 0, 1)
        defer {
            // The batch buffers are heap-allocated by llama_batch_init and must be released.
            llama_batch_free(batch)
        }
        let n_ctx = llama_n_ctx(context)
        let n_kv_req = tokens.count + (Int(n_len) - tokens.count)
        if n_kv_req > n_ctx {
            debug("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
        }
        for i in tokens.indices {
            llama_batch_add(&batch, tokens[i], Int32(i), [0], logits: logits_start_index <= i)
        }
        // Run the forward pass.
        if llama_decode(context, batch) != 0 {
            debug("llama_decode() failed")
            return nil
        }
        return llama_get_logits(context)
    }
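    /// Scores `text` under the model: the sum of per-token log-probabilities
    /// (log-softmax over the vocabulary), skipping the tokens of `ignorePrompt`.
    /// Returns `.nan` if logits could not be computed.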
    func evaluate(text: String, ignorePrompt: String = "") -> Float {
        let tokens_list = self.tokenize(text: text, add_bos: true, add_eos: true)
        guard let logits = self.get_logits(tokens: tokens_list) else {
            debug("logits unavailable")
            return .nan
        }
        let tokenizedPromptCount = ignorePrompt.isEmpty ? 1 : tokenize(text: ignorePrompt, add_bos: true, add_eos: false).count
        let n_vocab = llama_vocab_n_tokens(vocab)
        var sum: Float = 0
        // Accumulate log P(token_i | token_0 ... token_{i-1}) for every scored token.
        for (i, token_id) in tokens_list.indexed().dropFirst(tokenizedPromptCount) {
            // FIXME: there can be more efficient implementations, possibly using Accelerate or other frameworks.
            // log-sum-exp over the previous position's logits, then subtract to get the log-probability.
            var log_prob: Float = 0
            for index in ((i - 1) * Int(n_vocab)) ..< (i * Int(n_vocab)) {
                log_prob += expf(logits[index])
            }
            log_prob = logf(log_prob)
            log_prob = logits[Int((i - 1) * Int(n_vocab) + Int(token_id))] - log_prob
            sum += log_prob
        }
        return sum
    }
    enum CandidateEvaluationResult: Sendable, Equatable, Hashable {
        case error
        case pass(score: Float, alternativeConstraints: [AlternativeConstraint])
        case fixRequired(prefixConstraint: [UInt8])
        case wholeResult(String)
        struct AlternativeConstraint: Sendable, Equatable, Hashable {
            var probabilityRatio: Float
            var prefixConstraint: [UInt8]
        }
    }
    func getLearningPriority(data: DicdataElement) -> Float {
        // Bonus applied to learned words when they compete with the model's top prediction:
        // short readings (1-4 characters) get length + 2, medium ones (5-15) get length * 2,
        // anything longer gets a flat 30.
        return if 1 <= data.ruby.count && data.ruby.count <= 4 {
            Float(data.ruby.count + 2)
        } else if 5 <= data.ruby.count && data.ruby.count <= 15 {
            Float(data.ruby.count * 2)
        } else {
            30
        }
    }
    /// Greedy decoding: starting from `leftSideContext`, repeatedly appends the most likely
    /// next token until EOS is produced or `maxCount` tokens have been generated, and returns
    /// the generated continuation as a string.
    func pure_greedy_decoding(leftSideContext: String, maxCount: Int = .max) -> String {
        var prompt_tokens = self.tokenize(text: leftSideContext, add_bos: false)
        let initial_count = prompt_tokens.count
        let eos_token = llama_vocab_eos(vocab)
        while prompt_tokens.count - initial_count < maxCount {
            let startOffset = prompt_tokens.count - 1
            guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
                debug("logits unavailable")
                return ""
            }
            let n_vocab = llama_vocab_n_tokens(vocab)
            let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
            let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
            // Pick the argmax token from the last position's logits.
            var max_token: llama_token = -1
            var max_value: Float = -Float.infinity
            for index in startIndex ..< endIndex {
                let token = llama_token(index - startIndex)
                if max_value < logits[index] {
                    max_token = token
                    max_value = logits[index]
                }
            }
            if max_token == eos_token {
                break
            } else {
                prompt_tokens.append(max_token)
            }
        }
        // Convert the generated tokens (excluding the prompt) back into a UTF-8 string.
        let cchars: [CChar] = prompt_tokens.dropFirst(initial_count).flatMap(self.token_to_piece)
        let data = Data(cchars.map { UInt8(bitPattern: $0) })
        let result: String = String(data: data, encoding: .utf8) ?? ""
        return result
    }
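    /// Returns up to `count` candidate next characters after `leftSideContext`, each with its
    /// (repetition-penalised, softmax-normalised) probability, sorted in descending order.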
    func predict_next_character(leftSideContext: String, count: Int) -> [(character: Character, value: Float)] {
        struct NextCharacterCandidate: Comparable {
            static func < (lhs: NextCharacterCandidate, rhs: NextCharacterCandidate) -> Bool {
                lhs.value < rhs.value
            }
            var character: Character
            var value: Float
        }
        // Prompt: query tag (\u{EE00}) + dummy query + context tag (\u{EE02}) + left-side context.
        // The answer tag (\u{EE01}) is not part of this prompt.
        let prompt_tokens = self.tokenize(text: "\u{EE00}。\u{EE02}\(leftSideContext)", add_bos: false)
        let startOffset = prompt_tokens.count - 1
        guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
            debug("logits unavailable")
            return []
        }
        let n_vocab = llama_vocab_n_tokens(vocab)
        var exp_sum: Float = 0
        let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
        let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
        // Keep the n-best candidates in a fixed-size min-heap.
        var minHeap: FixedSizeHeap<NextCharacterCandidate> = .init(size: count)
        // Tokens that already appear in the prompt are penalised; the weight grows the closer they are to the end of the prompt.
        let token_to_penalty_weight: [llama_token: Float] = prompt_tokens.indexed().reduce(into: [:]) { dict, item in
            let (index, token) = item
            dict[token, default: 0] += 2 / Float(prompt_tokens.count - index)
        }
        for index in startIndex ..< endIndex {
            let token = llama_token(index - startIndex)
            let repeat_penalty = Float(1.0 + token_to_penalty_weight[token, default: 0])
            let v = expf(logits[index] / repeat_penalty)
            exp_sum += v
            let tokenPieceData = Data(token_to_piece(token: token).map { UInt8(bitPattern: $0) })
            let character: Character
            if let validCharacter = String(data: tokenPieceData, encoding: .utf8), let c = validCharacter.first {
                character = c
            } else {
                continue
            }
            minHeap.insertIfPossible(NextCharacterCandidate(character: character, value: v))
        }
        // Normalise by the (penalised) softmax denominator and return in descending order.
        return minHeap.unordered.sorted { $0.value > $1.value }.map { ($0.character, $0.value / exp_sum) }
    }
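    /// Checks `candidate` against the model, token by token, given the conversion `input` and
    /// the version-dependent conditioning. Returns `.pass` with a score (and, if requested,
    /// alternative prefix constraints), `.fixRequired` when the model prefers a different
    /// continuation, `.wholeResult` when the model would rather end the sentence early, or
    /// `.error` when logits are unavailable.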
    func evaluate_candidate(
        input: String,
        candidate: Candidate,
        requestRichCandidates: Bool,
        personalizationMode: (mode: ConvertRequestOptions.ZenzaiMode.PersonalizationMode, base: EfficientNGram, personal: EfficientNGram)?,
        versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
    ) -> CandidateEvaluationResult {
        debug("Evaluate", candidate)
        // For zenz-v1 model, \u{EE00} is a token used for 'start query', and \u{EE01} is a token used for 'start answer'
        // We assume \u{EE01}\(candidate) is always split into \u{EE01}_\(candidate) by zenz-v1 tokenizer
        var userDictionaryPrompt: String = ""
        for item in candidate.data where item.metadata.contains(.isFromUserDictionary) {
            userDictionaryPrompt += "\(item.word)(\(item.ruby.toHiragana()))"
        }
        var conditions: [String] = []
        // If the candidate contains user-dictionary words, pass them to the model as a "辞書:" (dictionary) condition.
        if !userDictionaryPrompt.isEmpty {
            conditions.append("辞書:\(userDictionaryPrompt)")
        }
        // Version-dependent conditioning (profile / topic / style / preference).
        switch versionDependentConfig {
        case .v1: break
        case .v2(let mode):
            if let profile = mode.profile, !profile.isEmpty {
                let pf = profile.suffix(25)
                conditions.append("プロフィール:\(pf)")
            }
        case .v3(let mode):
            if let profile = mode.profile, !profile.isEmpty {
                let pf = profile.suffix(25)
                conditions.append("\u{EE03}\(pf)")
            }
            if let topic = mode.topic, !topic.isEmpty {
                let tp = topic.suffix(25)
                conditions.append("\u{EE04}\(tp)")
            }
            if let style = mode.style, !style.isEmpty {
                let st = style.suffix(25)
                conditions.append("\u{EE05}\(st)")
            }
            if let preference = mode.preference, !preference.isEmpty {
                let pr = preference.suffix(25)
                conditions.append("\u{EE06}\(pr)")
            }
        }
        // Left-side context (at most the last 40 characters), if the mode provides one.
        let leftSideContext: String = switch versionDependentConfig {
        case .v1: ""
        case .v2(let mode):
            if let leftSideContext = mode.leftSideContext {
                String(leftSideContext.suffix(40))
            } else {
                ""
            }
        case .v3(let mode):
            if let leftSideContext = mode.leftSideContext {
                String(leftSideContext.suffix(40))
            } else {
                ""
            }
        }
        let inputTag = "\u{EE00}"
        let outputTag = "\u{EE01}"
        let contextTag = "\u{EE02}"
        // Build the prompt according to the model version.
        var prompt: String = switch versionDependentConfig {
        case .v1:
            inputTag + input + outputTag
        case .v2:
            if !conditions.isEmpty {
                // Conditions are present: include them together with the current utterance ("発言"), even if leftSideContext is empty.
                inputTag + input + contextTag + conditions.joined(separator: "") + "・発言:\(leftSideContext)" + outputTag
            } else if !leftSideContext.isEmpty {
                // No conditions, but a leftSideContext is available.
                inputTag + input + contextTag + leftSideContext + outputTag
            } else {
                // Neither conditions nor leftSideContext.
                inputTag + input + outputTag
            }
        case .v3:
            if !leftSideContext.isEmpty {
                // leftSideContext is non-empty.
                // The context is placed before the input, presumably so the shared prefix can benefit from KV-caching.
                conditions.joined(separator: "") + contextTag + leftSideContext + inputTag + input + outputTag
            } else {
                // No leftSideContext.
                conditions.joined(separator: "") + inputTag + input + outputTag
            }
        }
        // Normalise spaces and newlines for the zenz tokenizer.
        prompt = self.preprocessText(text: prompt)
        // Given the tokenizer-boundary assumption above, tokens = prompt_tokens + candidate_tokens is an appropriate operation.
        let prompt_tokens = self.tokenize(text: prompt, add_bos: true, add_eos: false)
        let candidate_tokens = self.tokenize(text: self.preprocessText(text: candidate.text), add_bos: false, add_eos: false)
        let tokens = prompt_tokens + candidate_tokens
        let startOffset = prompt_tokens.count - 1
        let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
        debug("pos max:", pos_max)
        guard let logits = self.get_logits(tokens: tokens, logits_start_index: startOffset) else {
            debug("logits unavailable")
            return .error
        }
        let n_vocab = llama_vocab_n_tokens(vocab)
        // Each candidate token remembers whether it comes from a learned word and, if so, its log learning priority.
        let is_learned_token: [(isLearned: Bool, priority: Float)] = Array(repeating: (false, 0), count: prompt_tokens.count) + candidate.data.flatMap {
            Array(repeating: ($0.metadata.contains(.isLearned), logf(getLearningPriority(data: $0))), count: self.tokenize(text: $0.word, add_bos: false).count)
        }
        var score: Float = 0
        struct AlternativeHighProbToken: Comparable {
            static func < (lhs: AlternativeHighProbToken, rhs: AlternativeHighProbToken) -> Bool {
                lhs.probabilityRatioToMaxProb < rhs.probabilityRatioToMaxProb
            }
            var token: llama_token
            var constraint: [UInt8]
            // Ratio of this token's probability to that of the most likely token at the same position.
            var probabilityRatioToMaxProb: Float
        }
        var altTokens = FixedSizeHeap<AlternativeHighProbToken>(size: requestRichCandidates ? 5 : 0)
        for (i, token_id) in tokens.indexed().dropFirst(prompt_tokens.count) {
            // For each candidate position, take the softmax over the previous position's logits
            // and track the highest-log-probability token(s) with a small heap.
            struct TokenAndLogprob: Comparable {
                static func < (lhs: TokenAndLogprob, rhs: TokenAndLogprob) -> Bool {
                    lhs.logprob < rhs.logprob
                }
                var token: llama_token
                var logprob: Float
            }
            var sumexp: Float = 0
            let startIndex = (i - 1 - startOffset) * Int(n_vocab)
            let endIndex = (i - startOffset) * Int(n_vocab)
            var tokenHeap = FixedSizeHeap<TokenAndLogprob>(size: requestRichCandidates ? 3 : 1)
            for index in startIndex ..< endIndex {
                sumexp += expf(logits[index])
            }
            let logsumexp = logf(sumexp)
            if let (mode, baseLM, personalLM) = personalizationMode, mode.alpha > 0 {
                let prefix = tokens[..<i].dropFirst(prompt_tokens.count).map(Int.init)
                let baseProb: [Float]
                let personalProb: [Float]
                // With an empty prefix, skip the n-gram LMs and use zeros so that personalization has no effect at this position.
                if !prefix.isEmpty {
                    baseProb = baseLM.bulkPredict(prefix).map { logf(Float($0) + 1e-7) }
                    personalProb = personalLM.bulkPredict(prefix).map { logf(Float($0) + 1e-7) }
                } else {
                    baseProb = Array(repeating: 0, count: Int(n_vocab))
                    personalProb = baseProb
                }
                // p = exp(logit) / exp_sum
                // p' = p * (p_personal / p_base)^alpha, computed in log space
                for (i, (lpb, lpp)) in zip(0 ..< Int(n_vocab), zip(baseProb, personalProb)) {
                    let logp = logits[startIndex + i] - logsumexp
                    let logp_ = logp + mode.alpha * (lpp - lpb) // personalized log-probability
                    tokenHeap.insertIfPossible(TokenAndLogprob(token: llama_token(i), logprob: logp_))
                }
            } else {
                // p = exp(logit) / exp_sum
                for i in startIndex ..< endIndex {
                    let logp = logits[i] - logsumexp
                    tokenHeap.insertIfPossible(TokenAndLogprob(token: llama_token(i - startIndex), logprob: logp))
                }
            }
            guard let maxItem = tokenHeap.max else {
                debug("Max Item could not be found for unknown reason")
                return .error
            }
            // The candidate's token differs from the model's most likely token at this position.
            if maxItem.token != token_id {
                if maxItem.token == llama_vocab_eos(vocab) {
                    let cchars: [CChar] = tokens[..<i].reduce(into: []) {
                        $0.append(contentsOf: token_to_piece(token: $1))
                    }
                    let data = Data(cchars.map { UInt8(bitPattern: $0) })
                    let string: String = String(data: data, encoding: .utf8) ?? ""
                    // The model prefers to end the sentence here, so return everything decoded so far as the whole result.
                    let wholeResult = String(string.dropFirst(prompt.count))
                    return .wholeResult(wholeResult)
                } else {
                    let actual_logp: Float = logits[startIndex + Int(token_id)] - logsumexp
                    // Keep the candidate's token if it is a learned word whose log-probability plus the learning bonus beats the model's best token.
                    let preferLearnedToken = is_learned_token[i].isLearned && actual_logp + is_learned_token[i].priority > maxItem.logprob
                    if !preferLearnedToken {
                        // Build a byte-level prefix constraint: everything decoded so far plus the model's preferred token, with the prompt bytes removed.
                        let cchars = tokens[..<i].reduce(into: []) {
                            $0.append(contentsOf: token_to_piece(token: $1))
                        } + token_to_piece(token: maxItem.token)
                        return .fixRequired(prefixConstraint: cchars.dropFirst(prompt.utf8.count).map { UInt8(bitPattern: $0) })
                    }
                }
            } else if !tokenHeap.isEmpty {
                // The top token matches the candidate; collect the runner-up tokens as alternative constraints.
                tokenHeap.removeMax()
                let prefix = tokens[..<i].reduce(into: []) {
                    $0.append(contentsOf: token_to_piece(token: $1))
                }.dropFirst(prompt.utf8.count)
                for item in tokenHeap.unordered {
                    altTokens.insertIfPossible(
                        AlternativeHighProbToken(
                            token: item.token,
                            constraint: prefix.map { UInt8(bitPattern: $0) } + token_to_piece(token: item.token).map { UInt8(bitPattern: $0) },
                            probabilityRatioToMaxProb: expf(item.logprob - maxItem.logprob)
                        )
                    )
                }
            }
            score += maxItem.logprob
        }
        return .pass(score: score, alternativeConstraints: altTokens.unordered.sorted(by: >).map { .init(probabilityRatio: $0.probabilityRatioToMaxProb, prefixConstraint: $0.constraint) })
    }
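    /// Appends one token to a `llama_batch`; a Swift port of the batch-add helper used in
    /// llama.cpp's examples. `logits` controls whether logits are computed for this position.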
    private func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], logits: Bool) {
        batch.token [Int(batch.n_tokens)] = id
        batch.pos [Int(batch.n_tokens)] = pos
        batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
        for i in 0..<seq_ids.count {
            batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
        }
        batch.logits [Int(batch.n_tokens)] = logits ? 1 : 0
        batch.n_tokens += 1
    }
    private func preprocessText(text: String) -> String {
        // Replace spaces with ideographic spaces (U+3000) and strip newlines for the zenz tokenizer.
        return text.replacingOccurrences(of: " ", with: "\u{3000}").replacingOccurrences(of: "\n", with: "")
    }
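    /// Tokenizes `text` with the model's vocabulary, optionally adding BOS/EOS tokens.
    /// If llama_tokenize reports an error, falls back to a single BOS token.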
    private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
        let utf8Count = text.utf8.count
        let n_tokens = utf8Count + (add_bos ? 1 : 0)
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
        let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
        var swiftTokens: [llama_token] = if tokenCount < 0 {
            [llama_vocab_bos(vocab)]
        } else {
            (0..<tokenCount).map { tokens[Int($0)] }
        }
        tokens.deallocate()
        if add_eos {
            swiftTokens.append(llama_vocab_eos(vocab))
        }
        return swiftTokens
    }
    /// - note: The result does not contain a null terminator.
    private func token_to_piece(token: llama_token) -> [CChar] {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
        defer {
            result.deallocate()
        }
        let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)
        if nTokens < 0 {
            // The 8-byte buffer was too small; retry with the exact required size (-nTokens).
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
            newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
            defer {
                newResult.deallocate()
            }
            let nNewTokens = llama_token_to_piece(vocab, token, newResult, Int32(-nTokens), 0, false)
            let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
            let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: result, count: Int(nTokens))
            return Array(bufferPointer)
        }
    }
}