fix: 前方文脈に半角スペースが入ると変換が不安定になっていた問題を修正

2025-12-03 02:58:27 +00:00 · 2025-03-08 15:52:13 +09:00
parent ce48df60ab
commit 99625951c7
1 changed files with 8 additions and 4 deletions
--- a/Sources/KanaKanjiConverterModule/Zenz/ZenzContext.swift
+++ b/Sources/KanaKanjiConverterModule/Zenz/ZenzContext.swift
@@ -351,7 +351,7 @@ final class ZenzContext {
        let outputTag = "\u{EE01}"
        let contextTag = "\u{EE02}"
        // プロンプトを作成
-        let prompt: String = switch versionDependentConfig {
+        var prompt: String = switch versionDependentConfig {
        case .v1:
            inputTag + input + outputTag
        case .v2:
@@ -375,9 +375,11 @@ final class ZenzContext {
                conditions.joined(separator: "") + inputTag + input + outputTag
            }
        }
+        // プロンプトの前処理を適用
+        prompt = self.preprocessText(text: prompt)
        // Therefore, tokens = prompt_tokens + candidate_tokens is an appropriate operation.
        let prompt_tokens = self.tokenize(text: prompt, add_bos: true, add_eos: false)
-        let candidate_tokens = self.tokenize(text: candidate.text, add_bos: false, add_eos: false)
+        let candidate_tokens = self.tokenize(text: self.preprocessText(text: candidate.text), add_bos: false, add_eos: false)
        let tokens = prompt_tokens + candidate_tokens
        let startOffset = prompt_tokens.count - 1
        let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
@@ -513,10 +515,12 @@ final class ZenzContext {
        batch.n_tokens += 1
    }

-    private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
+    private func preprocessText(text: String) -> String {
        // replace space into ideographic space (\u3000) for zenz tokenizer
        // replace newline into null for zenz tokenizer
-        let text = text.replacingOccurrences(of: " ", with: "\u{3000}").replacingOccurrences(of: "\n", with: "")
+        return text.replacingOccurrences(of: " ", with: "\u{3000}").replacingOccurrences(of: "\n", with: "")
+    }
+    private func tokenize(text: String, add_bos: Bool, add_eos: Bool = false) -> [llama_token] {
        let utf8Count = text.utf8.count
        let n_tokens = utf8Count + (add_bos ? 1 : 0)
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)