Merge pull request #157 from fkunn1326/latest_llama

feat: Support the latest llama.cpp
Authored by Miwa, committed by GitHub on 2025-03-18 23:03:14 +09:00
13 changed files with 1607 additions and 1266 deletions


@ -14,10 +14,19 @@ jobs:
with:
submodules: true
- name: Download pre-built llama.cpp binaries
run: |
wget -O llama-cpp-bin.zip "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-ubuntu-x64.zip"
unzip llama-cpp-bin.zip -d llama-cpp-bin
- name: Copy llama.cpp binaries
run: cp llama-cpp-bin/build/bin/lib*.so ./
- name: Build and Test in DevContainer
uses: devcontainers/ci@v0.3
with:
push: never
runCmd: |
swift build -Xswiftc -strict-concurrency=complete -v
swift test -c release -Xswiftc -strict-concurrency=complete -v
swift build -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
cp llama-cpp-bin/build/bin/lib*.so .build/*/release/
swift test -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v


@ -39,10 +39,18 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- name: Download pre-built llama.cpp binaries
run: |
wget -O llama-cpp-bin.zip "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-ubuntu-x64.zip"
unzip llama-cpp-bin.zip -d llama-cpp-bin
- name: Copy llama.cpp binaries
run: cp llama-cpp-bin/build/bin/lib*.so ./
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -v
run: swift build -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
- name: Run tests
run: swift test -c release -Xswiftc -strict-concurrency=complete -v
run: |
cp llama-cpp-bin/build/bin/lib*.so .build/*/release/
swift test -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
windows-build:
name: Swift ${{ matrix.swift-version.tag }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
@ -62,18 +70,20 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- name: Clone llama.cpp
run: git clone -b ku-nlp/gpt2-japanese-char https://github.com/ensan-hcl/llama.cpp.git
- name: Build llama.cpp
- name: Download pre-built llama.cpp binaries
shell: pwsh
run: |
cmake -B build -DBUILD_SHARED_LIBS=ON
cmake --build build --config Release
working-directory: ./llama.cpp
- name: Copy built files
$zipUrl = "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-win-avx-x64.zip"
$zipPath = "llama-cpp-bin.zip"
Invoke-WebRequest -Uri $zipUrl -OutFile $zipPath
Expand-Archive -Path $zipPath -DestinationPath llama-cpp-bin
- name: Copy llama.cpp binaries
shell: pwsh
run: |
cp ./build/bin/Release/llama.dll ../
cp ./build/Release/llama.lib ../
working-directory: ./llama.cpp
Copy-Item -Path "llama-cpp-bin/llama.dll" -Destination "./"
Copy-Item -Path "llama-cpp-bin/llama.lib" -Destination "./"
Copy-Item -Path "llama-cpp-bin/ggml.dll" -Destination "./"
Copy-Item -Path "llama-cpp-bin/ggml-*.dll" -Destination "./"
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -v
- name: Run tests


@ -139,23 +139,6 @@ if checkObjcAvailability() {
}
#endif
#if os(Windows)
targets.append(contentsOf: [
.systemLibrary(
name: "llama.cpp"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"llama.cpp",
"EfficientNGram",
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#else
if let envValue = ProcessInfo.processInfo.environment["LLAMA_MOCK"], envValue == "1" {
targets.append(contentsOf: [
.target(name: "llama-mock"),
@ -171,24 +154,43 @@ if let envValue = ProcessInfo.processInfo.environment["LLAMA_MOCK"], envValue ==
)
])
} else {
dependencies.append(
.package(url: "https://github.com/ensan-hcl/llama.cpp", branch: "6b862f4")
)
#if os(Windows) || os(Linux)
targets.append(contentsOf: [
.systemLibrary(
name: "llama.cpp"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"llama.cpp",
"EfficientNGram",
.product(name: "llama", package: "llama.cpp"),
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#else
targets.append(contentsOf: [
.binaryTarget(
name: "llama.cpp",
url: "https://github.com/fkunn1326/llama.cpp/releases/download/b4844/llama-b4844-xcframework.zip",
// this can be computed with `swift package compute-checksum llama-b4844-xcframework.zip`
checksum: "40bd1e58e727511649e13a6de9eb577ea8be78fe4183c2e1b382b12054849f05"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"EfficientNGram",
"llama.cpp",
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#endif
}
#endif
let package = Package(
name: "AzooKeyKanakanjiConverter",


@ -62,7 +62,7 @@ extension Kana2Kanji {
versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
) -> (result: LatticeNode, nodes: Nodes, cache: ZenzaiCache) {
var constraint = zenzaiCache?.getNewConstraint(for: inputData) ?? PrefixConstraint([])
print("initial constraint", constraint)
debug("initial constraint", constraint)
let eosNode = LatticeNode.EOSNode
var nodes: Kana2Kanji.Nodes = []
var constructedCandidates: [(RegisteredNode, Candidate)] = []
@ -96,19 +96,19 @@ extension Kana2Kanji {
}
}
guard var (index, candidate) = best else {
print("best was not found!")
debug("best was not found!")
// Empty
//
return (eosNode, nodes, ZenzaiCache(inputData, constraint: PrefixConstraint([]), satisfyingCandidate: nil))
}
print("Constrained draft modeling", -start.timeIntervalSinceNow)
debug("Constrained draft modeling", -start.timeIntervalSinceNow)
reviewLoop: while true {
// results
// N-Best
insertedCandidates.insert((draftResult.result.prevs[index], candidate), at: 0)
if inferenceLimit == 0 {
print("inference limit! \(candidate.text) is used for excuse")
debug("inference limit! \(candidate.text) is used for excuse")
// When inference has run more than the maximum number of times, just return the result at this point
return (eosNode, nodes, ZenzaiCache(inputData, constraint: constraint, satisfyingCandidate: candidate))
}
@ -187,25 +187,25 @@ extension Kana2Kanji {
switch reviewResult {
case .error:
//
print("error")
debug("error")
return .return(constraint: constraint, alternativeConstraints: [], satisfied: false)
case .pass(let score, let alternativeConstraints):
//
print("passed:", score)
debug("passed:", score)
return .return(constraint: constraint, alternativeConstraints: alternativeConstraints, satisfied: true)
case .fixRequired(let prefixConstraint):
// 2
if constraint.constraint == prefixConstraint {
print("same constraint:", prefixConstraint)
debug("same constraint:", prefixConstraint)
return .return(constraint: PrefixConstraint([]), alternativeConstraints: [], satisfied: false)
}
//
constraint = PrefixConstraint(prefixConstraint)
print("update constraint:", constraint)
debug("update constraint:", constraint)
// if another existing candidate already satisfies the new constraint, reuse it for the retry
for (i, candidate) in candidates.indexed() where i != candidateIndex {
if candidate.text.utf8.hasPrefix(prefixConstraint) && self.heuristicRetryValidation(candidate.text) {
print("found \(candidate.text) as another retry")
debug("found \(candidate.text) as another retry")
return .retry(candidateIndex: i)
}
}
@ -214,16 +214,16 @@ extension Kana2Kanji {
let newConstraint = PrefixConstraint(Array(wholeConstraint.utf8), hasEOS: true)
// 2
if constraint == newConstraint {
print("same constraint:", constraint)
debug("same constraint:", constraint)
return .return(constraint: PrefixConstraint([]), alternativeConstraints: [], satisfied: false)
}
//
print("update whole constraint:", wholeConstraint)
debug("update whole constraint:", wholeConstraint)
constraint = PrefixConstraint(Array(wholeConstraint.utf8), hasEOS: true)
// if another existing candidate already matches the whole constraint, reuse it for the retry
for (i, candidate) in candidates.indexed() where i != candidateIndex {
if candidate.text == wholeConstraint && self.heuristicRetryValidation(candidate.text) {
print("found \(candidate.text) as another retry")
debug("found \(candidate.text) as another retry")
return .retry(candidateIndex: i)
}
}
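
Throughout this file the verbose Zenzai tracing switches from print to the package's debug helper, so constraint-update logs like the ones above stay out of release builds of host apps. A rough sketch of what such a helper looks like (an assumption for illustration; the real function comes from SwiftUtils and may format or route output differently):

// Hypothetical sketch of a build-configuration-gated logger; not the actual SwiftUtils implementation.
func debug(_ items: Any...) {
    #if DEBUG
    print(items.map { String(describing: $0) }.joined(separator: " "))
    #endif
}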


@ -59,11 +59,13 @@ struct FixedSizeHeap<Element: Comparable> {
enum ZenzError: LocalizedError {
case couldNotLoadModel(path: String)
case couldNotLoadContext
case couldNotLoadVocab
var errorDescription: String? {
switch self {
case .couldNotLoadContext: "failed to load context"
case .couldNotLoadModel(path: let path): "could not load model weight at \(path)"
case .couldNotLoadContext: return "failed to load context"
case .couldNotLoadModel(path: let path): return "could not load model weight at \(path)"
case .couldNotLoadVocab: return "failed to load vocab"
}
}
}
@ -71,18 +73,20 @@ enum ZenzError: LocalizedError {
final class ZenzContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var vocab: OpaquePointer
private var prevInput: [llama_token] = []
private let n_len: Int32 = 512
init(model: OpaquePointer, context: OpaquePointer) {
init(model: OpaquePointer, context: OpaquePointer, vocab: OpaquePointer) {
self.model = model
self.context = context
self.vocab = vocab
}
deinit {
llama_free(context)
llama_free_model(model)
llama_model_free(model)
llama_backend_free()
}
@ -90,10 +94,9 @@ final class ZenzContext {
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
debug("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 512
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
ctx_params.n_batch = 512
return ctx_params
}
@ -102,24 +105,30 @@ final class ZenzContext {
llama_backend_init()
var model_params = llama_model_default_params()
model_params.use_mmap = true
let model = llama_load_model_from_file(path, model_params)
let model = llama_model_load_from_file(path, model_params)
guard let model else {
debug("Could not load model at \(path)")
throw ZenzError.couldNotLoadModel(path: path)
}
let context = llama_new_context_with_model(model, ctx_params)
let context = llama_init_from_model(model, ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
}
return ZenzContext(model: model, context: context)
let vocab = llama_model_get_vocab(model)
guard let vocab else {
debug("Could not load vocab!")
throw ZenzError.couldNotLoadVocab
}
return ZenzContext(model: model, context: context, vocab: vocab)
}
func reset_context() throws {
llama_free(self.context)
let context = llama_new_context_with_model(self.model, Self.ctx_params)
let context = llama_init_from_model(self.model, Self.ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
@ -157,7 +166,7 @@ final class ZenzContext {
return .nan
}
let tokenizedPromptCount = ignorePrompt.isEmpty ? 1 : tokenize(text: ignorePrompt, add_bos: true, add_eos: false).count
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
var sum: Float = 0
//
@ -202,14 +211,14 @@ final class ZenzContext {
func pure_greedy_decoding(leftSideContext: String, maxCount: Int = .max) -> String {
var prompt_tokens = self.tokenize(text: leftSideContext, add_bos: false)
let initial_count = prompt_tokens.count
let eos_token = llama_token_eos(model)
let eos_token = llama_vocab_eos(vocab)
while prompt_tokens.count - initial_count < maxCount {
let startOffset = prompt_tokens.count - 1
guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
print("logits unavailable")
debug("logits unavailable")
return ""
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
// use a min-heap to keep the n-best tokens
@ -249,11 +258,11 @@ final class ZenzContext {
let startOffset = prompt_tokens.count - 1
guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
print("logits unavailable")
debug("logits unavailable")
return []
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
var exp_sum: Float = 0
let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
@ -293,7 +302,7 @@ final class ZenzContext {
personalizationMode: (mode: ConvertRequestOptions.ZenzaiMode.PersonalizationMode, base: EfficientNGram, personal: EfficientNGram)?,
versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
) -> CandidateEvaluationResult {
print("Evaluate", candidate)
debug("Evaluate", candidate)
// For zenz-v1 model, \u{EE00} is a token used for 'start query', and \u{EE01} is a token used for 'start answer'
// We assume \u{EE01}\(candidate) is always splitted into \u{EE01}_\(candidate) by zenz-v1 tokenizer
var userDictionaryPrompt: String = ""
@ -383,12 +392,12 @@ final class ZenzContext {
let tokens = prompt_tokens + candidate_tokens
let startOffset = prompt_tokens.count - 1
let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
print("pos max:", pos_max)
debug("pos max:", pos_max)
guard let logits = self.get_logits(tokens: tokens, logits_start_index: startOffset) else {
debug("logits unavailable")
return .error
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
let is_learned_token: [(isLearned: Bool, priority: Float)] = Array(repeating: (false, 0), count: prompt_tokens.count) + candidate.data.flatMap {
// priority
Array(repeating: ($0.metadata.contains(.isLearned), logf(getLearningPriority(data: $0))), count: self.tokenize(text: $0.word, add_bos: false).count)
@ -456,12 +465,12 @@ final class ZenzContext {
}
guard let maxItem = tokenHeap.max else {
print("Max Item could not be found for unknown reason")
debug("Max Item could not be found for unknown reason")
return .error
}
//
if maxItem.token != token_id {
if maxItem.token == llama_token_eos(model) {
if maxItem.token == llama_vocab_eos(vocab) {
var cchars = tokens[..<i].reduce(into: []) {
$0.append(contentsOf: token_to_piece(token: $1))
}
@ -524,15 +533,15 @@ final class ZenzContext {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = if tokenCount < 0 {
[llama_token_bos(model)]
[llama_vocab_bos(vocab)]
} else {
(0..<tokenCount).map {tokens[Int($0)]}
}
tokens.deallocate()
if add_eos {
swiftTokens.append(llama_token_eos(model))
swiftTokens.append(llama_vocab_eos(vocab))
}
return swiftTokens
}
@ -544,7 +553,7 @@ final class ZenzContext {
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8, false)
let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -552,11 +561,11 @@ final class ZenzContext {
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
let nNewTokens = llama_token_to_piece(vocab, token, newResult, Int32(-nTokens), 0, false)
let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
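
The changes above track the renamed llama.cpp C API: models are loaded with llama_model_load_from_file, contexts are created with llama_init_from_model, and every vocabulary query (llama_vocab_n_tokens, llama_vocab_bos, llama_vocab_eos, llama_tokenize, llama_token_to_piece) now takes a llama_vocab handle obtained from llama_model_get_vocab. A condensed sketch of the new load path, mirroring the calls used in this file; the helper name, module import, and error handling are illustrative only:

import llama  // assumed module name for the llama.cpp bindings used by this package

func loadZenzModel(at path: String) throws -> (model: OpaquePointer, context: OpaquePointer, vocab: OpaquePointer) {
    llama_backend_init()
    var modelParams = llama_model_default_params()
    modelParams.use_mmap = true
    guard let model = llama_model_load_from_file(path, modelParams) else {
        throw ZenzError.couldNotLoadModel(path: path)
    }
    var ctxParams = llama_context_default_params()
    ctxParams.n_ctx = 512
    guard let context = llama_init_from_model(model, ctxParams) else {
        throw ZenzError.couldNotLoadContext
    }
    guard let vocab = llama_model_get_vocab(model) else {
        throw ZenzError.couldNotLoadVocab
    }
    // vocab-based queries replace the old model-based ones:
    let eos = llama_vocab_eos(vocab)          // formerly llama_token_eos(model)
    let nVocab = llama_vocab_n_tokens(vocab)  // formerly llama_n_vocab(model)
    _ = (eos, nVocab)
    return (model, context, vocab)
}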


@ -9,19 +9,20 @@ package typealias llama_seq_id = Int32
package struct llama_context_params {
package var seed: Int
package var n_ctx: Int
package var n_threads: UInt32
package var n_threads_batch: UInt32
package var n_threads: Int32
package var n_threads_batch: Int32
package var n_batch: Int
}
package func llama_context_default_params() -> llama_context_params { unimplemented() }
package typealias llama_context = OpaquePointer
package func llama_new_context_with_model(_ model: llama_model, _ ctx_params: llama_context_params) -> llama_context? { unimplemented() }
package func llama_init_from_model(_ model: llama_model, _ ctx_params: llama_context_params) -> llama_context? { unimplemented() }
package func llama_free(_ context: llama_context) {}
package typealias llama_model = OpaquePointer
package typealias llama_vocab = OpaquePointer
package func llama_free_model(_ model: llama_model) {}
package func llama_model_free(_ model: llama_model) {}
package func llama_backend_init() {}
package func llama_backend_free() {}
@ -31,7 +32,9 @@ package struct llama_model_params {
}
package func llama_model_default_params() -> llama_model_params { unimplemented() }
package func llama_load_model_from_file(_ path: String, _ model_params: llama_model_params) -> llama_model? { unimplemented() }
package func llama_model_get_vocab(_ model: llama_model) -> llama_vocab? { unimplemented() }
package func llama_model_load_from_file(_ path: String, _ model_params: llama_model_params) -> llama_model? { unimplemented() }
package func llama_kv_cache_seq_rm(_ ctx: llama_context, _ seq_id: llama_seq_id, _ p0: llama_pos, _ p1: llama_pos) {}
package func llama_kv_cache_seq_pos_max(_ ctx: llama_context, _ seq_id: llama_seq_id) -> Int { unimplemented() }
@ -48,12 +51,12 @@ package struct llama_batch {
package func llama_batch_init(_ n_tokens: Int, _ embd: Int, _ n_seq_max: Int) -> llama_batch { unimplemented() }
package func llama_n_ctx(_ ctx: llama_context) -> Int { unimplemented() }
package func llama_n_vocab(_ model: llama_model) -> Int { unimplemented() }
package func llama_vocab_n_tokens(_ vocab: llama_vocab) -> Int { unimplemented() }
package func llama_tokenize(_ model: llama_model, _ text: String, _ text_len: Int32, _ tokens: UnsafeMutablePointer<llama_token>, _ n_tokens_max: Int32, _ add_special: Bool, _ parse_special: Bool) -> Int { unimplemented() }
package func llama_token_bos(_ model: llama_model) -> llama_token { unimplemented() }
package func llama_token_eos(_ model: llama_model) -> llama_token { unimplemented() }
package func llama_token_to_piece(_ model: llama_model, _ token: llama_token, _ buf: UnsafeMutablePointer<Int8>, _ length: Int32, _ special: Bool) -> Int32 { unimplemented() }
package func llama_vocab_eos(_ vocab: llama_vocab) -> llama_token { unimplemented() }
package func llama_vocab_bos(_ vocab: llama_vocab) -> llama_token { unimplemented() }
package func llama_token_to_piece(_ vocab: llama_vocab, _ token: llama_token, _ buf: UnsafeMutablePointer<Int8>, _ length: Int32, _ lstrip: Int32, _ special: Bool) -> Int32 { unimplemented() }
package func llama_decode(_ ctx: llama_context, _ batch: llama_batch) -> Int { unimplemented() }
package func llama_get_logits(_ ctx: llama_context) -> UnsafeMutablePointer<Float>? { unimplemented() }
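
These mock declarations only need to satisfy the type checker when the LLAMA_MOCK=1 build is used; every body is empty or calls unimplemented(). The helper itself is defined elsewhere in the mock target, but a plausible shape (an assumption for illustration, not the actual definition) is:

// Assumed shape of the mock's unimplemented() helper: trap if any llama API is actually called.
package func unimplemented(_ function: String = #function, file: StaticString = #file, line: UInt = #line) -> Never {
    fatalError("llama-mock: \(function) is not implemented; build without LLAMA_MOCK=1 to link the real llama.cpp", file: file, line: line)
}

Because it returns Never, a stub such as `package func llama_n_ctx(_ ctx: llama_context) -> Int { unimplemented() }` type-checks without providing a return value.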


@ -1,4 +1,3 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once
#include "ggml.h"
@ -8,8 +7,8 @@ extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator
struct ggml_tallocr {
@ -25,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
@ -74,4 +73,4 @@ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml
#ifdef __cplusplus
}
#endif
#endif


@ -1,9 +1,22 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define GGML_BACKEND_API __declspec(dllexport) extern
# else
# define GGML_BACKEND_API __declspec(dllimport) extern
# endif
# else
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
# endif
#else
# define GGML_BACKEND_API extern
#endif
#ifdef __cplusplus
extern "C" {
#endif
@ -13,42 +26,52 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;
//
// Backend buffer type
//
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend
// Backend (stream)
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@ -63,8 +86,10 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -74,11 +99,11 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
@ -86,51 +111,132 @@ extern "C" {
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
//
// CPU backend
// Events
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
//
// Backend device
//
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
// functionality supported by the device
struct ggml_backend_dev_caps {
// asynchronous operations
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
// all the device properties
struct ggml_backend_dev_props {
const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
//
// Backend (reg)
//
GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Backend registry
//
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
// Device enumeration
GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
//
// Backend scheduler
//
// The backend scheduler allows for multiple backends to be used together
// The backend scheduler allows for multiple backend devices to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
@ -154,20 +260,26 @@ extern "C" {
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via ggml_backend_alloc_ctx_tensors
}
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
@ -176,12 +288,15 @@ extern "C" {
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
@ -193,12 +308,14 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
@ -219,16 +336,19 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef __cplusplus
}
#endif
#endif


@ -0,0 +1,39 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <memory>
// Smart pointers for ggml types
// ggml
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
// ggml-alloc
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;


@ -0,0 +1,137 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
//
// system info
//
// x86
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_sme (void);
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_BACKEND_API void ggml_cpu_init(void);
//
// CPU backend
//
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef __cplusplus
}
#endif


@ -0,0 +1,66 @@
// Note: this description is outdated
//
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
//
// How it works?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
// user-code should use only these functions
//
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large.

File diff suppressed because it is too large.