Merge pull request #157 from fkunn1326/latest_llama

feat: Support the latest llama.cpp
Authored by Miwa, committed by GitHub on 2025-03-18 23:03:14 +09:00
13 changed files with 1607 additions and 1266 deletions


@ -14,10 +14,19 @@ jobs:
with:
submodules: true
- name: Download pre-built llama.cpp binaries
run: |
wget -O llama-cpp-bin.zip "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-ubuntu-x64.zip"
unzip llama-cpp-bin.zip -d llama-cpp-bin
- name: Copy llama.cpp binaries
run: cp llama-cpp-bin/build/bin/lib*.so ./
- name: Build and Test in DevContainer
uses: devcontainers/ci@v0.3
with:
push: never
runCmd: |
swift build -Xswiftc -strict-concurrency=complete -v
swift test -c release -Xswiftc -strict-concurrency=complete -v
swift build -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
cp llama-cpp-bin/build/bin/lib*.so .build/*/release/
swift test -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v


@ -39,10 +39,18 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- name: Download pre-built llama.cpp binaries
run: |
wget -O llama-cpp-bin.zip "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-ubuntu-x64.zip"
unzip llama-cpp-bin.zip -d llama-cpp-bin
- name: Copy llama.cpp binaries
run: cp llama-cpp-bin/build/bin/lib*.so ./
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -v
run: swift build -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
- name: Run tests
run: swift test -c release -Xswiftc -strict-concurrency=complete -v
run: |
cp llama-cpp-bin/build/bin/lib*.so .build/*/release/
swift test -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
windows-build:
name: Swift ${{ matrix.swift-version.tag }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
@ -62,18 +70,20 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- name: Clone llama.cpp
run: git clone -b ku-nlp/gpt2-japanese-char https://github.com/ensan-hcl/llama.cpp.git
- name: Build llama.cpp
- name: Download pre-built llama.cpp binaries
shell: pwsh
run: |
cmake -B build -DBUILD_SHARED_LIBS=ON
cmake --build build --config Release
working-directory: ./llama.cpp
- name: Copy built files
$zipUrl = "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-win-avx-x64.zip"
$zipPath = "llama-cpp-bin.zip"
Invoke-WebRequest -Uri $zipUrl -OutFile $zipPath
Expand-Archive -Path $zipPath -DestinationPath llama-cpp-bin
- name: Copy llama.cpp binaries
shell: pwsh
run: |
cp ./build/bin/Release/llama.dll ../
cp ./build/Release/llama.lib ../
working-directory: ./llama.cpp
Copy-Item -Path "llama-cpp-bin/llama.dll" -Destination "./"
Copy-Item -Path "llama-cpp-bin/llama.lib" -Destination "./"
Copy-Item -Path "llama-cpp-bin/ggml.dll" -Destination "./"
Copy-Item -Path "llama-cpp-bin/ggml-*.dll" -Destination "./"
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -v
- name: Run tests


@ -139,23 +139,6 @@ if checkObjcAvailability() {
}
#endif
#if os(Windows)
targets.append(contentsOf: [
.systemLibrary(
name: "llama.cpp"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"llama.cpp",
"EfficientNGram",
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#else
if let envValue = ProcessInfo.processInfo.environment["LLAMA_MOCK"], envValue == "1" {
targets.append(contentsOf: [
.target(name: "llama-mock"),
@ -171,24 +154,43 @@ if let envValue = ProcessInfo.processInfo.environment["LLAMA_MOCK"], envValue ==
)
])
} else {
dependencies.append(
.package(url: "https://github.com/ensan-hcl/llama.cpp", branch: "6b862f4")
)
#if os(Windows) || os(Linux)
targets.append(contentsOf: [
.systemLibrary(
name: "llama.cpp"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"llama.cpp",
"EfficientNGram",
.product(name: "llama", package: "llama.cpp"),
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#else
targets.append(contentsOf: [
.binaryTarget(
name: "llama.cpp",
url: "https://github.com/fkunn1326/llama.cpp/releases/download/b4844/llama-b4844-xcframework.zip",
// this can be computed with `swift package compute-checksum llama-b4844-xcframework.zip`
checksum: "40bd1e58e727511649e13a6de9eb577ea8be78fe4183c2e1b382b12054849f05"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"EfficientNGram",
"llama.cpp",
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#endif
}
#endif
let package = Package(
name: "AzooKeyKanakanjiConverter",


@ -62,7 +62,7 @@ extension Kana2Kanji {
versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
) -> (result: LatticeNode, nodes: Nodes, cache: ZenzaiCache) {
var constraint = zenzaiCache?.getNewConstraint(for: inputData) ?? PrefixConstraint([])
print("initial constraint", constraint)
debug("initial constraint", constraint)
let eosNode = LatticeNode.EOSNode
var nodes: Kana2Kanji.Nodes = []
var constructedCandidates: [(RegisteredNode, Candidate)] = []
@ -96,19 +96,19 @@ extension Kana2Kanji {
}
}
guard var (index, candidate) = best else {
print("best was not found!")
debug("best was not found!")
// Empty
//
return (eosNode, nodes, ZenzaiCache(inputData, constraint: PrefixConstraint([]), satisfyingCandidate: nil))
}
print("Constrained draft modeling", -start.timeIntervalSinceNow)
debug("Constrained draft modeling", -start.timeIntervalSinceNow)
reviewLoop: while true {
// results
// N-Best
insertedCandidates.insert((draftResult.result.prevs[index], candidate), at: 0)
if inferenceLimit == 0 {
print("inference limit! \(candidate.text) is used for excuse")
debug("inference limit! \(candidate.text) is used for excuse")
// When inference has run more than the maximum number of times, just return the result at this point
return (eosNode, nodes, ZenzaiCache(inputData, constraint: constraint, satisfyingCandidate: candidate))
}
@ -187,25 +187,25 @@ extension Kana2Kanji {
switch reviewResult {
case .error:
//
print("error")
debug("error")
return .return(constraint: constraint, alternativeConstraints: [], satisfied: false)
case .pass(let score, let alternativeConstraints):
//
print("passed:", score)
debug("passed:", score)
return .return(constraint: constraint, alternativeConstraints: alternativeConstraints, satisfied: true)
case .fixRequired(let prefixConstraint):
// 2
if constraint.constraint == prefixConstraint {
print("same constraint:", prefixConstraint)
debug("same constraint:", prefixConstraint)
return .return(constraint: PrefixConstraint([]), alternativeConstraints: [], satisfied: false)
}
//
constraint = PrefixConstraint(prefixConstraint)
print("update constraint:", constraint)
debug("update constraint:", constraint)
// if another existing candidate already satisfies the new constraint, reuse it for the retry
for (i, candidate) in candidates.indexed() where i != candidateIndex {
if candidate.text.utf8.hasPrefix(prefixConstraint) && self.heuristicRetryValidation(candidate.text) {
print("found \(candidate.text) as another retry")
debug("found \(candidate.text) as another retry")
return .retry(candidateIndex: i)
}
}
@ -214,16 +214,16 @@ extension Kana2Kanji {
let newConstraint = PrefixConstraint(Array(wholeConstraint.utf8), hasEOS: true)
// 2
if constraint == newConstraint {
print("same constraint:", constraint)
debug("same constraint:", constraint)
return .return(constraint: PrefixConstraint([]), alternativeConstraints: [], satisfied: false)
}
//
print("update whole constraint:", wholeConstraint)
debug("update whole constraint:", wholeConstraint)
constraint = PrefixConstraint(Array(wholeConstraint.utf8), hasEOS: true)
// if another existing candidate already matches the whole constraint, reuse it for the retry
for (i, candidate) in candidates.indexed() where i != candidateIndex {
if candidate.text == wholeConstraint && self.heuristicRetryValidation(candidate.text) {
print("found \(candidate.text) as another retry")
debug("found \(candidate.text) as another retry")
return .retry(candidateIndex: i)
}
}
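
Throughout this file the verbose Zenzai tracing switches from print to the package's debug helper, so constraint-update logs like the ones above stay out of release builds of host apps. A rough sketch of what such a helper looks like (an assumption for illustration; the real function comes from SwiftUtils and may format or route output differently):

// Hypothetical sketch of a build-configuration-gated logger; not the actual SwiftUtils implementation.
func debug(_ items: Any...) {
    #if DEBUG
    print(items.map { String(describing: $0) }.joined(separator: " "))
    #endif
}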


@ -59,11 +59,13 @@ struct FixedSizeHeap<Element: Comparable> {
enum ZenzError: LocalizedError {
case couldNotLoadModel(path: String)
case couldNotLoadContext
case couldNotLoadVocab
var errorDescription: String? {
switch self {
case .couldNotLoadContext: "failed to load context"
case .couldNotLoadModel(path: let path): "could not load model weight at \(path)"
case .couldNotLoadContext: return "failed to load context"
case .couldNotLoadModel(path: let path): return "could not load model weight at \(path)"
case .couldNotLoadVocab: return "failed to load vocab"
}
}
}
@ -71,18 +73,20 @@ enum ZenzError: LocalizedError {
final class ZenzContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var vocab: OpaquePointer
private var prevInput: [llama_token] = []
private let n_len: Int32 = 512
init(model: OpaquePointer, context: OpaquePointer) {
init(model: OpaquePointer, context: OpaquePointer, vocab: OpaquePointer) {
self.model = model
self.context = context
self.vocab = vocab
}
deinit {
llama_free(context)
llama_free_model(model)
llama_model_free(model)
llama_backend_free()
}
@ -90,10 +94,9 @@ final class ZenzContext {
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
debug("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 512
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
ctx_params.n_batch = 512
return ctx_params
}
@ -102,24 +105,30 @@ final class ZenzContext {
llama_backend_init()
var model_params = llama_model_default_params()
model_params.use_mmap = true
let model = llama_load_model_from_file(path, model_params)
let model = llama_model_load_from_file(path, model_params)
guard let model else {
debug("Could not load model at \(path)")
throw ZenzError.couldNotLoadModel(path: path)
}
let context = llama_new_context_with_model(model, ctx_params)
let context = llama_init_from_model(model, ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
}
return ZenzContext(model: model, context: context)
let vocab = llama_model_get_vocab(model)
guard let vocab else {
debug("Could not load vocab!")
throw ZenzError.couldNotLoadVocab
}
return ZenzContext(model: model, context: context, vocab: vocab)
}
func reset_context() throws {
llama_free(self.context)
let context = llama_new_context_with_model(self.model, Self.ctx_params)
let context = llama_init_from_model(self.model, Self.ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
@ -157,7 +166,7 @@ final class ZenzContext {
return .nan
}
let tokenizedPromptCount = ignorePrompt.isEmpty ? 1 : tokenize(text: ignorePrompt, add_bos: true, add_eos: false).count
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
var sum: Float = 0
//
@ -202,14 +211,14 @@ final class ZenzContext {
func pure_greedy_decoding(leftSideContext: String, maxCount: Int = .max) -> String {
var prompt_tokens = self.tokenize(text: leftSideContext, add_bos: false)
let initial_count = prompt_tokens.count
let eos_token = llama_token_eos(model)
let eos_token = llama_vocab_eos(vocab)
while prompt_tokens.count - initial_count < maxCount {
let startOffset = prompt_tokens.count - 1
guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
print("logits unavailable")
debug("logits unavailable")
return ""
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
// use a min-heap to keep the n-best tokens
@ -249,11 +258,11 @@ final class ZenzContext {
let startOffset = prompt_tokens.count - 1
guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
print("logits unavailable")
debug("logits unavailable")
return []
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
var exp_sum: Float = 0
let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
@ -293,7 +302,7 @@ final class ZenzContext {
personalizationMode: (mode: ConvertRequestOptions.ZenzaiMode.PersonalizationMode, base: EfficientNGram, personal: EfficientNGram)?,
versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
) -> CandidateEvaluationResult {
print("Evaluate", candidate)
debug("Evaluate", candidate)
// For zenz-v1 model, \u{EE00} is a token used for 'start query', and \u{EE01} is a token used for 'start answer'
// We assume \u{EE01}\(candidate) is always splitted into \u{EE01}_\(candidate) by zenz-v1 tokenizer
var userDictionaryPrompt: String = ""
@ -383,12 +392,12 @@ final class ZenzContext {
let tokens = prompt_tokens + candidate_tokens
let startOffset = prompt_tokens.count - 1
let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
print("pos max:", pos_max)
debug("pos max:", pos_max)
guard let logits = self.get_logits(tokens: tokens, logits_start_index: startOffset) else {
debug("logits unavailable")
return .error
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
let is_learned_token: [(isLearned: Bool, priority: Float)] = Array(repeating: (false, 0), count: prompt_tokens.count) + candidate.data.flatMap {
// priority
Array(repeating: ($0.metadata.contains(.isLearned), logf(getLearningPriority(data: $0))), count: self.tokenize(text: $0.word, add_bos: false).count)
@ -456,12 +465,12 @@ final class ZenzContext {
}
guard let maxItem = tokenHeap.max else {
print("Max Item could not be found for unknown reason")
debug("Max Item could not be found for unknown reason")
return .error
}
//
if maxItem.token != token_id {
if maxItem.token == llama_token_eos(model) {
if maxItem.token == llama_vocab_eos(vocab) {
var cchars = tokens[..<i].reduce(into: []) {
$0.append(contentsOf: token_to_piece(token: $1))
}
@ -524,15 +533,15 @@ final class ZenzContext {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = if tokenCount < 0 {
[llama_token_bos(model)]
[llama_vocab_bos(vocab)]
} else {
(0..<tokenCount).map {tokens[Int($0)]}
}
tokens.deallocate()
if add_eos {
swiftTokens.append(llama_token_eos(model))
swiftTokens.append(llama_vocab_eos(vocab))
}
return swiftTokens
}
@ -544,7 +553,7 @@ final class ZenzContext {
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8, false)
let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -552,11 +561,11 @@ final class ZenzContext {
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
let nNewTokens = llama_token_to_piece(vocab, token, newResult, Int32(-nTokens), 0, false)
let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
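
The changes above track the renamed llama.cpp C API: models are loaded with llama_model_load_from_file, contexts are created with llama_init_from_model, and every vocabulary query (llama_vocab_n_tokens, llama_vocab_bos, llama_vocab_eos, llama_tokenize, llama_token_to_piece) now takes a llama_vocab handle obtained from llama_model_get_vocab. A condensed sketch of the new load path, mirroring the calls used in this file; the helper name, module import, and error handling are illustrative only:

import llama  // assumed module name for the llama.cpp bindings used by this package

func loadZenzModel(at path: String) throws -> (model: OpaquePointer, context: OpaquePointer, vocab: OpaquePointer) {
    llama_backend_init()
    var modelParams = llama_model_default_params()
    modelParams.use_mmap = true
    guard let model = llama_model_load_from_file(path, modelParams) else {
        throw ZenzError.couldNotLoadModel(path: path)
    }
    var ctxParams = llama_context_default_params()
    ctxParams.n_ctx = 512
    guard let context = llama_init_from_model(model, ctxParams) else {
        throw ZenzError.couldNotLoadContext
    }
    guard let vocab = llama_model_get_vocab(model) else {
        throw ZenzError.couldNotLoadVocab
    }
    // vocab-based queries replace the old model-based ones:
    let eos = llama_vocab_eos(vocab)          // formerly llama_token_eos(model)
    let nVocab = llama_vocab_n_tokens(vocab)  // formerly llama_n_vocab(model)
    _ = (eos, nVocab)
    return (model, context, vocab)
}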


@ -9,19 +9,20 @@ package typealias llama_seq_id = Int32
package struct llama_context_params {
package var seed: Int
package var n_ctx: Int
package var n_threads: UInt32
package var n_threads_batch: UInt32
package var n_threads: Int32
package var n_threads_batch: Int32
package var n_batch: Int
}
package func llama_context_default_params() -> llama_context_params { unimplemented() }
package typealias llama_context = OpaquePointer
package func llama_new_context_with_model(_ model: llama_model, _ ctx_params: llama_context_params) -> llama_context? { unimplemented() }
package func llama_init_from_model(_ model: llama_model, _ ctx_params: llama_context_params) -> llama_context? { unimplemented() }
package func llama_free(_ context: llama_context) {}
package typealias llama_model = OpaquePointer
package typealias llama_vocab = OpaquePointer
package func llama_free_model(_ model: llama_model) {}
package func llama_model_free(_ model: llama_model) {}
package func llama_backend_init() {}
package func llama_backend_free() {}
@ -31,7 +32,9 @@ package struct llama_model_params {
}
package func llama_model_default_params() -> llama_model_params { unimplemented() }
package func llama_load_model_from_file(_ path: String, _ model_params: llama_model_params) -> llama_model? { unimplemented() }
package func llama_model_get_vocab(_ model: llama_model) -> llama_vocab? { unimplemented() }
package func llama_model_load_from_file(_ path: String, _ model_params: llama_model_params) -> llama_model? { unimplemented() }
package func llama_kv_cache_seq_rm(_ ctx: llama_context, _ seq_id: llama_seq_id, _ p0: llama_pos, _ p1: llama_pos) {}
package func llama_kv_cache_seq_pos_max(_ ctx: llama_context, _ seq_id: llama_seq_id) -> Int { unimplemented() }
@ -48,12 +51,12 @@ package struct llama_batch {
package func llama_batch_init(_ n_tokens: Int, _ embd: Int, _ n_seq_max: Int) -> llama_batch { unimplemented() }
package func llama_n_ctx(_ ctx: llama_context) -> Int { unimplemented() }
package func llama_n_vocab(_ model: llama_model) -> Int { unimplemented() }
package func llama_vocab_n_tokens(_ vocab: llama_vocab) -> Int { unimplemented() }
package func llama_tokenize(_ model: llama_model, _ text: String, _ text_len: Int32, _ tokens: UnsafeMutablePointer<llama_token>, _ n_tokens_max: Int32, _ add_special: Bool, _ parse_special: Bool) -> Int { unimplemented() }
package func llama_token_bos(_ model: llama_model) -> llama_token { unimplemented() }
package func llama_token_eos(_ model: llama_model) -> llama_token { unimplemented() }
package func llama_token_to_piece(_ model: llama_model, _ token: llama_token, _ buf: UnsafeMutablePointer<Int8>, _ length: Int32, _ special: Bool) -> Int32 { unimplemented() }
package func llama_vocab_eos(_ vocab: llama_vocab) -> llama_token { unimplemented() }
package func llama_vocab_bos(_ vocab: llama_vocab) -> llama_token { unimplemented() }
package func llama_token_to_piece(_ vocab: llama_vocab, _ token: llama_token, _ buf: UnsafeMutablePointer<Int8>, _ length: Int32, _ lstrip: Int32, _ special: Bool) -> Int32 { unimplemented() }
package func llama_decode(_ ctx: llama_context, _ batch: llama_batch) -> Int { unimplemented() }
package func llama_get_logits(_ ctx: llama_context) -> UnsafeMutablePointer<Float>? { unimplemented() }
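
These mock declarations only need to satisfy the type checker when the LLAMA_MOCK=1 build is used; every body is empty or calls unimplemented(). The helper itself is defined elsewhere in the mock target, but a plausible shape (an assumption for illustration, not the actual definition) is:

// Assumed shape of the mock's unimplemented() helper: trap if any llama API is actually called.
package func unimplemented(_ function: String = #function, file: StaticString = #file, line: UInt = #line) -> Never {
    fatalError("llama-mock: \(function) is not implemented; build without LLAMA_MOCK=1 to link the real llama.cpp", file: file, line: line)
}

Because it returns Never, a stub such as `package func llama_n_ctx(_ ctx: llama_context) -> Int { unimplemented() }` type-checks without providing a return value.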


@ -1,4 +1,3 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once
#include "ggml.h"
@ -8,8 +7,8 @@ extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator
struct ggml_tallocr {
@ -25,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
@ -74,4 +73,4 @@ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml
#ifdef __cplusplus
}
#endif
#endif


@ -1,9 +1,22 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define GGML_BACKEND_API __declspec(dllexport) extern
# else
# define GGML_BACKEND_API __declspec(dllimport) extern
# endif
# else
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
# endif
#else
# define GGML_BACKEND_API extern
#endif
#ifdef __cplusplus
extern "C" {
#endif
@ -13,42 +26,52 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;
//
// Backend buffer type
//
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend
// Backend (stream)
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@ -63,8 +86,10 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -74,11 +99,11 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
@ -86,51 +111,132 @@ extern "C" {
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
//
// CPU backend
// Events
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
//
// Backend device
//
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
// functionality supported by the device
struct ggml_backend_dev_caps {
// asynchronous operations
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
// all the device properties
struct ggml_backend_dev_props {
const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
//
// Backend (reg)
//
GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Backend registry
//
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
// Device enumeration
GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
//
// Backend scheduler
//
// The backend scheduler allows for multiple backends to be used together
// The backend scheduler allows for multiple backend devices to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
@ -154,20 +260,26 @@ extern "C" {
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via ggml_backend_alloc_ctx_tensors
}
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
@ -176,12 +288,15 @@ extern "C" {
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
@ -193,12 +308,14 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
@ -219,16 +336,19 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef __cplusplus
}
#endif
#endif


@ -0,0 +1,39 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <memory>
// Smart pointers for ggml types
// ggml
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
// ggml-alloc
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;


@ -0,0 +1,137 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
//
// system info
//
// x86
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_sme (void);
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_BACKEND_API void ggml_cpu_init(void);
//
// CPU backend
//
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef __cplusplus
}
#endif


@ -0,0 +1,66 @@
// Note: this description is outdated
//
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
//
// How it works?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
// user-code should use only these functions
//
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large.

File diff suppressed because it is too large.