Mirror of https://github.com/mii443/AzooKeyKanaKanjiConverter.git, synced 2025-08-22 15:05:26 +00:00
Merge pull request #157 from fkunn1326/latest_llama
feat: support the latest llama.cpp
.github/workflows/swift-in-devcontainer.yml (vendored, 13 lines changed)
@@ -14,10 +14,19 @@ jobs:
with:
submodules: true

- name: Download pre-built llama.cpp binaries
run: |
wget -O llama-cpp-bin.zip "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-ubuntu-x64.zip"
unzip llama-cpp-bin.zip -d llama-cpp-bin

- name: Copy llama.cpp binaries
run: cp llama-cpp-bin/build/bin/lib*.so ./

- name: Build and Test in DevContainer
uses: devcontainers/ci@v0.3
with:
push: never
runCmd: |
swift build -Xswiftc -strict-concurrency=complete -v
swift test -c release -Xswiftc -strict-concurrency=complete -v
swift build -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
cp llama-cpp-bin/build/bin/lib*.so .build/*/release/
swift test -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
.github/workflows/swift.yml (vendored, 34 lines changed)
@@ -39,10 +39,18 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- name: Download pre-built llama.cpp binaries
run: |
wget -O llama-cpp-bin.zip "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-ubuntu-x64.zip"
unzip llama-cpp-bin.zip -d llama-cpp-bin
- name: Copy llama.cpp binaries
run: cp llama-cpp-bin/build/bin/lib*.so ./
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -v
run: swift build -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
- name: Run tests
run: swift test -c release -Xswiftc -strict-concurrency=complete -v
run: |
cp llama-cpp-bin/build/bin/lib*.so .build/*/release/
swift test -c release -Xswiftc -strict-concurrency=complete -Xlinker -L./ -v
windows-build:
name: Swift ${{ matrix.swift-version.tag }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
@@ -62,18 +70,20 @@
- uses: actions/checkout@v4
with:
submodules: true
- name: Clone llama.cpp
run: git clone -b ku-nlp/gpt2-japanese-char https://github.com/ensan-hcl/llama.cpp.git
- name: Build llama.cpp
- name: Download pre-built llama.cpp binaries
shell: pwsh
run: |
cmake -B build -DBUILD_SHARED_LIBS=ON
cmake --build build --config Release
working-directory: ./llama.cpp
- name: Copy built files
$zipUrl = "https://github.com/fkunn1326/llama.cpp/releases/download/b4846/llama-b4846-bin-win-avx-x64.zip"
$zipPath = "llama-cpp-bin.zip"
Invoke-WebRequest -Uri $zipUrl -OutFile $zipPath
Expand-Archive -Path $zipPath -DestinationPath llama-cpp-bin
- name: Copy llama.cpp binaries
shell: pwsh
run: |
cp ./build/bin/Release/llama.dll ../
cp ./build/Release/llama.lib ../
working-directory: ./llama.cpp
Copy-Item -Path "llama-cpp-bin/llama.dll" -Destination "./"
Copy-Item -Path "llama-cpp-bin/llama.lib" -Destination "./"
Copy-Item -Path "llama-cpp-bin/ggml.dll" -Destination "./"
Copy-Item -Path "llama-cpp-bin/ggml-*.dll" -Destination "./"
- name: Build
run: swift build -Xswiftc -strict-concurrency=complete -v
- name: Run tests
Package.swift
@@ -139,23 +139,6 @@ if checkObjcAvailability() {
}
#endif

#if os(Windows)
targets.append(contentsOf: [
.systemLibrary(
name: "llama.cpp"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"llama.cpp",
"EfficientNGram",
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#else
if let envValue = ProcessInfo.processInfo.environment["LLAMA_MOCK"], envValue == "1" {
targets.append(contentsOf: [
.target(name: "llama-mock"),
@@ -171,24 +154,43 @@ if let envValue = ProcessInfo.processInfo.environment["LLAMA_MOCK"], envValue ==
)
])
} else {
dependencies.append(
.package(url: "https://github.com/ensan-hcl/llama.cpp", branch: "6b862f4")
)

#if os(Windows) || os(Linux)
targets.append(contentsOf: [
.systemLibrary(
name: "llama.cpp"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"llama.cpp",
"EfficientNGram",
.product(name: "llama", package: "llama.cpp"),
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#else
targets.append(contentsOf: [
.binaryTarget(
name: "llama.cpp",
url: "https://github.com/fkunn1326/llama.cpp/releases/download/b4844/llama-b4844-xcframework.zip",
// this can be computed `swift package compute-checksum llama-b4844-xcframework.zip`
checksum: "40bd1e58e727511649e13a6de9eb577ea8be78fe4183c2e1b382b12054849f05"
),
.target(
name: "KanaKanjiConverterModule",
dependencies: [
"SwiftUtils",
"EfficientNGram",
"llama.cpp",
.product(name: "Collections", package: "swift-collections"),
],
swiftSettings: swiftSettings
)
])
#endif
}
#endif

let package = Package(
name: "AzooKeyKanakanjiConverter",
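To make the branching above easier to follow, here is a minimal sketch of the same idea: on Windows and Linux the prebuilt llama.cpp shared library is wired in as a system-library target, while Apple platforms consume the prebuilt xcframework as a binary target. The target name, URL, and checksum are taken from the diff; the bare-bones manifest skeleton around them is illustrative only, not the project's actual Package.swift.

// swift-tools-version:5.9
// Illustrative manifest skeleton; only the llama.cpp wiring mirrors the diff above.
import PackageDescription

var targets: [Target] = []

#if os(Windows) || os(Linux)
// Link against the prebuilt shared library (downloaded in CI) via a system-library target.
targets.append(.systemLibrary(name: "llama.cpp"))
#else
// Apple platforms: fetch the prebuilt xcframework as a binary target.
targets.append(.binaryTarget(
    name: "llama.cpp",
    url: "https://github.com/fkunn1326/llama.cpp/releases/download/b4844/llama-b4844-xcframework.zip",
    // Checksum as computed with `swift package compute-checksum llama-b4844-xcframework.zip`.
    checksum: "40bd1e58e727511649e13a6de9eb577ea8be78fe4183c2e1b382b12054849f05"
))
#endif

let package = Package(
    name: "AzooKeyKanakanjiConverter",
    targets: targets
)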
@@ -62,7 +62,7 @@ extension Kana2Kanji {
versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
) -> (result: LatticeNode, nodes: Nodes, cache: ZenzaiCache) {
var constraint = zenzaiCache?.getNewConstraint(for: inputData) ?? PrefixConstraint([])
print("initial constraint", constraint)
debug("initial constraint", constraint)
let eosNode = LatticeNode.EOSNode
var nodes: Kana2Kanji.Nodes = []
var constructedCandidates: [(RegisteredNode, Candidate)] = []
@@ -96,19 +96,19 @@
}
}
guard var (index, candidate) = best else {
print("best was not found!")
debug("best was not found!")
// Emptyの場合
// 制約が満たせない場合は無視する
return (eosNode, nodes, ZenzaiCache(inputData, constraint: PrefixConstraint([]), satisfyingCandidate: nil))
}

print("Constrained draft modeling", -start.timeIntervalSinceNow)
debug("Constrained draft modeling", -start.timeIntervalSinceNow)
reviewLoop: while true {
// resultsを更新
// ここでN-Bestも並び変えていることになる
insertedCandidates.insert((draftResult.result.prevs[index], candidate), at: 0)
if inferenceLimit == 0 {
print("inference limit! \(candidate.text) is used for excuse")
debug("inference limit! \(candidate.text) is used for excuse")
// When inference occurs more than maximum times, then just return result at this point
return (eosNode, nodes, ZenzaiCache(inputData, constraint: constraint, satisfyingCandidate: candidate))
}
@@ -187,25 +187,25 @@ extension Kana2Kanji {
switch reviewResult {
case .error:
// 何らかのエラーが発生
print("error")
debug("error")
return .return(constraint: constraint, alternativeConstraints: [], satisfied: false)
case .pass(let score, let alternativeConstraints):
// 合格
print("passed:", score)
debug("passed:", score)
return .return(constraint: constraint, alternativeConstraints: alternativeConstraints, satisfied: true)
case .fixRequired(let prefixConstraint):
// 同じ制約が2回連続で出てきたら諦める
if constraint.constraint == prefixConstraint {
print("same constraint:", prefixConstraint)
debug("same constraint:", prefixConstraint)
return .return(constraint: PrefixConstraint([]), alternativeConstraints: [], satisfied: false)
}
// 制約が得られたので、更新する
constraint = PrefixConstraint(prefixConstraint)
print("update constraint:", constraint)
debug("update constraint:", constraint)
// もし制約を満たす候補があるならそれを使って再レビューチャレンジを戦うことで、推論を減らせる
for (i, candidate) in candidates.indexed() where i != candidateIndex {
if candidate.text.utf8.hasPrefix(prefixConstraint) && self.heuristicRetryValidation(candidate.text) {
print("found \(candidate.text) as another retry")
debug("found \(candidate.text) as another retry")
return .retry(candidateIndex: i)
}
}
@@ -214,16 +214,16 @@ extension Kana2Kanji {
let newConstraint = PrefixConstraint(Array(wholeConstraint.utf8), hasEOS: true)
// 同じ制約が2回連続で出てきたら諦める
if constraint == newConstraint {
print("same constraint:", constraint)
debug("same constraint:", constraint)
return .return(constraint: PrefixConstraint([]), alternativeConstraints: [], satisfied: false)
}
// 制約が得られたので、更新する
print("update whole constraint:", wholeConstraint)
debug("update whole constraint:", wholeConstraint)
constraint = PrefixConstraint(Array(wholeConstraint.utf8), hasEOS: true)
// もし制約を満たす候補があるならそれを使って再レビューチャレンジを戦うことで、推論を減らせる
for (i, candidate) in candidates.indexed() where i != candidateIndex {
if candidate.text == wholeConstraint && self.heuristicRetryValidation(candidate.text) {
print("found \(candidate.text) as another retry")
debug("found \(candidate.text) as another retry")
return .retry(candidateIndex: i)
}
}
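These hunks replace ad-hoc print() calls with the project's debug() helper, so the Zenzai constraint loop no longer writes unconditionally to standard output. The helper itself is not part of this diff; as a rough, hypothetical sketch of the pattern it follows (the real one lives elsewhere in the package and may differ):

// Hypothetical sketch of a conditional logging helper in the spirit of debug().
// Call sites look like print(), but output is compiled away outside DEBUG builds.
func debug(_ items: Any..., separator: String = " ") {
    #if DEBUG
    print(items.map { String(describing: $0) }.joined(separator: separator))
    #endif
}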
@@ -59,11 +59,13 @@ struct FixedSizeHeap<Element: Comparable> {
enum ZenzError: LocalizedError {
case couldNotLoadModel(path: String)
case couldNotLoadContext
case couldNotLoadVocab

var errorDescription: String? {
switch self {
case .couldNotLoadContext: "failed to load context"
case .couldNotLoadModel(path: let path): "could not load model weight at \(path)"
case .couldNotLoadContext: return "failed to load context"
case .couldNotLoadModel(path: let path): return "could not load model weight at \(path)"
case .couldNotLoadVocab: return "failed to load vocab"
}
}
}
@@ -71,18 +73,20 @@ enum ZenzError: LocalizedError {
final class ZenzContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var vocab: OpaquePointer
private var prevInput: [llama_token] = []

private let n_len: Int32 = 512

init(model: OpaquePointer, context: OpaquePointer) {
init(model: OpaquePointer, context: OpaquePointer, vocab: OpaquePointer) {
self.model = model
self.context = context
self.vocab = vocab
}

deinit {
llama_free(context)
llama_free_model(model)
llama_model_free(model)
llama_backend_free()
}

@@ -90,10 +94,9 @@ final class ZenzContext {
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
debug("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 512
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
ctx_params.n_batch = 512
return ctx_params
}
@@ -102,24 +105,30 @@ final class ZenzContext {
llama_backend_init()
var model_params = llama_model_default_params()
model_params.use_mmap = true
let model = llama_load_model_from_file(path, model_params)
let model = llama_model_load_from_file(path, model_params)
guard let model else {
debug("Could not load model at \(path)")
throw ZenzError.couldNotLoadModel(path: path)
}

let context = llama_new_context_with_model(model, ctx_params)
let context = llama_init_from_model(model, ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
}

return ZenzContext(model: model, context: context)
let vocab = llama_model_get_vocab(model)
guard let vocab else {
debug("Could not load vocab!")
throw ZenzError.couldNotLoadVocab
}

return ZenzContext(model: model, context: context, vocab: vocab)
}
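Taken together, the hunks above move ZenzContext to the newer llama.cpp loading API: llama_model_load_from_file replaces llama_load_model_from_file, llama_init_from_model replaces llama_new_context_with_model, the vocabulary becomes a separate handle obtained with llama_model_get_vocab, and teardown uses llama_model_free. A condensed sketch of the resulting lifecycle, using only calls that appear in this diff (error handling and parameter setup simplified; ctxParams stands in for the static ctx_params shown above):

// Condensed sketch of the new load path; not the project's verbatim code.
func loadZenzContext(path: String, ctxParams: llama_context_params) throws -> ZenzContext {
    llama_backend_init()
    var modelParams = llama_model_default_params()
    modelParams.use_mmap = true
    guard let model = llama_model_load_from_file(path, modelParams) else {
        throw ZenzError.couldNotLoadModel(path: path)
    }
    guard let context = llama_init_from_model(model, ctxParams) else {
        throw ZenzError.couldNotLoadContext
    }
    // The vocab handle now answers the token queries that used to go through the model.
    guard let vocab = llama_model_get_vocab(model) else {
        throw ZenzError.couldNotLoadVocab
    }
    return ZenzContext(model: model, context: context, vocab: vocab)
}
// Teardown mirrors this order: llama_free(context), llama_model_free(model), llama_backend_free().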
func reset_context() throws {
llama_free(self.context)
let context = llama_new_context_with_model(self.model, Self.ctx_params)
let context = llama_init_from_model(self.model, Self.ctx_params)
guard let context else {
debug("Could not load context!")
throw ZenzError.couldNotLoadContext
@@ -157,7 +166,7 @@ final class ZenzContext {
return .nan
}
let tokenizedPromptCount = ignorePrompt.isEmpty ? 1 : tokenize(text: ignorePrompt, add_bos: true, add_eos: false).count
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)

var sum: Float = 0
// 最初のプロンプト部分は無視する
@@ -202,14 +211,14 @@ final class ZenzContext {
func pure_greedy_decoding(leftSideContext: String, maxCount: Int = .max) -> String {
var prompt_tokens = self.tokenize(text: leftSideContext, add_bos: false)
let initial_count = prompt_tokens.count
let eos_token = llama_token_eos(model)
let eos_token = llama_vocab_eos(vocab)
while prompt_tokens.count - initial_count < maxCount {
let startOffset = prompt_tokens.count - 1
guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
print("logits unavailable")
debug("logits unavailable")
return ""
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
// Min-Heapを使用してn-bestを計算
@@ -249,11 +258,11 @@ final class ZenzContext {
let startOffset = prompt_tokens.count - 1

guard let logits = self.get_logits(tokens: prompt_tokens, logits_start_index: startOffset) else {
print("logits unavailable")
debug("logits unavailable")
return []
}

let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
var exp_sum: Float = 0
let startIndex = (prompt_tokens.count - 1 - startOffset) * Int(n_vocab)
let endIndex = (prompt_tokens.count - startOffset) * Int(n_vocab)
@@ -293,7 +302,7 @@ final class ZenzContext {
personalizationMode: (mode: ConvertRequestOptions.ZenzaiMode.PersonalizationMode, base: EfficientNGram, personal: EfficientNGram)?,
versionDependentConfig: ConvertRequestOptions.ZenzaiVersionDependentMode
) -> CandidateEvaluationResult {
print("Evaluate", candidate)
debug("Evaluate", candidate)
// For zenz-v1 model, \u{EE00} is a token used for 'start query', and \u{EE01} is a token used for 'start answer'
// We assume \u{EE01}\(candidate) is always splitted into \u{EE01}_\(candidate) by zenz-v1 tokenizer
var userDictionaryPrompt: String = ""
@@ -383,12 +392,12 @@ final class ZenzContext {
let tokens = prompt_tokens + candidate_tokens
let startOffset = prompt_tokens.count - 1
let pos_max = llama_kv_cache_seq_pos_max(self.context, 0)
print("pos max:", pos_max)
debug("pos max:", pos_max)
guard let logits = self.get_logits(tokens: tokens, logits_start_index: startOffset) else {
debug("logits unavailable")
return .error
}
let n_vocab = llama_n_vocab(model)
let n_vocab = llama_vocab_n_tokens(vocab)
let is_learned_token: [(isLearned: Bool, priority: Float)] = Array(repeating: (false, 0), count: prompt_tokens.count) + candidate.data.flatMap {
// priorityは文字数にする→文字数が長いほど優先される
Array(repeating: ($0.metadata.contains(.isLearned), logf(getLearningPriority(data: $0))), count: self.tokenize(text: $0.word, add_bos: false).count)
@@ -456,12 +465,12 @@ final class ZenzContext {
}

guard let maxItem = tokenHeap.max else {
print("Max Item could not be found for unknown reason")
debug("Max Item could not be found for unknown reason")
return .error
}
// ここで最も良い候補であったかをチェックする
if maxItem.token != token_id {
if maxItem.token == llama_token_eos(model) {
if maxItem.token == llama_vocab_eos(vocab) {
var cchars = tokens[..<i].reduce(into: []) {
$0.append(contentsOf: token_to_piece(token: $1))
}
@@ -524,15 +533,15 @@ final class ZenzContext {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = if tokenCount < 0 {
[llama_token_bos(model)]
[llama_vocab_bos(vocab)]
} else {
(0..<tokenCount).map {tokens[Int($0)]}
}
tokens.deallocate()
if add_eos {
swiftTokens.append(llama_token_eos(model))
swiftTokens.append(llama_vocab_eos(vocab))
}
return swiftTokens
}
@@ -544,7 +553,7 @@ final class ZenzContext {
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8, false)
let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)

if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -552,11 +561,11 @@ final class ZenzContext {
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
let nNewTokens = llama_token_to_piece(vocab, token, newResult, Int32(-nTokens), 0, false)
let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
let bufferPointer: UnsafeBufferPointer<Int8> = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
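The last two hunks switch the tokenizer plumbing from model-based to vocab-based calls: llama_tokenize, llama_vocab_bos/llama_vocab_eos (formerly llama_token_bos/llama_token_eos) and llama_token_to_piece now take the vocab handle, and llama_token_to_piece gains an lstrip argument ahead of the special flag. A compact sketch of BOS/EOS handling against the new signatures, mirroring the code above (buffer sizing simplified; assumes the llama.cpp module is imported):

// Sketch: tokenization through the vocab handle obtained from llama_model_get_vocab.
func tokenizeSketch(vocab: OpaquePointer, text: String, addBOS: Bool, addEOS: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
    let capacity = utf8Count + (addBOS ? 1 : 0)
    let buffer = UnsafeMutablePointer<llama_token>.allocate(capacity: capacity)
    defer { buffer.deallocate() }
    let count = llama_tokenize(vocab, text, Int32(utf8Count), buffer, Int32(capacity), addBOS, false)
    // A negative count signals failure; fall back to a lone BOS token as the code above does.
    var tokens: [llama_token] = count < 0 ? [llama_vocab_bos(vocab)] : (0..<count).map({ buffer[Int($0)] })
    if addEOS {
        tokens.append(llama_vocab_eos(vocab))
    }
    return tokens
}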
@@ -9,19 +9,20 @@ package typealias llama_seq_id = Int32
package struct llama_context_params {
package var seed: Int
package var n_ctx: Int
package var n_threads: UInt32
package var n_threads_batch: UInt32
package var n_threads: Int32
package var n_threads_batch: Int32
package var n_batch: Int
}
package func llama_context_default_params() -> llama_context_params { unimplemented() }

package typealias llama_context = OpaquePointer
package func llama_new_context_with_model(_ model: llama_model, _ ctx_params: llama_context_params) -> llama_context? { unimplemented() }
package func llama_init_from_model(_ model: llama_model, _ ctx_params: llama_context_params) -> llama_context? { unimplemented() }
package func llama_free(_ context: llama_context) {}

package typealias llama_model = OpaquePointer
package typealias llama_vocab = OpaquePointer

package func llama_free_model(_ model: llama_model) {}
package func llama_model_free(_ model: llama_model) {}

package func llama_backend_init() {}
package func llama_backend_free() {}
@@ -31,7 +32,9 @@ package struct llama_model_params {
}
package func llama_model_default_params() -> llama_model_params { unimplemented() }

package func llama_load_model_from_file(_ path: String, _ model_params: llama_model_params) -> llama_model? { unimplemented() }
package func llama_model_get_vocab(_ model: llama_model) -> llama_vocab? { unimplemented() }

package func llama_model_load_from_file(_ path: String, _ model_params: llama_model_params) -> llama_model? { unimplemented() }

package func llama_kv_cache_seq_rm(_ ctx: llama_context, _ seq_id: llama_seq_id, _ p0: llama_pos, _ p1: llama_pos) {}
package func llama_kv_cache_seq_pos_max(_ ctx: llama_context, _ seq_id: llama_seq_id) -> Int { unimplemented() }
@@ -48,12 +51,12 @@ package struct llama_batch {
package func llama_batch_init(_ n_tokens: Int, _ embd: Int, _ n_seq_max: Int) -> llama_batch { unimplemented() }

package func llama_n_ctx(_ ctx: llama_context) -> Int { unimplemented() }
package func llama_n_vocab(_ model: llama_model) -> Int { unimplemented() }
package func llama_vocab_n_tokens(_ vocab: llama_vocab) -> Int { unimplemented() }

package func llama_tokenize(_ model: llama_model, _ text: String, _ text_len: Int32, _ tokens: UnsafeMutablePointer<llama_token>, _ n_tokens_max: Int32, _ add_special: Bool, _ parse_special: Bool) -> Int { unimplemented() }
package func llama_token_bos(_ model: llama_model) -> llama_token { unimplemented() }
package func llama_token_eos(_ model: llama_model) -> llama_token { unimplemented() }
package func llama_token_to_piece(_ model: llama_model, _ token: llama_token, _ buf: UnsafeMutablePointer<Int8>, _ length: Int32, _ special: Bool) -> Int32 { unimplemented() }
package func llama_vocab_eos(_ vocab: llama_vocab) -> llama_token { unimplemented() }
package func llama_vocab_bos(_ vocab: llama_vocab) -> llama_token { unimplemented() }
package func llama_token_to_piece(_ vocab: llama_vocab, _ token: llama_token, _ buf: UnsafeMutablePointer<Int8>, _ length: Int32, _ lstrip: Int32, _ special: Bool) -> Int32 { unimplemented() }

package func llama_decode(_ ctx: llama_context, _ batch: llama_batch) -> Int { unimplemented() }
package func llama_get_logits(_ ctx: llama_context) -> UnsafeMutablePointer<Float>? { unimplemented() }
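The llama-mock target mirrors the renamed C entry points so that LLAMA_MOCK=1 builds keep compiling without the real library; every stub defers to an unimplemented() helper. That helper is not shown in this diff; a typical shape for it (an assumption, not the project's actual code) would be:

// Hypothetical helper assumed by the mock declarations above.
// Returning Never lets a single call satisfy any declared return type;
// invoking any mocked API at runtime traps with a clear message.
func unimplemented(function: StaticString = #function) -> Never {
    fatalError("llama-mock: \(function) is not implemented")
}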
@@ -1,4 +1,3 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once

#include "ggml.h"
@@ -8,8 +7,8 @@ extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;

// Tensor allocator
struct ggml_tallocr {
@@ -25,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
@@ -74,4 +73,4 @@ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml

#ifdef __cplusplus
}
#endif
#endif
@@ -1,9 +1,22 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once

#include "ggml.h"
#include "ggml-alloc.h"

#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define GGML_BACKEND_API __declspec(dllexport) extern
# else
# define GGML_BACKEND_API __declspec(dllimport) extern
# endif
# else
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
# endif
#else
# define GGML_BACKEND_API extern
#endif

#ifdef __cplusplus
extern "C" {
#endif
@@ -13,42 +26,52 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;

//
// Backend buffer type
//

GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);

//
// Backend buffer
//

// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};

GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

//
// Backend
// Backend (stream)
//

GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@@ -63,8 +86,10 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -74,11 +99,11 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
@@ -86,51 +111,132 @@ extern "C" {
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

//
// CPU backend
// Events
//

GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
//
// Backend device
//

// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
// functionality supported by the device
struct ggml_backend_dev_caps {
// asynchronous operations
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};

#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
// all the device properties
struct ggml_backend_dev_props {
const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};

GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);

GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

//
// Backend (reg)
//

GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);

// Common functions that may be obtained using ggml_backend_reg_get_proc_address

// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);

//
// Backend registry
//

// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);

GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);

// Device enumeration
GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);

// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);

// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);

//
// Backend scheduler
//

// The backend scheduler allows for multiple backends to be used together
// The backend scheduler allows for multiple backend devices to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
@@ -154,20 +260,26 @@ extern "C" {
ggml_backend_sched_reserve(sched, reserve_graph);

// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}

// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
ggml_backend_sched_graph_compute(sched, graph); // execute the graph

// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via ggml_backend_alloc_ctx_tensors
}
*/

struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;

// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
@@ -176,12 +288,15 @@ extern "C" {
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);

// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
@@ -193,12 +308,14 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

// Reset all assignments and allocators - must be called before changing the node backends
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
@@ -219,16 +336,19 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

#ifdef __cplusplus
}
#endif
#endif
Sources/llama.cpp/ggml-cpp.h (new file, 39 lines)
@@ -0,0 +1,39 @@
#pragma once

#ifndef __cplusplus
#error "This header is for C++ only"
#endif

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <memory>

// Smart pointers for ggml types

// ggml

struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };

typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;

// ggml-alloc

struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };

typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;

// ggml-backend

struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };

typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
Sources/llama.cpp/ggml-cpu.h (new file, 137 lines)
@@ -0,0 +1,137 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

int n_threads;
struct ggml_threadpool * threadpool;

// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};

// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};

GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);

GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);

GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

//
// system info
//

// x86
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_sme (void);
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);

struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
};

GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

GGML_BACKEND_API void ggml_cpu_init(void);

//
// CPU backend
//

GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

#ifdef __cplusplus
}
#endif
Sources/llama.cpp/ggml-metal.h (new file, 66 lines)
@@ -0,0 +1,66 @@
// Note: this description is outdated
//
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
//
// How it works?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stddef.h>
#include <stdbool.h>

struct ggml_tensor;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

//
// backend API
// user-code should use only these functions
//

GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");

GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);

#ifdef __cplusplus
}
#endif
File diff suppressed because it is too large.