Mirror of https://github.com/mii443/AzooKeyKanaKanjiConverter.git, synced 2025-08-22 15:05:26 +00:00
Merge pull request #124 from fkunn1326/zenzai-cuda
Separate llama.cpp on Windows
12 .github/workflows/swift.yml vendored
@@ -62,6 +62,18 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Clone llama.cpp
        run: git clone -b ku-nlp/gpt2-japanese-char https://github.com/ensan-hcl/llama.cpp.git
      - name: Build llama.cpp
        run: |
          cmake -B build -DBUILD_SHARED_LIBS=ON
          cmake --build build --config Release
        working-directory: ./llama.cpp
      - name: Copy built files
        run: |
          cp ./build/bin/Release/llama.dll ../
          cp ./build/Release/llama.lib ../
        working-directory: ./llama.cpp
      - name: Build
        run: swift build -Xswiftc -strict-concurrency=complete -v
      - name: Run tests
35 Docs/about_windows_support.md Normal file
@@ -0,0 +1,35 @@
# About Windows support

To use AzooKeyKanaKanjiConverter on Windows, you need to build `llama.cpp` yourself and prepare `llama.lib` and `llama.dll`.

## Background

Making `llama.cpp` CUDA-capable requires compiling `.cu` files, which the `clang` that Swift uses internally does not support[^1], so [cxx-interop](https://www.swift.org/documentation/cxx-interop/) cannot be used for this. On Windows we therefore chose to depend on an externally built DLL in order to support CUDA.

## Build steps

To run AzooKeyKanaKanjiConverter on Windows, build `llama.cpp` with the following steps.

```cmd
git clone -b ku-nlp/gpt2-japanese-char https://github.com/ensan-hcl/llama.cpp.git
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=ON
cmake --build build --config Release
```

> [!TIP]
> To build with CUDA support, add the `-DLLAMA_CUDA=ON` option when configuring.

The required files are produced at the following paths:
```
build/bin/Release/llama.dll
build/Release/llama.lib
```

## Placement

When developing against AzooKeyKanaKanjiConverter, `llama.lib` is needed at build time, so place it in the project's root directory (the folder that contains `Package.swift`).

`llama.dll` is needed at run time, so it must be placed somewhere covered by the [DLL search order](https://learn.microsoft.com/ja-jp/windows/win32/dlls/dynamic-link-library-search-order#standard-search-order-for-unpackaged-apps). Placing it in the same directory as the built executable is usually the most appropriate choice.
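As a minimal sketch (not part of this PR), the hypothetical helper below uses the WinSDK overlay to check that `llama.dll` actually resolves through the DLL search order described above; the function name is illustrative, and `LoadLibraryA`/`FreeLibrary` are ordinary Win32 calls.

```swift
#if os(Windows)
import WinSDK

/// Hypothetical helper: returns true when `llama.dll` can be located via the
/// standard DLL search order (executable directory, System32, PATH, ...).
func canResolveLlamaDLL() -> Bool {
    guard let handle = LoadLibraryA("llama.dll") else { return false }
    _ = FreeLibrary(handle)  // release the handle acquired by the probe
    return true
}
#endif
```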
[^1]: https://llvm.org/docs/CompileCudaWithLLVM.html#id3
176 Package.swift
@@ -14,6 +14,107 @@ let swiftSettings: [SwiftSetting] = [
    .enableUpcomingFeature("ImportObjcForwardDeclarations")
]

var dependencies: [Package.Dependency] = [
    // Dependencies declare other packages that this package depends on.
    // .package(url: /* package url */, from: "1.0.0"),
    .package(url: "https://github.com/apple/swift-algorithms", from: "1.0.0"),
    .package(url: "https://github.com/apple/swift-collections", from: "1.0.0"),
    .package(url: "https://github.com/apple/swift-argument-parser", .upToNextMajor(from: "1.0.0"))
]

var targets: [Target] = [
    // Targets are the basic building blocks of a package. A target can define a module or a test suite.
    // Targets can depend on other targets in this package, and on products in packages this package depends on.
    .target(
        name: "SwiftUtils",
        dependencies: [
            .product(name: "Algorithms", package: "swift-algorithms")
        ],
        resources: [],
        swiftSettings: swiftSettings
    ),
    .target(
        name: "KanaKanjiConverterModuleWithDefaultDictionary",
        dependencies: [
            "KanaKanjiConverterModule"
        ],
        exclude: [
            "azooKey_dictionary_storage/README.md",
            "azooKey_dictionary_storage/LICENSE",
        ],
        resources: [
            .copy("azooKey_dictionary_storage/Dictionary"),
        ],
        swiftSettings: swiftSettings
    ),
    .executableTarget(
        name: "CliTool",
        dependencies: [
            "KanaKanjiConverterModuleWithDefaultDictionary",
            .product(name: "ArgumentParser", package: "swift-argument-parser"),
        ]
    ),
    .testTarget(
        name: "SwiftUtilsTests",
        dependencies: ["SwiftUtils"],
        resources: [],
        swiftSettings: swiftSettings
    ),
    .testTarget(
        name: "KanaKanjiConverterModuleTests",
        dependencies: ["KanaKanjiConverterModule"],
        resources: [
            .copy("DictionaryMock")
        ],
        swiftSettings: swiftSettings
    ),
    .testTarget(
        name: "KanaKanjiConverterModuleWithDefaultDictionaryTests",
        dependencies: [
            "KanaKanjiConverterModuleWithDefaultDictionary",
            .product(name: "Collections", package: "swift-collections")
        ],
        swiftSettings: swiftSettings
    )
]

#if !(os(Windows))
dependencies.append(
    .package(url: "https://github.com/ensan-hcl/llama.cpp", branch: "6b862f4")
)
#endif

#if os(Windows)
targets.append(contentsOf: [
    .systemLibrary(
        name: "llama.cpp"
    ),
    .target(
        name: "KanaKanjiConverterModule",
        dependencies: [
            "SwiftUtils",
            "llama.cpp",
            .product(name: "Collections", package: "swift-collections")
        ],
        swiftSettings: swiftSettings
    )
])
#else
targets.append(contentsOf: [
    .target(
        name: "KanaKanjiConverterModule",
        dependencies: [
            "SwiftUtils",
            .product(name: "llama", package: "llama.cpp"),
            .product(name: "Collections", package: "swift-collections")
        ],
        swiftSettings: swiftSettings
    )
])
#endif

let package = Package(
    name: "AzooKeyKanakanjiConverter",
    platforms: [.iOS(.v14), .macOS(.v12)],
@@ -34,77 +135,6 @@ let package = Package(
            targets: ["KanaKanjiConverterModule"]
        ),
    ],
    dependencies: [
        // Dependencies declare other packages that this package depends on.
        // .package(url: /* package url */, from: "1.0.0"),
        .package(url: "https://github.com/apple/swift-algorithms", from: "1.0.0"),
        .package(url: "https://github.com/apple/swift-collections", from: "1.0.0"),
        .package(url: "https://github.com/apple/swift-argument-parser", .upToNextMajor(from: "1.0.0")),
        // local package
        .package(url: "https://github.com/ensan-hcl/llama.cpp", branch: "b5cc30f"),
    ],
    targets: [
        // Targets are the basic building blocks of a package. A target can define a module or a test suite.
        // Targets can depend on other targets in this package, and on products in packages this package depends on.
        .target(
            name: "SwiftUtils",
            dependencies: [
                .product(name: "Algorithms", package: "swift-algorithms")
            ],
            resources: [],
            swiftSettings: swiftSettings
        ),
        .target(
            name: "KanaKanjiConverterModule",
            dependencies: [
                "SwiftUtils",
                .product(name: "llama", package: "llama.cpp"),
                .product(name: "Collections", package: "swift-collections")
            ],
            swiftSettings: swiftSettings
        ),
        .target(
            name: "KanaKanjiConverterModuleWithDefaultDictionary",
            dependencies: [
                "KanaKanjiConverterModule"
            ],
            exclude: [
                "azooKey_dictionary_storage/README.md",
                "azooKey_dictionary_storage/LICENSE",
            ],
            resources: [
                .copy("azooKey_dictionary_storage/Dictionary"),
            ],
            swiftSettings: swiftSettings
        ),
        .executableTarget(
            name: "CliTool",
            dependencies: [
                "KanaKanjiConverterModuleWithDefaultDictionary",
                .product(name: "ArgumentParser", package: "swift-argument-parser"),
            ]
        ),
        .testTarget(
            name: "SwiftUtilsTests",
            dependencies: ["SwiftUtils"],
            resources: [],
            swiftSettings: swiftSettings
        ),
        .testTarget(
            name: "KanaKanjiConverterModuleTests",
            dependencies: ["KanaKanjiConverterModule"],
            resources: [
                .copy("DictionaryMock")
            ],
            swiftSettings: swiftSettings
        ),
        .testTarget(
            name: "KanaKanjiConverterModuleWithDefaultDictionaryTests",
            dependencies: [
                "KanaKanjiConverterModuleWithDefaultDictionary",
                .product(name: "Collections", package: "swift-collections")
            ],
            swiftSettings: swiftSettings
        )
    ]
    dependencies: dependencies,
    targets: targets
)
77 Sources/llama.cpp/ggml-alloc.h Normal file
@@ -0,0 +1,77 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;

// Tensor allocator
struct ggml_tallocr {
    ggml_backend_buffer_t buffer;
    void * base;
    size_t alignment;
    size_t offset;
};

GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

// Graph allocator
/*
  Example usage:
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());

    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // allocate the graph
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    // evaluate the graph
    ggml_backend_graph_compute(backend, graph);
*/

// special tensor flags for use with the graph allocator:
//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
//   ggml_set_output(): output tensors are never freed and never overwritten

typedef struct ggml_gallocr * ggml_gallocr_t;

GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);

// pre-allocate buffers from a measure graph - does not allocate or modify the graph
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
    ggml_gallocr_t galloc,
    struct ggml_cgraph * graph,
    const int * node_buffer_ids,
    const int * leaf_buffer_ids);

// automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

#ifdef __cplusplus
}
#endif
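The graph-allocator comment above sketches the intended call sequence in C. As a hedged Swift counterpart (not part of this PR), the same sequence could be driven through the `llama` system-library module added later in this change; `ggml_backend_cpu_buffer_type()` is declared in `ggml-backend.h` below, and the helper name is illustrative.

```swift
import llama

/// Illustrative only: allocate the compute buffers for an already-built ggml
/// graph using the CPU buffer type, then release the allocator again.
func allocateGraphOnCPU(_ graph: UnsafeMutablePointer<ggml_cgraph>?) {
    let galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type())
    _ = ggml_gallocr_alloc_graph(galloc, graph)  // returns false on failure
    ggml_gallocr_free(galloc)
}
```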
234 Sources/llama.cpp/ggml-backend.h Normal file
@@ -0,0 +1,234 @@
// header file from https://github.com/ensan-hcl/llama.cpp/tree/ku-nlp/gpt2-japanese-char
#pragma once

#include "ggml.h"
#include "ggml-alloc.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;

//
// Backend buffer
//

// buffer type
GGML_API const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft);

// buffer
enum ggml_backend_buffer_usage {
    GGML_BACKEND_BUFFER_USAGE_ANY = 0,
    GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};

GGML_API const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free(ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer);

//
// Backend
//

GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);

GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);

GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

GGML_API GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

// events
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); // wait async on event

//
// CPU backend
//

GGML_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_API GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

#ifdef GGML_USE_CPU_HBM
    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

//
// Backend registry
//

// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way

GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);

//
// Backend scheduler
//

// The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
// - the location of the pre-allocated tensors (e.g. the weights)
/*
  Example usage:

    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
    // preferrably to run on the same backend as the buffer
    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);

    // initialize buffers from a max size graph (optional)
    reserve_graph = build_graph(sched, max_batch_size);

    // manually assign nodes to a backend (optional, should not be needed in most cases)
    struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
    ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);

    ggml_backend_sched_reserve(sched, reserve_graph);

    // compute
    graph = build_graph(sched);
    ggml_backend_sched_graph_compute(sched, graph);

    // if there are graph inputs:
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, graph);
    ggml_backend_tensor_set(input_tensor, ...);
    ggml_backend_sched_graph_compute(sched, graph);
}
*/

struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;

// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

//
// Utils
//

struct ggml_backend_graph_copy {
    ggml_backend_buffer_t buffer;
    struct ggml_context * ctx_allocated;
    struct ggml_context * ctx_unallocated;
    struct ggml_cgraph * graph;
};

// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

#ifdef __cplusplus
}
#endif
2434 Sources/llama.cpp/ggml.h Normal file
File diff suppressed because it is too large
1151 Sources/llama.cpp/llama.h Normal file
File diff suppressed because it is too large
10 Sources/llama.cpp/module.modulemap Normal file
@@ -0,0 +1,10 @@
module llama [system] {
    header "llama.h"
    header "ggml.h"
    header "ggml-alloc.h"
    header "ggml-backend.h"

    link "llama"

    export *
}
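As a minimal sketch (not part of this change), the `llama` system-library module declared above could be exercised from Swift on Windows roughly as follows, assuming `llama.lib` sits next to `Package.swift` and `llama.dll` is discoverable on the DLL search path; `ggml_backend_cpu_init` and `ggml_backend_free` come from the `ggml-backend.h` bundled in this PR, and the function name is illustrative.

```swift
import llama  // the [system] module declared in module.modulemap above

/// Illustrative smoke test: both calls are declared in ggml-backend.h,
/// which this module re-exports alongside llama.h.
func llamaModuleSmokeTest() {
    let backend = ggml_backend_cpu_init()
    ggml_backend_free(backend)
}
```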