Commit fc8ef54

lshzh-ww, slaren, and ggerganov authored
metal : enable ggml-alloc (#2627)
* metal: enable ggml-alloc

  Make ggml-alloc work with concurrent dispatch.

* style fix

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent: bf83bff

File tree

5 files changed: 61 additions, 26 deletions

  ggml-alloc.c
  ggml-alloc.h
  ggml-metal.h
  ggml-metal.m
  llama.cpp

‎ggml-alloc.c

24 additions, 1 deletion
@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             allocate_node(alloc, input);
         }
     }
-    for (int i = 0; i < gf->n_nodes; i++) {
+    for (int ind = 0; ind < gf->n_nodes; ind++) {
+        int i;
+        if (alloc->has_parse_seq) {
+            i = alloc->parse_seq[ind];
+        } else {
+            i = ind;
+        }
         struct ggml_tensor * node = gf->nodes[i];
 
         // allocate parents (leafs)
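The allocator changes are easiest to read as two pieces: ggml_allocr_set_parse_seq compacts the caller's list by dropping -1 entries, and the tensor-allocation walk then visits graph nodes in that compacted order instead of 0..n_nodes-1. Below is a small self-contained illustration of just that behavior; it is not part of the commit, and the list values are made up.

#include <stdio.h>

#define MAX_NODES 16

int main(void) {
    // hypothetical concurrency list; the -1 entries (apparently batch
    // separators in the Metal backend's output) are skipped, as in the diff
    int list[] = { 0, 2, -1, 1, 3, -1 };
    int n = (int) (sizeof(list) / sizeof(list[0]));

    // same compaction as ggml_allocr_set_parse_seq
    int parse_seq[MAX_NODES];
    int pos = 0;
    for (int i = 0; i < n; i++) {
        if (list[i] != -1) {
            parse_seq[pos++] = list[i];
        }
    }

    // the allocator then walks nodes in this order: 0, 2, 1, 3
    for (int ind = 0; ind < pos; ind++) {
        printf("allocate tensors for node %d\n", parse_seq[ind]);
    }
    return 0;
}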

‎ggml-alloc.h

4 additions, 0 deletions
@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
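A hedged usage sketch of the new entry point (not from the commit; the order array, the helper name, and the alignment constant are assumptions): the execution order is handed to a measuring allocator before the graph is measured, so the buffer-reuse plan matches the order in which nodes will actually run.

#include "ggml.h"
#include "ggml-alloc.h"

static const size_t tensor_alignment = 32;  // assumption: the value llama.cpp uses

size_t measure_with_order(struct ggml_cgraph * gf) {
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);

    // hypothetical out-of-order execution sequence; -1 entries are ignored
    int order[] = { 0, 2, 1, -1, 3, -1 };
    ggml_allocr_set_parse_seq(alloc, order, (int) (sizeof(order) / sizeof(order[0])));

    // worst-case compute buffer size for this graph under this order
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;

    ggml_allocr_free(alloc);
    return alloc_size;
}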

‎ggml-metal.h

6 additions, 3 deletions
@@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 
 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
 
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
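Taken together, the revised header lets a caller feed the Metal execution order straight into ggml-alloc, which is what llama.cpp does later in this commit. A minimal sketch, assuming ctx_metal, gf, and alloc are initialized elsewhere (the wrapper function itself is hypothetical):

#include <stdbool.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-metal.h"

void wire_concurrency_into_alloc(struct ggml_metal_context * ctx_metal,
                                 struct ggml_cgraph        * gf,
                                 struct ggml_allocr        * alloc) {
    // check_mem = false: skip the overlap check, as llama.cpp does before
    // the allocator has placed any tensors
    ggml_metal_graph_find_concurrency(ctx_metal, gf, false);

    // a non-zero return means the graph was optimized; the value is the
    // length of the concur_list
    int n = ggml_metal_if_optimized(ctx_metal);
    if (n > 0) {
        ggml_allocr_set_parse_seq(alloc, ggml_metal_get_concur_list(ctx_metal), n);
    }
}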

‎ggml-metal.m

8 additions, 7 deletions
@@ -236,11 +236,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
     ctx->n_cb = n_cb;
 }
 
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
 }
 
 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -383,7 +384,7 @@ void ggml_metal_get_tensor(
 
 void ggml_metal_graph_find_concurrency(
         struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf) {
+        struct ggml_cgraph * gf, bool check_mem) {
    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
    int nodes_unused[GGML_MAX_CONCUR];
 
@@ -430,7 +431,7 @@ void ggml_metal_graph_find_concurrency(
                }
            }
        }
-        if (exe_flag) {
+        if (exe_flag && check_mem) {
            // check if nodes[i]'s data will be overwritten by a node before nodes[i].
            // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
            int64_t data_start = (int64_t) gf->nodes[i]->data;
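The new check_mem flag only gates the pre-existing overlap check. The sketch below shows the two call patterns it permits; reading this as "skip the check while tensor addresses are not final, enable it once they are" is an assumption based on the check dereferencing gf->nodes[i]->data, not something the commit states.

#include <stdbool.h>
#include "ggml.h"
#include "ggml-metal.h"

void example_check_mem_usage(struct ggml_metal_context * ctx, struct ggml_cgraph * gf) {
    // before allocation: data pointers are not meaningful yet, skip the check
    ggml_metal_graph_find_concurrency(ctx, gf, false);

    // ... allocate / place the graph's tensors ...

    // after allocation: the overlap check can be enabled again
    ggml_metal_graph_find_concurrency(ctx, gf, true);
}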

‎llama.cpp

19 additions, 15 deletions
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -1846,10 +1846,6 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -3287,7 +3283,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
@@ -3305,6 +3312,11 @@ struct llama_context * llama_new_context_with_model(
 
             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3319,13 +3331,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
-
-        if (!ctx->ctx_metal) {
-            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
-            llama_free(ctx);
-            return NULL;
-        }
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -3354,8 +3359,7 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
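For readability, the llama.cpp hunks condense into one flow: initialize Metal before measuring, compute the concurrency order with check_mem disabled, apply it to the measuring allocator, then rebuild the real allocator over the compute buffer and apply the order again. The helper below is a sketch paraphrasing the diff, not llama.cpp's code: the function name is hypothetical, error handling and the llama_context members are simplified, and registering the same buffer with Metal under the name "alloc" (as the last hunk does) is only noted in a comment.

#include <stdbool.h>
#include <stddef.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-metal.h"

static const size_t tensor_alignment = 32;  // assumption: the value llama.cpp uses

// Measure gf with the Metal execution order applied, then build the real
// allocator over a caller-provided buffer and re-apply the same order.
// Returns NULL if the buffer is too small.
struct ggml_allocr * setup_alloc_for_metal(struct ggml_metal_context * ctx_metal,
                                           struct ggml_cgraph        * gf,
                                           void * buf, size_t buf_size) {
    // 1. measuring pass: the order has to be known before measuring, so the
    //    measured size matches what concurrent execution will actually need
    struct ggml_allocr * measure = ggml_allocr_new_measure(tensor_alignment);
    ggml_metal_graph_find_concurrency(ctx_metal, gf, false);
    ggml_allocr_set_parse_seq(measure,
                              ggml_metal_get_concur_list(ctx_metal),
                              ggml_metal_if_optimized(ctx_metal));
    size_t needed = ggml_allocr_alloc_graph(measure, gf) + tensor_alignment;
    ggml_allocr_free(measure);

    if (buf_size < needed) {
        return NULL;
    }

    // 2. real allocator over the compute buffer; the parse order does not
    //    carry over from the measuring allocator, so it is set again
    struct ggml_allocr * alloc = ggml_allocr_new(buf, buf_size, tensor_alignment);
    ggml_allocr_set_parse_seq(alloc,
                              ggml_metal_get_concur_list(ctx_metal),
                              ggml_metal_if_optimized(ctx_metal));

    // llama.cpp also registers this buffer with Metal ("alloc" in the last
    // hunk) so the GPU can address the tensors the allocator places in it
    return alloc;
}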
