Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 95ea26f

Browse filesBrowse files
benchmark : add tool for timing q4_0 matrix multiplication (#653)
* Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 82d146d commit 95ea26f
Copy full SHA for 95ea26f

File tree

Expand file treeCollapse file tree

2 files changed

+276
-1
lines changed
Filter options
Expand file treeCollapse file tree

2 files changed

+276
-1
lines changed

‎Makefile

Copy file name to clipboardExpand all lines: Makefile
+6-1Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ common.o: examples/common.cpp examples/common.h
149149
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
150150

151151
clean:
152-
rm -vf *.o main quantize quantize-stats perplexity embedding
152+
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
153153

154154
main: examples/main/main.cpp ggml.o llama.o common.o
155155
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
171171

172172
libllama.so: llama.o ggml.o
173173
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
174+
174175
#
175176
# Tests
176177
#
177178

179+
# Builds the q4_0 matrix multiplication benchmark and runs it immediately.
# The target does not create a file called "benchmark", so it is declared
# phony: otherwise a file or directory with that name in the repository root
# could make `make benchmark` report "up to date" instead of running.
.PHONY: benchmark
benchmark: ggml.o
	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
	./benchmark-q4_0-matmult
182+
178183
.PHONY: tests
179184
tests:
180185
bash ./tests/run-tests.sh
+270Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
/*
2+
License: MIT License
3+
4+
Changelog:
5+
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
6+
7+
*/
8+
9+
#include <locale.h>
10+
#include "ggml.h"
11+
#include <assert.h>
12+
#include <math.h>
13+
#include <cstring>
14+
#include <cstdio>
15+
#include <cinttypes>
16+
#include <unordered_map>
17+
#include <queue>
18+
#include <string.h>
19+
#include <cassert>
20+
#include <fstream>
21+
#include <string>
22+
#include <iterator>
23+
#include <algorithm>
24+
25+
float tensor_sum_elements(struct ggml_tensor * tensor) {
26+
float sum = 0;
27+
if (tensor->type==6) {
28+
for (int j = 0; j < tensor->ne[1]; j++) {
29+
for (int k = 0; k < tensor->ne[0]; k++) {
30+
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
31+
}
32+
}
33+
}
34+
return sum;
35+
}
36+
37+
38+
/*
39+
The following types have no name of their own and are reported as "UNKNOWN" by TENSOR_TYPE_AS_STR:
40+
GGML_TYPE_I8,
41+
GGML_TYPE_I16,
42+
GGML_TYPE_I32,
43+
GGML_TYPE_COUNT,
44+
*/
45+
46+
// Maps a ggml type value to a short printable name ("FP32", "FP16", "Q4_0",
// "Q4_1"); every other value yields "UNKNOWN".
// The argument and the whole expansion are parenthesized so the macro can be
// used with arbitrary argument expressions and inside larger expressions.
#define TENSOR_TYPE_AS_STR(TYPE) ((TYPE) == GGML_TYPE_F32 ? "FP32" : (TYPE) == GGML_TYPE_F16 ? "FP16" : (TYPE) == GGML_TYPE_Q4_0 ? "Q4_0" : (TYPE) == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN")
47+
48+
// Prints one line describing TENSOR to stdout: the expression text, its type
// (number and name), the first three entries of ne and nb, and the element
// sum as computed by tensor_sum_elements().
//
// - Wrapped in do { } while (0) so that it expands to exactly one statement
//   and is safe after an unbraced if/else; callers terminate it with ';'.
// - ne is cast to long long and nb to size_t so the format specifiers match
//   regardless of the exact integer types used by the tensor struct.
// - TENSOR is evaluated several times; pass an expression without side effects.
#define TENSOR_DUMP(TENSOR) do { \
    printf("%15s: type = %i (%5s) ne = %5lld x %5lld x %5lld, nb = (%5zu, %5zu, %5zu) - ", #TENSOR, \
        (int) (TENSOR)->type, TENSOR_TYPE_AS_STR((TENSOR)->type), \
        (long long) (TENSOR)->ne[0], (long long) (TENSOR)->ne[1], (long long) (TENSOR)->ne[2], \
        (size_t) (TENSOR)->nb[0], (size_t) (TENSOR)->nb[1], (size_t) (TENSOR)->nb[2]); \
    { float tensor_dump_sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, tensor_dump_sum); } \
} while (0)
52+
53+
// Settings of a benchmark run; the defaults below are used when the
// corresponding command line option is not given.
struct benchmark_params_struct {
    int32_t n_threads{1};      // set by -t / --threads
    int32_t n_iterations{10};  // set by -i / --iter
};
57+
58+
// Writes the command line help text to stderr.
//   argv   - argv[0] is printed as the program name
//   params - supplies the default values shown for --threads and --iter
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
    const char * const program_name = argv[0];

    fprintf(stderr, "usage: %s [options]\n", program_name);
    fputs("\n", stderr);
    fputs("options:\n", stderr);
    fputs(" -h, --help show this help message and exit\n", stderr);
    fprintf(stderr,
            " -t N, --threads N number of threads to use during computation (default: %d)\n",
            params.n_threads);
    fprintf(stderr,
            " -i N, --iter N number of iterations to use during computation (default: %d)\n",
            params.n_iterations);
    fputs("\n", stderr);
}
67+
68+
int main(int argc, char ** argv) {
69+
70+
71+
struct benchmark_params_struct benchmark_params;
72+
73+
bool invalid_param = false;
74+
std::string arg;
75+
for (int i = 1; i < argc; i++) {
76+
arg = argv[i];
77+
78+
if (arg == "-t" || arg == "--threads") {
79+
if (++i >= argc) {
80+
invalid_param = true;
81+
break;
82+
}
83+
benchmark_params.n_threads = std::stoi(argv[i]);
84+
} else if (arg == "-i" || arg == "--iter") {
85+
if (++i >= argc) {
86+
invalid_param = true;
87+
break;
88+
}
89+
benchmark_params.n_iterations = std::stoi(argv[i]);
90+
} else if (arg == "-h" || arg == "--help") {
91+
print_usage(argc, argv, benchmark_params);
92+
exit(0);
93+
}
94+
if (invalid_param) {
95+
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
96+
print_usage(argc, argv, benchmark_params);
97+
exit(1);
98+
}
99+
}
100+
101+
102+
// create the ggml context
103+
printf("Starting Test\n");
104+
105+
106+
107+
struct ggml_context * ctx;
108+
//const int sizex = 4096;
109+
//const int sizey = 11008;
110+
111+
#undef VERBOSE_DEBUGGING
112+
#ifndef VERBOSE_DEBUGGING
113+
const int sizey = 4096;
114+
const int sizex = 11008;
115+
const int sizez = 128;
116+
#else
117+
/* Working - let's increase size */
118+
const int sizey = 1;
119+
const int sizex = (8*32);
120+
const int sizez = 1;
121+
122+
/*const int sizey = 1;
123+
const int sizex = 3*(8*32);
124+
const int sizez = 1;*/
125+
#endif
126+
127+
//printf("Memsize required = %i\n", sizex*sizex);
128+
ggml_type wtype = GGML_TYPE_F32;
129+
130+
size_t ctx_size = 0;
131+
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
132+
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
133+
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
134+
ctx_size += sizex*sizeof(float);
135+
ctx_size += 1024*1024*100;
136+
137+
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
138+
139+
struct ggml_init_params params = {
140+
/*.mem_size =*/ ctx_size,
141+
/*.mem_buffer =*/ NULL,
142+
/* no_alloc =*/ 0
143+
};
144+
145+
ctx = ggml_init(params);
146+
if (!ctx) {
147+
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
148+
return false;
149+
}
150+
151+
152+
printf("Creating new tensors\n");
153+
// printf("Creating new tensor m1\n");
154+
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
155+
ggml_set_f32(m11, 1.0f);
156+
157+
// printf("Creating new tensor m1\n");
158+
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
159+
ggml_set_f32(m12, 1.5f);
160+
161+
// printf("Creating new tensor m2\n");
162+
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
163+
ggml_set_f32(m2, 2.0f);
164+
165+
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
166+
// printf("Creating new tensor m11xm2\n");
167+
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
168+
169+
// printf("Creating compute graph\n");
170+
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
171+
172+
gf.n_threads=benchmark_params.n_threads;
173+
printf("cgraph->n_threads=%i\n",gf.n_threads);
174+
175+
TENSOR_DUMP(m11);
176+
TENSOR_DUMP(m2);
177+
178+
ggml_graph_compute(ctx, &gf);
179+
180+
TENSOR_DUMP(gf.nodes[0]);
181+
182+
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
183+
184+
int32_t nelements = sizex*sizey;
185+
int32_t ne[2] = { sizex, sizey };
186+
187+
std::vector<int64_t> hist_cur(1 << 4, 0);
188+
189+
// Set up a the benchmark matrices
190+
// printf("Creating new tensor q11 & Running quantize\n");
191+
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
192+
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
193+
194+
// Set up a the compute graph
195+
// printf("Creating new tensor q31\n");
196+
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
197+
198+
// printf("Creating compute graph\n");
199+
struct ggml_cgraph gf31 = ggml_build_forward(q31);
200+
gf31.n_threads=benchmark_params.n_threads;
201+
202+
// Set up a second graph computation to make sure we override the CPU cache lines
203+
// printf("Creating new tensor q12 & Running quantize\n");
204+
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
205+
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
206+
207+
// printf("Creating new tensor q32\n");
208+
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
209+
210+
//printf("Creating compute graph\n");
211+
struct ggml_cgraph gf32 = ggml_build_forward(q32);
212+
gf32.n_threads=benchmark_params.n_threads;
213+
printf("cgraph->n_threads=%i\n",gf31.n_threads);
214+
215+
const int dimx = sizex;
216+
const int dimy = sizey;
217+
const int dimz = sizez;
218+
long long int flops_per_dot_product = dimy + dimy;
219+
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
220+
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
221+
222+
223+
// Let's use the F32 result from above as a reference for the q4_0 multiplication
224+
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
225+
226+
227+
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
228+
printf("==============================================================================================\n");
229+
230+
for (int i=0;i<benchmark_params.n_iterations ;i++) {
231+
232+
long long int start = ggml_time_us();
233+
//printf("Running ggml_graph_compute\n");
234+
ggml_graph_compute(ctx, &gf31);
235+
long long int stop = ggml_time_us();
236+
long long int usec = stop-start;
237+
float sec = usec/1000000;
238+
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
239+
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
240+
i,
241+
gf31.n_threads,
242+
sizex, sizey, sizez, flops_per_matrix,
243+
usec,flops_per_usec);
244+
245+
#ifdef VERBOSE_DEBUGGING
246+
TENSOR_DUMP("res",gf31.nodes[0])
247+
#endif
248+
249+
// Check that the matrix multiplication result is in the right ballpark
250+
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
251+
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
252+
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
253+
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
254+
255+
if (delta > allowed_delta) {
256+
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
257+
sum_of_F32_reference,
258+
sum_of_Q4_result,
259+
delta,
260+
allowed_delta
261+
);
262+
exit(0);
263+
}
264+
265+
// Running a different graph computation to make sure we override the CPU cache lines
266+
ggml_graph_compute(ctx, &gf32);
267+
268+
}
269+
270+
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.