Commit 0545b16 (parent: c5cd3fd)

OPT: Improved memcopy, join & JIT (needs cache cleanup)

- JIT.eval no longer uses a temp buffer and can now write directly into the final buffer.
- OpenCL & CUDA code improved for memcopy, copy, reshape & JIT (previously precompiled cached code is incompatible).
- join now uses memcopy with a vector of input arrays.
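To illustrate the join change above, a hedged host-side sketch (not the commit's actual kernel; join0 and its plain std::vector buffers are hypothetical stand-ins): joining N arrays reduces to one memcopy per input, each writing at a different offset of one shared output buffer.

    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Hypothetical sketch of "join via memcopy with a vector of input
    // arrays": each input is copied into its offset of a single output.
    std::vector<float> join0(const std::vector<std::vector<float>>& inputs) {
        std::size_t total = 0;
        for (const auto& in : inputs) total += in.size();

        std::vector<float> out(total);
        std::size_t offset = 0;
        for (const auto& in : inputs) {
            // On the GPU backends this would be one strided memcopy launch
            // per input; std::memcpy stands in for kernel::memcopy here.
            std::memcpy(out.data() + offset, in.data(),
                        in.size() * sizeof(float));
            offset += in.size();
        }
        return out;
    }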

21 files changed: +1393 -905 lines

src/backend/common/dispatch.hpp
72 lines changed: 72 additions & 0 deletions
@@ -9,6 +9,8 @@
 
 #pragma once
 
+#include <af/defines.h>
+#include <algorithm>
 #include <cmath>
 
 #define divup(a, b) (((a) + (b)-1) / (b))
@@ -42,3 +44,73 @@ inline T greatestPrimeFactor(T n) {
 
     return v;
 }
+
+// For OPENCL, the dimensions of local are returned
+// usage: cl::NDRange local = bestBlockSize<cl::NDRange>(dims, WG)
+// For CUDA, the dimensions of 1 block are returned
+// usage: dim3 block = bestBlockSize<dim3>(dims, 32);
+// The parameter dims can have any type as long as it is convertible to unsigned
+
+// Remark: bestBlockSize is only best for independent element operations,
+// such as: copying, scaling, math on independent elements, ...
+// Since vector dimensions can be returned, it is NOT USABLE FOR BLOCK
+// OPERATIONS, such as: matmul, etc.
+template<typename Tout, typename Tin>
+Tout bestBlockSize(const Tin dims[4], unsigned warp) {
+    const unsigned d0         = static_cast<unsigned>(dims[0]);
+    const unsigned d1         = static_cast<unsigned>(dims[1]);
+    const unsigned d2         = static_cast<unsigned>(dims[2]);
+    const unsigned OCC        = 3;
+    const unsigned elements   = d0 * d1;
+    const unsigned minThreads = warp / 4;  // quarter wave
+    const unsigned maxThreads =
+        std::min(warp * 4, divup(elements * warp, 16384U) * minThreads);
+
+    const unsigned threads0 =
+#ifdef AF_OPENCL
+        (d0 < warp) ? d0 :
+#endif
+        (d1 == 1) ? warp * 4
+        : (maxThreads >= 128) && (!(d0 & (128 - 1)) || (d0 > OCC * (128 - 1)))
+            ? 128
+        : (maxThreads >= 64) && (!(d0 & (64 - 1)) || (d0 > OCC * (64 - 1)))
+            ? 64
+            : warp;
+
+    const unsigned threads1 =
+        (threads0 <= maxThreads / 128) &&
+                (!(d1 & (128 - 1)) || (d1 > OCC * (128 - 1)))
+            ? 128
+        : (threads0 <= maxThreads / 64) &&
+                (!(d1 & (64 - 1)) || (d1 > OCC * (64 - 1)))
+            ? 64
+        : (threads0 <= maxThreads / 32) &&
+                (!(d1 & (32 - 1)) || (d1 > OCC * (32 - 1)))
+            ? 32
+        : (threads0 <= maxThreads / 16) &&
+                (!(d1 & (16 - 1)) || (d1 > OCC * (16 - 1)))
+            ? 16
+        : (threads0 <= maxThreads / 8) &&
+                (!(d1 & (8 - 1)) || (d1 > OCC * (8 - 1)))
+            ? 8
+        : (threads0 <= maxThreads / 4) &&
+                (!(d1 & (4 - 1)) || (d1 > OCC * (4 - 1)))
+            ? 4
+        : (threads0 <= maxThreads / 2) &&
+                (!(d1 & (2 - 1)) || (d1 > OCC * (2 - 1)))
+            ? 2
+            : 1;
+
+    const unsigned threads01 = threads0 * threads1;
+    if (d2 == 1 || threads01 * 2 > maxThreads) return Tout(threads0, threads1);
+
+    const unsigned threads2 =
+        (threads01 <= maxThreads / 64) && !(d2 & (64 - 1))   ? 64
+        : (threads01 <= maxThreads / 32) && !(d2 & (32 - 1)) ? 32
+        : (threads01 <= maxThreads / 16) && !(d2 & (16 - 1)) ? 16
+        : (threads01 <= maxThreads / 8) && !(d2 & (8 - 1))   ? 8
+        : (threads01 <= maxThreads / 4) && !(d2 & (4 - 1))   ? 4
+        : (threads01 <= maxThreads / 2) && !(d2 & (2 - 1))   ? 2
+                                                             : 1;
+    return Tout(threads0, threads1, threads2);
+}
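A hedged usage sketch for the CUDA path (the wrapper function, kernel name, and grid derivation below are illustrative, not part of this commit): the caller takes the block shape from bestBlockSize and derives the grid with divup, matching the usage noted in the header comment.

    // Illustrative CUDA caller, assuming a hypothetical element-wise kernel.
    // dim_t is ArrayFire's dimension type from af/defines.h.
    template<typename T>
    void launchElementwise(T* out, const T* in, const dim_t dims[4]) {
        dim3 block = bestBlockSize<dim3>(dims, 32);  // 32 = warp size
        dim3 grid(divup(static_cast<unsigned>(dims[0]), block.x),
                  divup(static_cast<unsigned>(dims[1]), block.y),
                  divup(static_cast<unsigned>(dims[2]), block.z));
        // elementwiseKernel<<<grid, block>>>(out, in, ...);  // hypothetical
    }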

src/backend/cuda/Array.cpp
2 lines changed: 1 addition & 1 deletion
@@ -265,7 +265,7 @@ kJITHeuristics passesJitHeuristics(Node *root_node) {
     // The size of the parameters without any extra arguments from the
     // JIT tree. This includes one output Param object and 4 integers.
     constexpr size_t base_param_size =
-        sizeof(Param<T>) + (4 * sizeof(uint));
+        sizeof(Param<T>) + 4 * sizeof(int) + 4 * sizeof(char);
 
     // extra padding for safety to avoid failure during compilation
     constexpr size_t jit_padding_size = 256;  //@umar dontfix!
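For context on this arithmetic, a hedged sketch (ParamLike only mimics a plausible Param<T> layout, and 4096 bytes is the classic CUDA limit on __global__ kernel parameters; both are assumptions, not taken from this commit): the heuristic has to keep the accumulated parameter bytes of a fused JIT kernel under the launch limit.

    #include <cstddef>
    #include <cstdio>

    // Hypothetical stand-in for cuda::Param<T>: pointer + dims + strides.
    struct ParamLike { void* ptr; long long dims[4]; long long strides[4]; };

    int main() {
        // Same shape as the commit's expression: one output Param,
        // plus 4 ints and 4 chars of bookkeeping.
        constexpr std::size_t base_param_size =
            sizeof(ParamLike) + 4 * sizeof(int) + 4 * sizeof(char);
        constexpr std::size_t jit_padding_size = 256;   // safety margin, as in the source
        constexpr std::size_t max_param_bytes  = 4096;  // assumed CUDA kernel-param limit

        // Rough count of additional input Params that would still fit:
        constexpr std::size_t fits =
            (max_param_bytes - jit_padding_size - base_param_size) /
            sizeof(ParamLike);
        std::printf("base = %zu bytes, ~%zu extra Params fit\n",
                    base_param_size, fits);
        return 0;
    }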

src/backend/cuda/CMakeLists.txt
2 lines changed: 0 additions & 2 deletions
@@ -187,7 +187,6 @@ set(nvrtc_src
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel/index.cuh
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel/iota.cuh
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel/ireduce.cuh
-  ${CMAKE_CURRENT_SOURCE_DIR}/kernel/join.cuh
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel/lookup.cuh
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel/lu_split.cuh
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel/match_template.cuh
@@ -432,7 +431,6 @@ cuda_add_library(afcuda
   kernel/interp.hpp
   kernel/iota.hpp
   kernel/ireduce.hpp
-  kernel/join.hpp
   kernel/lookup.hpp
   kernel/lu_split.hpp
   kernel/match_template.hpp

src/backend/cuda/copy.cpp
89 lines changed: 38 additions & 51 deletions
@@ -22,87 +22,74 @@ using common::is_complex;
 namespace cuda {
 
 template<typename T>
-void copyData(T *dst, const Array<T> &src) {
-    if (src.elements() == 0) { return; }
-
-    // FIXME: Merge this with copyArray
-    src.eval();
-
-    Array<T> out = src;
-    const T *ptr = NULL;
-
-    if (src.isLinear() ||  // No offsets, No strides
-        src.ndims() == 1   // Simple offset, no strides.
-    ) {
-        // A.get() gets data with offsets
-        ptr = src.get();
-    } else {
-        // FIXME: Think about implementing eval
-        out = copyArray(src);
-        ptr = out.get();
+void copyData(T *data, const Array<T> &src) {
+    if (src.elements() > 0) {
+        Array<T> lin = src.isReady() && src.isLinear() ? src : copyArray(src);
+        // lin is now guaranteed linear
+        auto stream = cuda::getActiveStream();
+        CUDA_CHECK(cudaMemcpyAsync(data, lin.get(), lin.elements() * sizeof(T),
+                                   cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     }
-
-    auto stream = cuda::getActiveStream();
-    CUDA_CHECK(cudaMemcpyAsync(dst, ptr, src.elements() * sizeof(T),
-                               cudaMemcpyDeviceToHost, stream));
-    CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 
 template<typename T>
 Array<T> copyArray(const Array<T> &src) {
     Array<T> out = createEmptyArray<T>(src.dims());
-    if (src.elements() == 0) { return out; }
-
-    if (src.isLinear()) {
-        CUDA_CHECK(
-            cudaMemcpyAsync(out.get(), src.get(), src.elements() * sizeof(T),
-                            cudaMemcpyDeviceToDevice, cuda::getActiveStream()));
-    } else {
+    if (src.isReady()) {
         kernel::memcopy<T>(out, src, src.ndims());
+    } else {
+        Param<T> info(out.get(), src.dims().dims, src.strides().dims);
+        evalNodes(info, src.getNode().get());
     }
     return out;
 }
 
 template<typename T>
-void multiply_inplace(Array<T> &in, double val) {
-    kernel::copy<T, T>(in, in, in.ndims(), scalar<T>(0), val);
+void multiply_inplace(Array<T> &src, double norm) {
+    kernel::copy<T, T>(src, src, src.ndims(), scalar<T>(0), norm);
 }
 
 template<typename inType, typename outType>
 struct copyWrapper {
-    void operator()(Array<outType> &out, Array<inType> const &in) {
-        kernel::copy<inType, outType>(out, in, in.ndims(), scalar<outType>(0),
-                                      1);
+    void operator()(Array<outType> &dst, Array<inType> const &src) {
+        kernel::copy<inType, outType>(dst, src, src.ndims(), scalar<outType>(0),
+                                      1.0);
     }
 };
 
 template<typename T>
 struct copyWrapper<T, T> {
-    void operator()(Array<T> &out, Array<T> const &in) {
-        if (out.isLinear() && in.isLinear() &&
-            out.elements() == in.elements()) {
-            CUDA_CHECK(cudaMemcpyAsync(
-                out.get(), in.get(), in.elements() * sizeof(T),
-                cudaMemcpyDeviceToDevice, cuda::getActiveStream()));
+    void operator()(Array<T> &dst, Array<T> const &src) {
+        if (dst.isLinear() && src.isLinear() &&
+            dst.elements() == src.elements()) {
+            if (src.isReady()) {
+                CUDA_CHECK(cudaMemcpyAsync(
+                    dst.get(), src.get(), src.elements() * sizeof(T),
+                    cudaMemcpyDeviceToDevice, cuda::getActiveStream()));
+            } else {
+                Param<T> info(dst.get(), src.dims().dims, dst.strides().dims);
+                evalNodes(info, src.getNode().get());
+            }
         } else {
-            kernel::copy<T, T>(out, in, in.ndims(), scalar<T>(0), 1);
+            kernel::copy<T, T>(dst, src, src.ndims(), scalar<T>(0), 1.0);
         }
     }
 };
 
 template<typename inType, typename outType>
-void copyArray(Array<outType> &out, Array<inType> const &in) {
+void copyArray(Array<outType> &dst, Array<inType> const &src) {
     static_assert(!(is_complex<inType>::value && !is_complex<outType>::value),
                   "Cannot copy from complex value to a non complex value");
-    ARG_ASSERT(1, (in.ndims() == out.dims().ndims()));
+    ARG_ASSERT(1, (src.ndims() == dst.ndims()));
     copyWrapper<inType, outType> copyFn;
-    copyFn(out, in);
+    copyFn(dst, src);
 }
 
-#define INSTANTIATE(T)                                       \
-    template void copyData<T>(T * dst, const Array<T> &src); \
-    template Array<T> copyArray<T>(const Array<T> &src);     \
-    template void multiply_inplace<T>(Array<T> & in, double norm);
+#define INSTANTIATE(T)                                        \
+    template void copyData<T>(T * data, const Array<T> &src); \
+    template Array<T> copyArray<T>(const Array<T> &src);      \
+    template void multiply_inplace<T>(Array<T> & src, double norm);
 
 INSTANTIATE(float)
 INSTANTIATE(double)
@@ -168,9 +155,9 @@ INSTANTIATE_COPY_ARRAY_COMPLEX(cfloat)
 INSTANTIATE_COPY_ARRAY_COMPLEX(cdouble)
 
 template<typename T>
-T getScalar(const Array<T> &in) {
+T getScalar(const Array<T> &src) {
     T retVal{};
-    CUDA_CHECK(cudaMemcpyAsync(&retVal, in.get(), sizeof(T),
+    CUDA_CHECK(cudaMemcpyAsync(&retVal, src.get(), sizeof(T),
                                cudaMemcpyDeviceToHost,
                                cuda::getActiveStream()));
     CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));
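A hedged end-to-end sketch of the behavior these copy changes enable, using only the public ArrayFire API (whether a given expression actually stays lazy is an internal heuristic, so treat this as illustrative): copying an un-evaluated JIT result no longer forces a separate temporary, because copyArray can hand the destination buffer directly to evalNodes.

    #include <arrayfire.h>
    #include <vector>

    int main() {
        af::array a = af::randu(1024, 1024);
        af::array b = a * 2.0f + 1.0f;  // typically a lazy JIT node

        // host() goes through copyData<T>: after this commit, an un-evaluated
        // linear source is materialized by evalNodes writing straight into
        // the device copy, instead of into a temp buffer copied afterwards.
        std::vector<float> h(b.elements());
        b.host(h.data());
        return 0;
    }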
