Commit e4a495d

Merge pull request #27115 from r-devulap/ldexp
BUG: Use the new npyv_loadable_stride_ functions for ldexp and frexp
2 parents: 7533a4c + bbcedfc
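
In short: the AVX-512 dispatch for the frexp and ldexp loops no longer uses the MAX_STEP_SIZE-based macros IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP and IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP, which are deleted from fast_loop_macros.h below. The SIMD path is now gated on the universal-intrinsics stride predicates npyv_loadable_stride_@suffix@ / npyv_storable_stride_@suffix@ together with is_mem_overlap checks, and avx512.h gains matching 64-bit stride limits (NPY_SIMD_MAXLOAD_STRIDE64 / NPY_SIMD_MAXSTORE_STRIDE64).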

3 files changed: +13 −30 lines changed

numpy/_core/src/common/simd/avx512/avx512.h

2 additions & 0 deletions
@@ -11,6 +11,8 @@
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
+#define NPY_SIMD_MAXLOAD_STRIDE64 (0x7fffffff / 16)
+#define NPY_SIMD_MAXSTORE_STRIDE64 (0x7fffffff / 16)
 
 typedef __m512i npyv_u8;
 typedef __m512i npyv_s8;
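
The new 64-bit limits mirror the 32-bit ones above: they cap the stride at which the AVX-512 32-bit-index gather/scatter instructions can still be used without the index overflowing. As a rough illustration of how such a cap can be checked (a standalone sketch, not the actual npyv_loadable_stride_f64 helper in NumPy's SIMD headers; the helper name and the element-stride convention below are assumptions):

#include <stdio.h>
#include <stdlib.h>

/* Same cap the hunk above adds for 64-bit lanes: keeping |stride| under
 * 0x7fffffff / 16 leaves headroom so a 32-bit gather/scatter index cannot
 * overflow INT_MAX. */
#define MAXLOAD_STRIDE64 (0x7fffffff / 16)

/* Hypothetical predicate (illustrative only): is this signed stride small
 * enough for the 32-bit-index gather/scatter path? */
static int loadable_stride_64(long long stride)
{
    return llabs(stride) <= MAXLOAD_STRIDE64;
}

int main(void)
{
    printf("%d\n", loadable_stride_64(8));             /* 1: ordinary stride */
    printf("%d\n", loadable_stride_64(0x7fffffffLL));  /* 0: would overflow  */
    return 0;
}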

numpy/_core/src/umath/fast_loop_macros.h

0 additions & 28 deletions
@@ -323,34 +323,6 @@ abs_ptrdiff(char *a, char *b)
     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
      ((abs_ptrdiff(args[1], args[0]) == 0))))
 
-/*
- * Avoid using SIMD for very large step sizes for several reasons:
- * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
- *    in which case we need two i64gather instructions and an additional vinsertf32x8
- *    instruction to load a single zmm register (since one i64gather instruction
- *    loads into a ymm register). This is not ideal for performance.
- * 2) Gather and scatter instructions can be slow when the loads/stores
- *    cross page boundaries.
- *
- * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
- * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
- * ensures this. The condition also requires that the input and output arrays
- * should have no overlap in memory.
- */
-#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
-    ((labs(steps[0]) < MAX_STEP_SIZE) && \
-     (labs(steps[1]) < MAX_STEP_SIZE) && \
-     (labs(steps[2]) < MAX_STEP_SIZE) && \
-     (nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
-     (nomemoverlap(args[1], steps[1], args[2], steps[2], dimensions[0])))
-
-#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
-    ((labs(steps[0]) < MAX_STEP_SIZE) && \
-     (labs(steps[1]) < MAX_STEP_SIZE) && \
-     (labs(steps[2]) < MAX_STEP_SIZE) && \
-     (nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
-     (nomemoverlap(args[0], steps[0], args[1], steps[1], dimensions[0])))
-
 /*
  * 1) Output should be contiguous, can handle strided input data
  * 2) Input step should be smaller than MAX_STEP_SIZE for performance
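
The deleted comment block is the rationale the new npyv stride predicates now encapsulate: the 32-bit gather/scatter index must stay below INT_MAX, so each step had to stay under MAX_STEP_SIZE. A back-of-the-envelope check of that bound (a standalone sketch; the 16-lane count and the stride * (lanes - 1) index model are assumptions for illustration, not the exact index computation in the kernels):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the removed MAX_STEP_SIZE bound: INT_MAX / 16. */
#define MAX_STEP_SIZE (2147483647 / 16)

int main(void)
{
    /* With 16 lanes per zmm register, the largest gather offset used is
     * roughly stride * (lanes - 1); under the old check it still fits in a
     * signed 32-bit index.  (Illustrative arithmetic only.) */
    int64_t stride    = MAX_STEP_SIZE - 1;   /* largest stride the check allowed */
    int64_t max_index = stride * 15;         /* highest lane offset in one gather */
    printf("max index %lld fits in int32: %d\n",
           (long long)max_index, max_index <= INT32_MAX);
    return 0;
}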

numpy/_core/src/umath/loops_exponent_log.dispatch.c.src

11 additions & 2 deletions
@@ -1350,12 +1350,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
  * #TYPE = FLOAT, DOUBLE#
  * #c = f, #
  * #C = F, #
+ * #suffix = f32, f64#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_frexp)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
 #ifdef SIMD_AVX512_SKX
-    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+    if ((npyv_loadable_stride_@suffix@(steps[0])) &&
+        (npyv_storable_stride_@suffix@(steps[1])) &&
+        (npyv_storable_stride_@suffix@(steps[2])) &&
+        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
+        (!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) {
         AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
         return;
     }
@@ -1370,7 +1375,11 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_ldexp)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
 #ifdef SIMD_AVX512_SKX
-    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+    if ((npyv_loadable_stride_@suffix@(steps[0])) &&
+        (npyv_storable_stride_@suffix@(steps[1])) &&
+        (npyv_storable_stride_@suffix@(steps[2])) &&
+        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
+        (!is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
         AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
         return;
     }
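
Both call sites follow the same guard shape: enter the AVX-512 kernel only when every stride is acceptable to the universal-intrinsics load/store path and no input aliases an output; otherwise fall through to the scalar loop after the #ifdef block. A minimal standalone sketch of that shape for the frexp-style case (one input, two outputs), with hypothetical stand-in names rather than the generated @TYPE@/@suffix@ code:

#include <stddef.h>

/* Hypothetical stand-ins for the stride predicates and overlap check used
 * above; always permissive here, illustrative only. */
static int loadable_stride(ptrdiff_t s) { (void)s; return 1; }
static int storable_stride(ptrdiff_t s) { (void)s; return 1; }
static int mem_overlap(char *a, ptrdiff_t sa, char *b, ptrdiff_t sb, ptrdiff_t n)
{ (void)a; (void)sa; (void)b; (void)sb; (void)n; return 0; }

static void simd_kernel(char **args, ptrdiff_t n)   { (void)args; (void)n; /* vector path     */ }
static void scalar_kernel(char **args, ptrdiff_t n) { (void)args; (void)n; /* scalar fallback */ }

/* frexp-style dispatch: one strided input (args[0]), two strided outputs
 * (args[1], args[2]); take the SIMD path only when strides are safe and the
 * input does not overlap either output. */
void dispatch_frexp_like(char **args, ptrdiff_t const *dimensions, ptrdiff_t const *steps)
{
    if (loadable_stride(steps[0]) &&
        storable_stride(steps[1]) &&
        storable_stride(steps[2]) &&
        !mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]) &&
        !mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) {
        simd_kernel(args, dimensions[0]);
        return;
    }
    scalar_kernel(args, dimensions[0]);
}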

0 commit comments
