diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index cd66b034cd79..58e91e34ee9c 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -178,3 +178,99 @@ jobs: '" + linux_loongarch64_qemu: + # To enable this workflow on a fork, comment out: + if: github.repository == 'numpy/numpy' + runs-on: ubuntu-24.04 + continue-on-error: true + strategy: + fail-fast: false + matrix: + BUILD_PROP: + - [ + "loongarch64", + "loongarch64-linux-gnu", + "cnclarechen/numpy-loong64-debian:v1", + "-Dallow-noblas=true", + "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", + "loong64" + ] + env: + TOOLCHAIN_NAME: ${{ matrix.BUILD_PROP[1] }} + DOCKER_CONTAINER: ${{ matrix.BUILD_PROP[2] }} + MESON_OPTIONS: ${{ matrix.BUILD_PROP[3] }} + RUNTIME_TEST_FILTER: ${{ matrix.BUILD_PROP[4] }} + ARCH: ${{ matrix.BUILD_PROP[5] }} + TERM: xterm-256color + + name: "${{ matrix.BUILD_PROP[0] }}" + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + submodules: recursive + fetch-tags: true + + - name: Initialize binfmt_misc for qemu-user-static + run: | + docker run --rm --privileged loongcr.lcpu.dev/multiarch/archlinux --reset -p yes + + - name: Install GCC cross-compilers + run: | + sudo apt update + sudo apt install -y ninja-build gcc-14-${TOOLCHAIN_NAME} g++-14-${TOOLCHAIN_NAME} gfortran-14-${TOOLCHAIN_NAME} + + - name: Cache docker container + uses: actions/cache@v4.1.2 + id: container-cache + with: + path: ~/docker_${{ matrix.BUILD_PROP[1] }} + key: container-${{ runner.os }}-${{ matrix.BUILD_PROP[1] }}-${{ matrix.BUILD_PROP[2] }}-${{ hashFiles('requirements/build_requirements.txt') }} + + - name: Creates new container + if: steps.container-cache.outputs.cache-hit != 'true' + run: | + docker run --platform=linux/${ARCH} --name the_container --interactive \ + -v /:/host -v $(pwd):/numpy ${DOCKER_CONTAINER} /bin/bash -c " + mkdir -p /lib64 && ln -s /host/lib64/ld-* /lib64/ && + ln -s /host/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu && + ln -s /host/usr/${TOOLCHAIN_NAME} /usr/${TOOLCHAIN_NAME} && + ln -s /host/usr/lib/gcc-cross/${TOOLCHAIN_NAME} /usr/lib/gcc/${TOOLCHAIN_NAME} && + rm -f /usr/bin/gcc && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gcc-14 /usr/bin/gcc && + rm -f /usr/bin/g++ && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-g++-14 /usr/bin/g++ && + rm -f /usr/bin/gfortran && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gfortran-14 /usr/bin/gfortran && + rm -f /usr/bin/ar && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ar /usr/bin/ar && + rm -f /usr/bin/as && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-as /usr/bin/as && + rm -f /usr/bin/ld && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld /usr/bin/ld && + rm -f /usr/bin/ld.bfd && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld.bfd /usr/bin/ld.bfd && + rm -f /usr/bin/ninja && ln -s /host/usr/bin/ninja /usr/bin/ninja && + git config --global --add safe.directory /numpy && + python -m pip install --break-system-packages -r /numpy/requirements/build_requirements.txt && + python -m pip install --break-system-packages pytest pytest-xdist hypothesis typing_extensions + " + docker commit the_container the_container + mkdir -p "~/docker_${TOOLCHAIN_NAME}" + docker save -o "~/docker_${TOOLCHAIN_NAME}/the_container.tar" the_container + + - name: Load container from cache + if: steps.container-cache.outputs.cache-hit == 'true' + run: docker load -i "~/docker_${TOOLCHAIN_NAME}/the_container.tar" + + - name: Meson Build + run: | + docker run --rm --platform=linux/${ARCH} -e "TERM=xterm-256color" \ 
+ -v $(pwd):/numpy -v /:/host the_container \ + /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' + cd /numpy/ && spin build --clean -- ${MESON_OPTIONS} + '" + + - name: Meson Log + if: always() + run: 'cat build/meson-logs/meson-log.txt' + + - name: Run Tests + run: | + docker run --rm --platform=linux/${ARCH} -e "TERM=xterm-256color" \ + -v $(pwd):/numpy -v /:/host the_container \ + /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' + cd /numpy && spin test -- -k \"${RUNTIME_TEST_FILTER}\" + '" diff --git a/meson.options b/meson.options index 844fa4f5a2e7..1be05d324756 100644 --- a/meson.options +++ b/meson.options @@ -35,6 +35,7 @@ option('test-simd', type: 'array', 'VSX', 'VSX2', 'VSX3', 'VSX4', 'NEON', 'ASIMD', 'VX', 'VXE', 'VXE2', + 'LSX', ], description: 'Specify a list of CPU features to be tested against NumPy SIMD interface') option('test-simd-args', type: 'string', value: '', diff --git a/meson_cpu/loongarch64/meson.build b/meson_cpu/loongarch64/meson.build new file mode 100644 index 000000000000..570e3bfcda01 --- /dev/null +++ b/meson_cpu/loongarch64/meson.build @@ -0,0 +1,8 @@ +source_root = meson.project_source_root() +mod_features = import('features') + +LSX = mod_features.new( + 'LSX', 1, args: ['-mlsx'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_lsx.c')[0] +) +LOONGARCH64_FEATURES = {'LSX': LSX} diff --git a/meson_cpu/main_config.h.in b/meson_cpu/main_config.h.in index d89e62f5f66b..4de867d1f325 100644 --- a/meson_cpu/main_config.h.in +++ b/meson_cpu/main_config.h.in @@ -389,4 +389,8 @@ #ifdef @P@HAVE_RVV #include <riscv_vector.h> #endif + +#ifdef @P@HAVE_LSX + #include <lsxintrin.h> +#endif #endif // @P@_CPU_DISPATCHER_CONF_H_ diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build index 3afc54cae415..e5b6d0fbe7be 100644 --- a/meson_cpu/meson.build +++ b/meson_cpu/meson.build @@ -76,6 +76,7 @@ subdir('ppc64') subdir('s390x') subdir('arm') subdir('riscv64') +subdir('loongarch64') CPU_FEATURES = {} CPU_FEATURES += ARM_FEATURES @@ -83,6 +84,7 @@ CPU_FEATURES += X86_FEATURES CPU_FEATURES += PPC64_FEATURES CPU_FEATURES += S390X_FEATURES CPU_FEATURES += RV64_FEATURES +CPU_FEATURES += LOONGARCH64_FEATURES # Parse the requested baseline (CPU_CONF_BASELINE) and dispatch features # (CPU_CONF_DISPATCH).
@@ -97,6 +99,7 @@ min_features = { 'aarch64': [ASIMD], 'riscv64': [], 'wasm32': [], + 'loongarch64': [LSX], }.get(cpu_family, []) if host_machine.endian() == 'little' and cpu_family == 'ppc64' min_features = [VSX2] @@ -112,6 +115,7 @@ max_features_dict = { 'aarch64': ARM_FEATURES, 'riscv64': RV64_FEATURES, 'wasm32': {}, + 'loongarch64': LOONGARCH64_FEATURES, }.get(cpu_family, {}) max_features = [] foreach fet_name, fet_obj : max_features_dict diff --git a/numpy/_core/include/numpy/npy_cpu.h b/numpy/_core/include/numpy/npy_cpu.h index 15f9f12931c8..67a9d0b2cdc8 100644 --- a/numpy/_core/include/numpy/npy_cpu.h +++ b/numpy/_core/include/numpy/npy_cpu.h @@ -109,8 +109,8 @@ #elif __riscv_xlen == 32 #define NPY_CPU_RISCV32 #endif -#elif defined(__loongarch__) - #define NPY_CPU_LOONGARCH +#elif defined(__loongarch64) + #define NPY_CPU_LOONGARCH64 #elif defined(__EMSCRIPTEN__) /* __EMSCRIPTEN__ is defined by emscripten: an LLVM-to-Web compiler */ #define NPY_CPU_WASM diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 7373c8ea46b0..79ad7d99497e 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -97,6 +97,10 @@ if use_svml endif endif +if host_machine.cpu_family() == 'loongarch64' + add_project_arguments(['-DHWY_COMPILE_ONLY_SCALAR'], language: ['cpp']) +endif + use_highway = not get_option('disable-highway') if use_highway and not fs.exists('src/highway/README.md') error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.') @@ -880,6 +884,7 @@ foreach gen_mtargets : [ ASIMD, NEON, VSX3, VSX2, VXE, VX, + LSX, ] ], [ @@ -890,6 +895,7 @@ foreach gen_mtargets : [ NEON, VSX4, VSX2, VX, + LSX, ] ], [ @@ -900,6 +906,7 @@ foreach gen_mtargets : [ VSX3, VSX2, NEON, VXE, VX, + LSX, ] ], [ @@ -916,7 +923,8 @@ foreach gen_mtargets : [ AVX512_SKX, [AVX2, FMA3], VSX4, VSX2, NEON_VFPV4, - VXE + VXE, + LSX, ] ], [ @@ -927,6 +935,7 @@ foreach gen_mtargets : [ AVX512_SKX, AVX2, SSE2, VSX2, VX, + LSX, ] ], [ @@ -937,6 +946,7 @@ foreach gen_mtargets : [ AVX512_SKX, AVX2, SSE2, VSX2, VXE, VX, + LSX, ] ], [ @@ -954,6 +964,7 @@ foreach gen_mtargets : [ VSX4, VSX3, VSX2, NEON_VFPV4, VXE2, VXE, + LSX, ] ], [ @@ -968,7 +979,8 @@ foreach gen_mtargets : [ ASIMD, NEON, AVX512_SKX, AVX2, SSE2, VSX2, - VXE, VX + VXE, VX, + LSX, ] ], [ @@ -978,7 +990,8 @@ foreach gen_mtargets : [ SSE41, SSE2, VSX2, ASIMD, NEON, - VXE, VX + VXE, VX, + LSX, ] ], [ @@ -988,6 +1001,7 @@ foreach gen_mtargets : [ SSE41, SSE2, VSX2, ASIMD, NEON, + LSX, ] ], [ @@ -998,6 +1012,7 @@ foreach gen_mtargets : [ ASIMD, NEON, VSX3, VSX2, VXE, VX, + LSX, ] ], [ @@ -1008,6 +1023,7 @@ foreach gen_mtargets : [ NEON, VSX2, VX, + LSX, ] ], ] diff --git a/numpy/_core/src/_simd/_simd.dispatch.c.src b/numpy/_core/src/_simd/_simd.dispatch.c.src index 02f84fa5592c..2b47c84706f5 100644 --- a/numpy/_core/src/_simd/_simd.dispatch.c.src +++ b/numpy/_core/src/_simd/_simd.dispatch.c.src @@ -30,7 +30,7 @@ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# - * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + * #shr_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ #if @simd_sup@ diff --git a/numpy/_core/src/_simd/_simd_easyintrin.inc b/numpy/_core/src/_simd/_simd_easyintrin.inc index e300e54843a0..65c83279898d 100644 --- a/numpy/_core/src/_simd/_simd_easyintrin.inc +++ b/numpy/_core/src/_simd/_simd_easyintrin.inc @@ -243,7 +243,6 @@ NPY_EXPAND(FN(8, __VA_ARGS__)) #define 
SIMD__IMPL_COUNT_15(FN, ...) \ - NPY_EXPAND(FN(0, __VA_ARGS__)) \ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) #define SIMD__IMPL_COUNT_16(FN, ...) \ @@ -251,7 +250,6 @@ NPY_EXPAND(FN(16, __VA_ARGS__)) #define SIMD__IMPL_COUNT_31(FN, ...) \ - NPY_EXPAND(FN(0, __VA_ARGS__)) \ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) #define SIMD__IMPL_COUNT_32(FN, ...) \ @@ -267,7 +265,6 @@ NPY_EXPAND(FN(48, __VA_ARGS__)) #define SIMD__IMPL_COUNT_63(FN, ...) \ - NPY_EXPAND(FN(0, __VA_ARGS__)) \ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) #define SIMD__IMPL_COUNT_64(FN, ...) \ diff --git a/numpy/_core/src/common/npy_cpu_features.c b/numpy/_core/src/common/npy_cpu_features.c index fe392b7bea0c..69e426873c41 100644 --- a/numpy/_core/src/common/npy_cpu_features.c +++ b/numpy/_core/src/common/npy_cpu_features.c @@ -125,7 +125,8 @@ static struct { {NPY_CPU_FEATURE_ASIMDDP, "ASIMDDP"}, {NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"}, {NPY_CPU_FEATURE_SVE, "SVE"}, - {NPY_CPU_FEATURE_RVV, "RVV"}}; + {NPY_CPU_FEATURE_RVV, "RVV"}, + {NPY_CPU_FEATURE_LSX, "LSX"}}; NPY_VISIBILITY_HIDDEN PyObject * @@ -665,6 +666,25 @@ npy__cpu_init_features(void) npy__cpu_have[NPY_CPU_FEATURE_VX] = 1; } +/***************** LoongArch ******************/ + +#elif defined(__loongarch64) + +#include <sys/auxv.h> +#include <asm/hwcap.h> + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); + unsigned int hwcap = getauxval(AT_HWCAP); + + if ((hwcap & HWCAP_LOONGARCH_LSX)) { + npy__cpu_have[NPY_CPU_FEATURE_LSX] = 1; + return; + } +} + /***************** ARM ******************/ diff --git a/numpy/_core/src/common/npy_cpu_features.h b/numpy/_core/src/common/npy_cpu_features.h index d1e9d7e60d9f..7d6a406f8789 100644 --- a/numpy/_core/src/common/npy_cpu_features.h +++ b/numpy/_core/src/common/npy_cpu_features.h @@ -91,7 +91,7 @@ enum npy_cpu_features // IBM/ZARCH NPY_CPU_FEATURE_VX = 350, - + // Vector-Enhancements Facility 1 NPY_CPU_FEATURE_VXE = 351, @@ -101,6 +101,9 @@ enum npy_cpu_features // RISC-V NPY_CPU_FEATURE_RVV = 400, + // LOONGARCH + NPY_CPU_FEATURE_LSX = 500, + NPY_CPU_FEATURE_MAX }; @@ -113,7 +116,7 @@ enum npy_cpu_features * - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features * - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features * - * It will set a RuntimeError when + * It will set a RuntimeError when * - CPU baseline features from the build are not supported at runtime * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature * - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are @@ -122,14 +125,14 @@ enum npy_cpu_features * by the machine or build * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was * not built with any feature optimization support - * + * * It will set an ImportWarning when: * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported * by the machine or build * - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to * disable/enable a feature when the project was not built with any feature * optimization support - * + * * return 0 on success otherwise return -1 */ NPY_VISIBILITY_HIDDEN int diff --git a/numpy/_core/src/common/simd/intdiv.h b/numpy/_core/src/common/simd/intdiv.h index d843eaf4c9d9..0284d49d23bb 100644 --- a/numpy/_core/src/common/simd/intdiv.h +++ b/numpy/_core/src/common/simd/intdiv.h @@ -216,6 +216,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1)); divisor.val[2] =
npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2)); +#elif defined(NPY_HAVE_LSX) + divisor.val[0] = npyv_setall_u8(m); + divisor.val[1] = npyv_setall_u8(sh1); + divisor.val[2] = npyv_setall_u8(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -225,7 +229,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d); NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) { -#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 +#if defined(NPY_HAVE_SSE2) // SSE/AVX2/AVX512 npyv_s16x3 p = npyv_divisor_s16(d); npyv_s8x3 r; r.val[0] = npyv_reinterpret_s8_s16(p.val[0]); @@ -249,7 +253,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) npyv_s8x3 divisor; divisor.val[0] = npyv_setall_s8(m); divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0); - #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) + #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s8(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s8(-sh); @@ -291,6 +295,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1)); divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2)); +#elif defined(NPY_HAVE_LSX) + divisor.val[1] = npyv_setall_u16(sh1); + divisor.val[2] = npyv_setall_u16(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -321,6 +328,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) divisor.val[1] = npyv_setall_s16(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s16(-sh); +#elif defined(NPY_HAVE_LSX) + divisor.val[1] = npyv_setall_s16(sh); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -358,6 +367,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1)); divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2)); +#elif defined(NPY_HAVE_LSX) + divisor.val[1] = npyv_setall_u32(sh1); + divisor.val[2] = npyv_setall_u32(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -393,6 +405,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) divisor.val[1] = npyv_setall_s32(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s32(-sh); +#elif defined(NPY_HAVE_LSX) + divisor.val[1] = npyv_setall_s32(sh); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -427,6 +441,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_u64(sh1); divisor.val[2] = npyv_set_u64(sh2); + #elif defined(NPY_HAVE_LSX) + divisor.val[1] = npyv_setall_u64(sh1); + divisor.val[2] = npyv_setall_u64(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -465,6 +482,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) divisor.val[2] = npyv_setall_s64(d < 0 ? 
-1 : 0); // sign of divisor #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_s64(sh); + #elif defined(NPY_HAVE_LSX) + divisor.val[1] = npyv_setall_s64(sh); #else #error "please initialize the shifting operand for the new architecture" #endif diff --git a/numpy/_core/src/common/simd/lsx/arithmetic.h b/numpy/_core/src/common/simd/lsx/arithmetic.h new file mode 100644 index 000000000000..33aad40871bd --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/arithmetic.h @@ -0,0 +1,257 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_ARITHMETIC_H +#define _NPY_SIMD_LSX_ARITHMETIC_H + +/*************************** + * Addition + ***************************/ +// non-saturated +#define npyv_add_u8 __lsx_vadd_b +#define npyv_add_s8 __lsx_vadd_b +#define npyv_add_u16 __lsx_vadd_h +#define npyv_add_s16 __lsx_vadd_h +#define npyv_add_u32 __lsx_vadd_w +#define npyv_add_s32 __lsx_vadd_w +#define npyv_add_u64 __lsx_vadd_d +#define npyv_add_s64 __lsx_vadd_d +#define npyv_add_f32 __lsx_vfadd_s +#define npyv_add_f64 __lsx_vfadd_d + +// saturated +#define npyv_adds_u8 __lsx_vsadd_bu +#define npyv_adds_s8 __lsx_vsadd_b +#define npyv_adds_u16 __lsx_vsadd_hu +#define npyv_adds_s16 __lsx_vsadd_h +#define npyv_adds_u32 __lsx_vsadd_wu +#define npyv_adds_s32 __lsx_vsadd_w +#define npyv_adds_u64 __lsx_vsadd_du +#define npyv_adds_s64 __lsx_vsadd_d + + +/*************************** + * Subtraction + ***************************/ +// non-saturated +#define npyv_sub_u8 __lsx_vsub_b +#define npyv_sub_s8 __lsx_vsub_b +#define npyv_sub_u16 __lsx_vsub_h +#define npyv_sub_s16 __lsx_vsub_h +#define npyv_sub_u32 __lsx_vsub_w +#define npyv_sub_s32 __lsx_vsub_w +#define npyv_sub_u64 __lsx_vsub_d +#define npyv_sub_s64 __lsx_vsub_d +#define npyv_sub_f32 __lsx_vfsub_s +#define npyv_sub_f64 __lsx_vfsub_d + +// saturated +#define npyv_subs_u8 __lsx_vssub_bu +#define npyv_subs_s8 __lsx_vssub_b +#define npyv_subs_u16 __lsx_vssub_hu +#define npyv_subs_s16 __lsx_vssub_h +#define npyv_subs_u32 __lsx_vssub_wu +#define npyv_subs_s32 __lsx_vssub_w +#define npyv_subs_u64 __lsx_vssub_du +#define npyv_subs_s64 __lsx_vssub_d + + +/*************************** + * Multiplication + ***************************/ +// non-saturated +#define npyv_mul_u8 __lsx_vmul_b +#define npyv_mul_s8 __lsx_vmul_b +#define npyv_mul_u16 __lsx_vmul_h +#define npyv_mul_s16 __lsx_vmul_h +#define npyv_mul_u32 __lsx_vmul_w +#define npyv_mul_s32 __lsx_vmul_w +#define npyv_mul_f32 __lsx_vfmul_s +#define npyv_mul_f64 __lsx_vfmul_d + + +/*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = __lsx_vmuh_bu(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = __lsx_vsub_b(a, mulhi); + q = __lsx_vsrl_b(q, divisor.val[1]); + q = __lsx_vadd_b(mulhi, q); + q = __lsx_vsrl_b(q, divisor.val[2]); + + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor); +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + __m128i mulhi = __lsx_vmuh_b(a, divisor.val[0]); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m128i q = __lsx_vsra_b(__lsx_vadd_b(a, mulhi), divisor.val[1]); + q = 
__lsx_vsub_b(q, __lsx_vsrai_b(a, 7)); + q = __lsx_vsub_b(__lsx_vxor_v(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = __lsx_vmuh_hu(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = __lsx_vsub_h(a, mulhi); + q = __lsx_vsrl_h(q, divisor.val[1]); + q = __lsx_vadd_h(mulhi, q); + q = __lsx_vsrl_h(q, divisor.val[2]); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + // high part of signed multiplication + __m128i mulhi = __lsx_vmuh_h(a, divisor.val[0]); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + __m128i q = __lsx_vsra_h(__lsx_vadd_h(a, mulhi), divisor.val[1]); + q = __lsx_vsub_h(q, __lsx_vsrai_h(a, 15)); + q = __lsx_vsub_h(__lsx_vxor_v(q, divisor.val[2]), divisor.val[2]); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = __lsx_vmuh_wu(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = __lsx_vsub_w(a, mulhi); + q = __lsx_vsrl_w(q, divisor.val[1]); + q = __lsx_vadd_w(mulhi, q); + q = __lsx_vsrl_w(q, divisor.val[2]); + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + __m128i mulhi = __lsx_vmuh_w(a, divisor.val[0]); + __m128i q = __lsx_vsra_w(__lsx_vadd_w(a, mulhi), divisor.val[1]); + q = __lsx_vsub_w(q, __lsx_vsrai_w(a, 31)); + q = __lsx_vsub_w(__lsx_vxor_v(q, divisor.val[2]), divisor.val[2]);; + return q; +} +// returns the high 64 bits of unsigned 64-bit multiplication +// xref https://stackoverflow.com/a/28827013 +NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b) +{ + __m128i hi = __lsx_vmuh_du(a, b); + return hi; +} +// divide each unsigned 64-bit element by a precomputed divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + // high part of unsigned multiplication + __m128i mulhi = __lsx_vmuh_du(a, divisor.val[0]); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + __m128i q = __lsx_vsub_d(a, mulhi); + q = __lsx_vsrl_d(q, divisor.val[1]); + q = __lsx_vadd_d(mulhi, q); + q = __lsx_vsrl_d(q, divisor.val[2]); + return q; +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + __m128i mulhi = __lsx_vmuh_d(a, divisor.val[0]); + __m128i q = __lsx_vsra_d(__lsx_vadd_d(a, mulhi), divisor.val[1]); + q = __lsx_vsub_d(q, __lsx_vsrai_d(a, 63)); + q = __lsx_vsub_d(__lsx_vxor_v(q, divisor.val[2]), divisor.val[2]); + return q; +} +/*************************** + * Division + ***************************/ +#define npyv_div_f32 __lsx_vfdiv_s +#define npyv_div_f64 __lsx_vfdiv_d +/*************************** + * FUSED + ***************************/ +// multiply and add, a*b + c +#define npyv_muladd_f32 __lsx_vfmadd_s +#define npyv_muladd_f64 __lsx_vfmadd_d +// multiply and subtract, a*b - c +#define npyv_mulsub_f32 __lsx_vfmsub_s +#define npyv_mulsub_f64 __lsx_vfmsub_d +// negate multiply and add, -(a*b) + c equal to -(a*b - c) 
+#define npyv_nmuladd_f32 __lsx_vfnmsub_s +#define npyv_nmuladd_f64 __lsx_vfnmsub_d +// negate multiply and subtract, -(a*b) - c equal to -(a*b +c) +#define npyv_nmulsub_f32 __lsx_vfnmadd_s +#define npyv_nmulsub_f64 __lsx_vfnmadd_d + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + return __lsx_vfmadd_s(a, b, (__m128)__lsx_vxor_v((__m128i)c, (__m128i)(v4f32){-0.0, 0.0, -0.0, 0.0})); + + } +NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + return __lsx_vfmadd_d(a, b, (__m128d)__lsx_vxor_v((__m128i)c, (__m128i)(v2f64){-0.0, 0.0})); + + } + +/*************************** + * Summation + ***************************/ +// reduce sum across vector +NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) +{ + __m128i t1 = __lsx_vhaddw_du_wu(a, a); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return __lsx_vpickve2gr_wu(t2, 0); +} + +NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a) +{ + __m128i t = __lsx_vhaddw_qu_du(a, a); + return __lsx_vpickve2gr_du(t, 0); +} + +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ + __m128 ft = __lsx_vfadd_s(a, (__m128)__lsx_vbsrl_v((__m128i)a, 8)); + ft = __lsx_vfadd_s(ft, (__m128)__lsx_vbsrl_v(ft, 4)); + return ft[0]; +} + +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ + __m128d fd = __lsx_vfadd_d(a, (__m128d)__lsx_vreplve_d((__m128i)a, 1)); + return fd[0]; +} + +// expand the source vector and performs sum reduce +NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) +{ + __m128i first = __lsx_vhaddw_hu_bu((__m128i)a,(__m128i)a); + __m128i second = __lsx_vhaddw_wu_hu((__m128i)first,(__m128i)first); + __m128i third = __lsx_vhaddw_du_wu((__m128i)second,(__m128i)second); + __m128i four = __lsx_vhaddw_qu_du((__m128i)third,(__m128i)third); + return four[0]; +} + +NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) +{ + __m128i t1 = __lsx_vhaddw_wu_hu(a, a); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return __lsx_vpickve2gr_w(t3, 0); +} + +#endif // _NPY_SIMD_LSX_ARITHMETIC_H diff --git a/numpy/_core/src/common/simd/lsx/conversion.h b/numpy/_core/src/common/simd/lsx/conversion.h new file mode 100644 index 000000000000..72c22e90701c --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/conversion.h @@ -0,0 +1,100 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_CVT_H +#define _NPY_SIMD_LSX_CVT_H + +// convert mask types to integer types +#define npyv_cvt_u8_b8(BL) BL +#define npyv_cvt_s8_b8(BL) BL +#define npyv_cvt_u16_b16(BL) BL +#define npyv_cvt_s16_b16(BL) BL +#define npyv_cvt_u32_b32(BL) BL +#define npyv_cvt_s32_b32(BL) BL +#define npyv_cvt_u64_b64(BL) BL +#define npyv_cvt_s64_b64(BL) BL +#define npyv_cvt_f32_b32(BL) (__m128)(BL) +#define npyv_cvt_f64_b64(BL) (__m128d)(BL) + +// convert integer types to mask types +#define npyv_cvt_b8_u8(A) A +#define npyv_cvt_b8_s8(A) A +#define npyv_cvt_b16_u16(A) A +#define npyv_cvt_b16_s16(A) A +#define npyv_cvt_b32_u32(A) A +#define npyv_cvt_b32_s32(A) A +#define npyv_cvt_b64_u64(A) A +#define npyv_cvt_b64_s64(A) A +#define npyv_cvt_b32_f32(A) (__m128i)(A) +#define npyv_cvt_b64_f64(A) (__m128i)(A) + +// convert boolean vector to integer bitfield +NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) +{ return (npy_uint16)__lsx_vmsknz_b(a)[0]; } +NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) +{ + __m128i b = __lsx_vsat_hu(a, 7); + __m128i pack = __lsx_vpickev_b(b, b); + return (npy_uint8)__lsx_vmsknz_b(pack)[0]; +} 
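+// Descriptive note (editor comment, not part of the upstream patch): npyv_tobits_b32/b64 below
+// rely on __lsx_vmskltz_w/__lsx_vmskltz_d, which gather the sign bit of each 32/64-bit lane into
+// the low bits of element 0; since boolean lanes are all-ones or all-zeros, this yields the
+// per-lane bitfield directly, mirroring what npyv_tobits_b8/b16 above obtain via __lsx_vmsknz_b.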
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) +{ + __m128i b = __lsx_vmskltz_w(a); + v4i32 ret = (v4i32)b; + return ret[0]; +} + +NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) +{ + __m128i b = __lsx_vmskltz_d(a); + v2i64 ret = (v2i64)b; + return ret[0]; +} + +// expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) { + npyv_u16x2 r; + r.val[0] = __lsx_vsllwil_hu_bu(data, 0); + r.val[1] = __lsx_vexth_hu_bu(data); + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { + npyv_u32x2 r; + r.val[0] = __lsx_vsllwil_wu_hu(data, 0); + r.val[1] = __lsx_vexth_wu_hu(data); + return r; +} + +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return __lsx_vpickev_b(__lsx_vsat_h(b, 7),__lsx_vsat_h(a, 7)); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + __m128i ab = __lsx_vpickev_h(__lsx_vsat_w(b, 15), __lsx_vsat_w(a, 15)); + __m128i cd = __lsx_vpickev_h(__lsx_vsat_w(d, 15), __lsx_vsat_w(c, 15)); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + __m128i ab = __lsx_vpickev_h(__lsx_vsat_w(b, 15), __lsx_vsat_w(a, 15)); + __m128i cd = __lsx_vpickev_h(__lsx_vsat_w(d, 15), __lsx_vsat_w(c, 15)); + __m128i ef = __lsx_vpickev_h(__lsx_vsat_w(f, 15), __lsx_vsat_w(e, 15)); + __m128i gh = __lsx_vpickev_h(__lsx_vsat_w(h, 15), __lsx_vsat_w(g, 15)); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + +// round to nearest integer (assuming even) +#define npyv_round_s32_f32 __lsx_vftintrne_w_s +NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) +{ + return __lsx_vftintrne_w_d(b, a); +} +#endif // _NPY_SIMD_LSX_CVT_H diff --git a/numpy/_core/src/common/simd/lsx/lsx.h b/numpy/_core/src/common/simd/lsx/lsx.h new file mode 100644 index 000000000000..80017296fc98 --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/lsx.h @@ -0,0 +1,77 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_LSX_H +#define _NPY_SIMD_LSX_LSX_H + +#define NPY_SIMD 128 +#define NPY_SIMD_WIDTH 16 +#define NPY_SIMD_F64 1 +#define NPY_SIMD_F32 1 +#define NPY_SIMD_FMA3 1 +#define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 + +typedef __m128i npyv_u8; +typedef __m128i npyv_s8; +typedef __m128i npyv_u16; +typedef __m128i npyv_s16; +typedef __m128i npyv_u32; +typedef __m128i npyv_s32; +typedef __m128i npyv_u64; +typedef __m128i npyv_s64; +typedef __m128 npyv_f32; +typedef __m128d npyv_f64; + +typedef __m128i npyv_b8; +typedef __m128i npyv_b16; +typedef __m128i npyv_b32; +typedef __m128i npyv_b64; + +typedef struct { __m128i val[2]; } npyv_m128ix2; +typedef npyv_m128ix2 npyv_u8x2; +typedef npyv_m128ix2 npyv_s8x2; +typedef npyv_m128ix2 npyv_u16x2; +typedef npyv_m128ix2 npyv_s16x2; +typedef npyv_m128ix2 npyv_u32x2; +typedef npyv_m128ix2 npyv_s32x2; +typedef npyv_m128ix2 npyv_u64x2; +typedef npyv_m128ix2 npyv_s64x2; + +typedef struct { __m128i val[3]; } npyv_m128ix3; +typedef npyv_m128ix3 npyv_u8x3; +typedef npyv_m128ix3 npyv_s8x3; +typedef npyv_m128ix3 npyv_u16x3; +typedef npyv_m128ix3 npyv_s16x3; +typedef npyv_m128ix3 npyv_u32x3; +typedef npyv_m128ix3 npyv_s32x3; +typedef npyv_m128ix3 npyv_u64x3; +typedef npyv_m128ix3 npyv_s64x3; + +typedef struct { __m128 val[2]; } npyv_f32x2; 
+typedef struct { __m128d val[2]; } npyv_f64x2; +typedef struct { __m128 val[3]; } npyv_f32x3; +typedef struct { __m128d val[3]; } npyv_f64x3; + +#define npyv_nlanes_u8 16 +#define npyv_nlanes_s8 16 +#define npyv_nlanes_u16 8 +#define npyv_nlanes_s16 8 +#define npyv_nlanes_u32 4 +#define npyv_nlanes_s32 4 +#define npyv_nlanes_u64 2 +#define npyv_nlanes_s64 2 +#define npyv_nlanes_f32 4 +#define npyv_nlanes_f64 2 + + +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" +#include "math.h" + +#endif //#ifndef _NPY_SIMD_LSX_LSX_H \ No newline at end of file diff --git a/numpy/_core/src/common/simd/lsx/math.h b/numpy/_core/src/common/simd/lsx/math.h new file mode 100644 index 000000000000..6109fb4e8260 --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/math.h @@ -0,0 +1,228 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_MATH_H +#define _NPY_SIMD_LSX_MATH_H +/*************************** + * Elementary + ***************************/ +// Square root +#define npyv_sqrt_f32 __lsx_vfsqrt_s +#define npyv_sqrt_f64 __lsx_vfsqrt_d + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ return __lsx_vfrecip_s(a); } +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ return __lsx_vfrecip_d(a); } + +// Absolute +NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) +{ + return (npyv_f32)__lsx_vbitclri_w(a, 0x1F); +} +NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) +{ + return (npyv_f64)__lsx_vbitclri_d(a, 0x3F); +} + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return __lsx_vfmul_s(a, a); } +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ return __lsx_vfmul_d(a, a); } + +// Maximum, natively mapping with no guarantees to handle NaN. +#define npyv_max_f32 __lsx_vfmax_s +#define npyv_max_f64 __lsx_vfmax_d +// Maximum, supports IEEE floating-point arithmetic (IEC 60559), +// - If one of the two vectors contains NaN, the equivalent element of the other vector is set +// - Only if both corresponded elements are NaN, NaN is set. +NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) +{ + return __lsx_vfmax_s(a, b); +} +NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) +{ + return __lsx_vfmax_d(a, b); +} +// If any of corresponded element is NaN, NaN is set. +NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) +{ + __m128i mask = __lsx_vand_v(npyv_notnan_f32(a), npyv_notnan_f32(b)); + __m128 max = __lsx_vfmax_s(a, b); + return npyv_select_f32(mask, max, (__m128){NAN, NAN, NAN, NAN}); +} +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + __m128i mask = __lsx_vand_v(npyv_notnan_f64(a), npyv_notnan_f64(b)); + __m128d max = __lsx_vfmax_d(a, b); + return npyv_select_f64(mask, max, (__m128d){NAN, NAN}); +} + +// Maximum, integer operations +#define npyv_max_u8 __lsx_vmax_bu +#define npyv_max_s8 __lsx_vmax_b +#define npyv_max_u16 __lsx_vmax_hu +#define npyv_max_s16 __lsx_vmax_h +#define npyv_max_u32 __lsx_vmax_wu +#define npyv_max_s32 __lsx_vmax_w +#define npyv_max_u64 __lsx_vmax_du +#define npyv_max_s64 __lsx_vmax_d + +// Minimum, natively mapping with no guarantees to handle NaN. +#define npyv_min_f32 __lsx_vfmin_s +#define npyv_min_f64 __lsx_vfmin_d + +// Minimum, supports IEEE floating-point arithmetic (IEC 60559), +// - If one of the two vectors contains NaN, the equivalent element of the other vector is set +// - Only if both corresponded elements are NaN, NaN is set. 
+NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) +{ + return __lsx_vfmin_s(a, b); +} +NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) +{ + return __lsx_vfmin_d(a, b); +} +NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) +{ + __m128i mask = __lsx_vand_v(npyv_notnan_f32(a), npyv_notnan_f32(b)); + __m128 min = __lsx_vfmin_s(a, b); + return npyv_select_f32(mask, min, (__m128){NAN, NAN, NAN, NAN}); +} +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + __m128i mask = __lsx_vand_v(npyv_notnan_f64(a), npyv_notnan_f64(b)); + __m128d min = __lsx_vfmin_d(a, b); + return npyv_select_f64(mask, min, (__m128d){NAN, NAN}); +} + +// Minimum, integer operations +#define npyv_min_u8 __lsx_vmin_bu +#define npyv_min_s8 __lsx_vmin_b +#define npyv_min_u16 __lsx_vmin_hu +#define npyv_min_s16 __lsx_vmin_h +#define npyv_min_u32 __lsx_vmin_wu +#define npyv_min_s32 __lsx_vmin_w +#define npyv_min_u64 __lsx_vmin_du +#define npyv_min_s64 __lsx_vmin_d + +// reduce min&max for ps & pd +#define NPY_IMPL_LSX_REDUCE_MINMAX(INTRIN, INF, INF64) \ + NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \ + { \ + __m128i vector2 = {0, 0}; \ + v4i32 index1 = {2, 3, 0, 0}; \ + v4i32 index2 = {1, 0, 0, 0}; \ + __m128 v64 = __lsx_vf##INTRIN##_s(a, (__m128)__lsx_vshuf_w((__m128i)index1, (__m128i)vector2, (__m128i)a)); \ + __m128 v32 = __lsx_vf##INTRIN##_s(v64, (__m128)__lsx_vshuf_w((__m128i)index2, (__m128i)vector2, (__m128i)v64)); \ + return v32[0]; \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \ + const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \ + return pnan.f; \ + } \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \ + return a[0]; \ + } \ + a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \ + { \ + __m128i index2 = {1, 0}; \ + __m128d v64 = __lsx_vf##INTRIN##_d(a, (__m128d)__lsx_vshuf_d(index2, (__m128i){0, 0}, (__m128i)a)); \ + return (double)v64[0]; \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \ + return a[0]; \ + } \ + a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \ + return npyv_reduce_##INTRIN##_f64(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \ + const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \ + return pnan.d; \ + } \ + return npyv_reduce_##INTRIN##_f64(a); \ + } + +NPY_IMPL_LSX_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000) +NPY_IMPL_LSX_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000) +#undef NPY_IMPL_LSX_REDUCE_MINMAX + +// reduce min&max for 8&16&32&64-bits +#define NPY_IMPL_LSX_REDUCE_MINMAX(STYPE, INTRIN, TFLAG) \ + NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m128i a) \ + { \ + __m128i vector2 = {0, 0}; \ + v4i32 index1 = {2, 3, 0, 0}; \ + __m128i v64 = npyv_##INTRIN##64(a, __lsx_vshuf_w((__m128i)index1, (__m128i)vector2, a)); \ + return (STYPE##64)__lsx_vpickve2gr_d##TFLAG(v64, 0); \ + } \ + NPY_FINLINE STYPE##32 
npyv_reduce_##INTRIN##32(__m128i a) \ + { \ + __m128i vector2 = {0, 0}; \ + v4i32 index1 = {2, 3, 0, 0}; \ + v4i32 index2 = {1, 0, 0, 0}; \ + __m128i v64 = npyv_##INTRIN##32(a, __lsx_vshuf_w((__m128i)index1, (__m128i)vector2, a)); \ + __m128i v32 = npyv_##INTRIN##32(v64, __lsx_vshuf_w((__m128i)index2, (__m128i)vector2, v64)); \ + return (STYPE##32)__lsx_vpickve2gr_w##TFLAG(v32, 0); \ + } \ + NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m128i a) \ + { \ + __m128i vector2 = {0, 0}; \ + v4i32 index1 = {2, 3, 0, 0}; \ + v4i32 index2 = {1, 0, 0, 0}; \ + v8i16 index3 = {1, 0, 0, 0, 4, 5, 6, 7 }; \ + __m128i v64 = npyv_##INTRIN##16(a, __lsx_vshuf_w((__m128i)index1, (__m128i)vector2, a)); \ + __m128i v32 = npyv_##INTRIN##16(v64, __lsx_vshuf_w((__m128i)index2, (__m128i)vector2, v64)); \ + __m128i v16 = npyv_##INTRIN##16(v32, __lsx_vshuf_h((__m128i)index3, (__m128i)vector2, v32)); \ + return (STYPE##16)__lsx_vpickve2gr_h##TFLAG(v16, 0); \ + } \ + NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m128i a) \ + { \ + __m128i val =npyv_##INTRIN##8((__m128i)a, __lsx_vbsrl_v(a, 8)); \ + val = npyv_##INTRIN##8(val, __lsx_vbsrl_v(val, 4)); \ + val = npyv_##INTRIN##8(val, __lsx_vbsrl_v(val, 2)); \ + val = npyv_##INTRIN##8(val, __lsx_vbsrl_v(val, 1)); \ + return (STYPE##8)__lsx_vpickve2gr_b##TFLAG(val, 0); \ + } +NPY_IMPL_LSX_REDUCE_MINMAX(npy_uint, min_u, u) +NPY_IMPL_LSX_REDUCE_MINMAX(npy_int, min_s,) +NPY_IMPL_LSX_REDUCE_MINMAX(npy_uint, max_u, u) +NPY_IMPL_LSX_REDUCE_MINMAX(npy_int, max_s,) +#undef NPY_IMPL_LSX_REDUCE_MINMAX + +// round to nearest integer even +#define npyv_rint_f32 (__m128)__lsx_vfrintrne_s +#define npyv_rint_f64 (__m128d)__lsx_vfrintrne_d +// ceil +#define npyv_ceil_f32 (__m128)__lsx_vfrintrp_s +#define npyv_ceil_f64 (__m128d)__lsx_vfrintrp_d + +// trunc +#define npyv_trunc_f32 (__m128)__lsx_vfrintrz_s +#define npyv_trunc_f64 (__m128d)__lsx_vfrintrz_d + +// floor +#define npyv_floor_f32 (__m128)__lsx_vfrintrm_s +#define npyv_floor_f64 (__m128d)__lsx_vfrintrm_d + +#endif diff --git a/numpy/_core/src/common/simd/lsx/memory.h b/numpy/_core/src/common/simd/lsx/memory.h new file mode 100644 index 000000000000..9c3e6442c6d6 --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/memory.h @@ -0,0 +1,594 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_MEMORY_H +#define _NPY_SIMD_LSX_MEMORY_H + +#include +#include "misc.h" + +/*************************** + * load/store + ***************************/ +#define NPYV_IMPL_LSX_MEM(SFX, CTYPE) \ + NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \ + { return (npyv_##SFX)(__lsx_vld(ptr, 0)); } \ + NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \ + { return (npyv_##SFX)(__lsx_vld(ptr, 0)); } \ + NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \ + { return (npyv_##SFX)(__lsx_vld(ptr, 0)); } \ + NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \ + { return (npyv_##SFX)__lsx_vldrepl_d(ptr, 0); } \ + NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { __lsx_vst(vec, ptr, 0); } \ + NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { __lsx_vst(vec, ptr, 0); } \ + NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { __lsx_vst(vec, ptr, 0); } \ + NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { __lsx_vstelm_d(vec, ptr, 0, 0); } \ + NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { __lsx_vstelm_d(vec, ptr, 0, 1); } + +NPYV_IMPL_LSX_MEM(u8, npy_uint8) +NPYV_IMPL_LSX_MEM(s8, npy_int8) 
+NPYV_IMPL_LSX_MEM(u16, npy_uint16) +NPYV_IMPL_LSX_MEM(s16, npy_int16) +NPYV_IMPL_LSX_MEM(u32, npy_uint32) +NPYV_IMPL_LSX_MEM(s32, npy_int32) +NPYV_IMPL_LSX_MEM(u64, npy_uint64) +NPYV_IMPL_LSX_MEM(s64, npy_int64) +NPYV_IMPL_LSX_MEM(f32, float) +NPYV_IMPL_LSX_MEM(f64, double) + +/*************************** + * Non-contiguous Load + ***************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ + __m128i a = __lsx_vreplgr2vr_w(*ptr); + a = __lsx_vinsgr2vr_w(a, ptr[stride], 1); + a = __lsx_vinsgr2vr_w(a, ptr[stride*2], 2); + a = __lsx_vinsgr2vr_w(a, ptr[stride*3], 3); + return a; +} +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ return npyv_reinterpret_u32_s32(npyv_loadn_s32((const npy_int32*)ptr, stride)); } +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) //ok +{ return npyv_reinterpret_f32_s32(npyv_loadn_s32((const npy_int32*)ptr, stride)); } +//// 64 +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ return (npyv_f64)__lsx_vilvl_d((__m128i)(v2f64)__lsx_vld((ptr + stride), 0), (__m128i)(v2f64)__lsx_vld(ptr, 0)); } +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ return npyv_reinterpret_u64_f64(npyv_loadn_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_reinterpret_s64_f64(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return (npyv_f32)__lsx_vilvl_d(__lsx_vld((const double *)(ptr + stride), 0), __lsx_vld((const double *)ptr, 0)); } +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return npyv_reinterpret_u32_f32(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_reinterpret_s32_f32(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } + +/*************************** + * Non-contiguous Store + ***************************/ +//// 32 +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ + + __lsx_vstelm_w(a, ptr, 0, 0); + __lsx_vstelm_w(a, ptr + stride, 0, 1); + __lsx_vstelm_w(a, ptr + stride*2, 0, 2); + __lsx_vstelm_w(a, ptr + stride*3, 0, 3); +} +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, (npyv_s32)a); } +//// 64 +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ + __lsx_vstelm_d(a, ptr, 0, 0); + __lsx_vstelm_d(a, ptr + stride, 0, 1); +} +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ npyv_storen_f64((double*)ptr, stride, (npyv_f64)a); } +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen_f64((double*)ptr, stride, (npyv_f64)a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void 
npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __lsx_vstelm_d(npyv_reinterpret_u64_u32(a), ptr, 0, 0); + __lsx_vstelm_d(npyv_reinterpret_u64_u32(a), ptr+stride, 0, 1); // zn:TODO +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + const __m128i vfill = npyv_setall_s32(fill); + switch(nlane) { + case 1: + return __lsx_vinsgr2vr_w(vfill, ptr[0], 0); + case 2: + return __lsx_vinsgr2vr_d(vfill, *(unsigned long *)ptr, 0); + case 3: + return __lsx_vinsgr2vr_w(__lsx_vld(ptr, 0), fill, 3); + default: + return npyv_load_s32(ptr); + } +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + __m128i zfill = __lsx_vldi(0); + switch(nlane) { + case 1: + return __lsx_vinsgr2vr_w(zfill, ptr[0], 0); + case 2: + return __lsx_vinsgr2vr_d(zfill, *(unsigned long *)ptr, 0); + case 3: + return __lsx_vinsgr2vr_w(__lsx_vld(ptr, 0), 0, 3); + default: + return npyv_load_s32(ptr); + } +} +//// 64 +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_setall_s64(fill); + return __lsx_vinsgr2vr_d(vfill, ptr[0], 0); + } + return npyv_load_s64(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return __lsx_vinsgr2vr_d(__lsx_vld(ptr, 0), 0, 1); + } + return npyv_load_s64(ptr); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi); + return (npyv_s32)__lsx_vinsgr2vr_d(vfill, *(long *)ptr, 0); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + +/********************************* + * Non-contiguous partial load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + __m128i vfill = npyv_setall_s32(fill); + switch(nlane) { + case 
3: + vfill = __lsx_vinsgr2vr_w(vfill, ptr[stride*2], 2); + case 2: + vfill = __lsx_vinsgr2vr_w(vfill, ptr[stride], 1); + case 1: + vfill = __lsx_vinsgr2vr_w(vfill, ptr[0], 0); + break; + default: + return npyv_loadn_s32(ptr, stride); + } // switch + return vfill; +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 +npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + return __lsx_vinsgr2vr_w(__lsx_vldi(0), ptr[0], 0); + case 2: + { + npyv_s32 a = __lsx_vinsgr2vr_w(__lsx_vldi(0), ptr[0], 0); + return __lsx_vinsgr2vr_w(a, ptr[stride], 1); + } + case 3: + { + npyv_s32 a = __lsx_vinsgr2vr_w(__lsx_vldi(0), ptr[0], 0); + a = __lsx_vinsgr2vr_w(a, ptr[stride], 1); + a = __lsx_vinsgr2vr_w(a, ptr[stride*2], 2); + return a; + } + default: + return npyv_loadn_s32(ptr, stride); + } +} +//// 64 +NPY_FINLINE npyv_s64 +npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_setall_s64(fill); + return __lsx_vinsgr2vr_d(vfill, ptr[0], 0); + } + return npyv_loadn_s64(ptr, stride); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return __lsx_vinsgr2vr_d(__lsx_vldi(0), ptr[0], 0); + } + return npyv_loadn_s64(ptr, stride); +} + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi); + return (npyv_s32)__lsx_vinsgr2vr_d(vfill, *(long *)ptr, 0); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return (npyv_s32)__lsx_vinsgr2vr_d(__lsx_vldi(0), *(long *)ptr, 0); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + +/********************************* + * Partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + __lsx_vstelm_w(a, ptr, 0, 0); + break; + case 2: + __lsx_vstelm_d(a, (long *)ptr, 0, 0); + break; + case 3: + __lsx_vstelm_d(a, (long *)ptr, 0, 0); + __lsx_vstelm_w(a, ptr, 2<<2, 2); + break; + default: + npyv_store_s32(ptr, a); + } +} +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + __lsx_vstelm_d(a, ptr, 0, 0); + return; + } + npyv_store_s64(ptr, a); +} +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + 
npyv_store_s64(ptr, a); +} + +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + __lsx_vstelm_w(a, ptr, 0, 0); + switch(nlane) { + case 1: + return; + case 2: + ptr[stride*1] = __lsx_vpickve2gr_w(a, 1); + return; + case 3: + ptr[stride*1] = __lsx_vpickve2gr_w(a, 1); + ptr[stride*2] = __lsx_vpickve2gr_w(a, 2); + return; + default: + ptr[stride*1] = __lsx_vpickve2gr_w(a, 1); + ptr[stride*2] = __lsx_vpickve2gr_w(a, 2); + ptr[stride*3] = __lsx_vpickve2gr_w(a, 3); + } +} +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + __lsx_vstelm_d(a, ptr, 0, 0); + return; + } + npyv_storen_s64(ptr, stride, a); +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + +/***************************************************************** + * Implement partial load/store for u32/f32/u64/f64... via casting + *****************************************************************/ +#define NPYV_IMPL_LSX_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun; \ + pun.from_##F_SFX = fill; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun; \ + pun.from_##F_SFX = fill; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + 
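+// Descriptive note (editor comment, not part of the upstream patch): the macro above forwards the
+// u32/f32/u64/f64 partial load/store variants to the s32/s64 implementations through
+// npyv_reinterpret_*; the local union converts the fill value bit-exactly without violating
+// strict-aliasing rules.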
+NPYV_IMPL_LSX_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_LSX_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_LSX_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_LSX_REST_PARTIAL_TYPES(f64, s64) + +// 128-bit/64-bit stride +#define NPYV_IMPL_LSX_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_LSX_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_LSX_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_LSX_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_LSX_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_LSX_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##SFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_LSX_MEM_INTERLEAVE(u8, 
uint8_t); +NPYV_IMPL_LSX_MEM_INTERLEAVE(s8, int8_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(u16, uint16_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(s16, int16_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(u32, uint32_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(s32, int32_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(u64, uint64_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(s64, int64_t) +NPYV_IMPL_LSX_MEM_INTERLEAVE(f32, float) +NPYV_IMPL_LSX_MEM_INTERLEAVE(f64, double) + +/********************************* + * Lookup table + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of float32. +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ + const int i0 = __lsx_vpickve2gr_wu(idx, 0); + const int i1 = __lsx_vpickve2gr_wu(idx, 1); + const int i2 = __lsx_vpickve2gr_wu(idx, 2); + const int i3 = __lsx_vpickve2gr_wu(idx, 3); + return npyv_set_f32(table[i0], table[i1], table[i2], table[i3]); +} +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); } +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of float64. +NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ + const int i0 = __lsx_vpickve2gr_wu(idx, 0); + const int i1 = __lsx_vpickve2gr_wu(idx, 2); + return npyv_set_f64(table[i0], table[i1]); +} +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); } +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); } + +#endif // _NPY_SIMD_LSX_MEMORY_H diff --git a/numpy/_core/src/common/simd/lsx/misc.h b/numpy/_core/src/common/simd/lsx/misc.h new file mode 100644 index 000000000000..a65eda3c5573 --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/misc.h @@ -0,0 +1,268 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif +#include +#ifndef _NPY_SIMD_LSX_MISC_H +#define _NPY_SIMD_LSX_MISC_H + +// vector with zero lanes +#define npyv_zero_u8() __lsx_vldi(0) +#define npyv_zero_s8() __lsx_vldi(0) +#define npyv_zero_u16() __lsx_vldi(0) +#define npyv_zero_s16() __lsx_vldi(0) +#define npyv_zero_u32() __lsx_vldi(0) +#define npyv_zero_s32() __lsx_vldi(0) +#define npyv_zero_u64() __lsx_vldi(0) +#define npyv_zero_s64() __lsx_vldi(0) +#define npyv_zero_f32() (__m128)__lsx_vldi(0) +#define npyv_zero_f64() (__m128d)__lsx_vldi(0) + +// vector with a specific value set to all lanes +#define npyv_setall_u8(VAL) __lsx_vreplgr2vr_b((unsigned char)(VAL)) +#define npyv_setall_s8(VAL) __lsx_vreplgr2vr_b((signed char)(VAL)) +#define npyv_setall_u16(VAL) __lsx_vreplgr2vr_h((unsigned short)(VAL)) +#define npyv_setall_s16(VAL) __lsx_vreplgr2vr_h((signed short)(VAL)) +#define npyv_setall_u32(VAL) __lsx_vreplgr2vr_w((unsigned int)(VAL)) +#define npyv_setall_s32(VAL) __lsx_vreplgr2vr_w((signed int)(VAL)) +#define npyv_setall_u64(VAL) __lsx_vreplgr2vr_d((unsigned long long)(VAL)) +#define npyv_setall_s64(VAL) __lsx_vreplgr2vr_d((long long)(VAL)) +#define npyv_setall_f32(VAL) (__m128)(v4f32){VAL, VAL, VAL, VAL} +#define npyv_setall_f64(VAL) (__m128d)(v2f64){VAL, VAL} + +/** + * vector with specific values set to each lane and + * set a specific value to all remained lanes + * + * Args that generated by NPYV__SET_FILL_* not going 
to expand if + * _mm_setr_* are defined as macros. + */ +NPY_FINLINE __m128i npyv__set_u8( + npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3, npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, + npy_uint8 i8, npy_uint8 i9, npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15) +{ + v16u8 vec = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_s8( + npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3, npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, + npy_int8 i8, npy_int8 i9, npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15) +{ + v16i8 vec = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3, npy_uint16 i4, npy_uint16 i5, + npy_uint16 i6, npy_uint16 i7) +{ + v8u16 vec = {i0, i1, i2, i3, i4, i5, i6, i7}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3, npy_int16 i4, npy_int16 i5, + npy_int16 i6, npy_int16 i7) +{ + v8i16 vec = {i0, i1, i2, i3, i4, i5, i6, i7}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3) +{ + v4u32 vec = {i0, i1, i2, i3}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3) +{ + v4i32 vec = {i0, i1, i2, i3}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_u64(npy_uint64 i0, npy_uint64 i1) +{ + v2u64 vec = {i0, i1}; + return (__m128i)vec; +} +NPY_FINLINE __m128i npyv__set_s64(npy_int64 i0, npy_int64 i1) +{ + v2i64 vec = {i0, i1}; + return (__m128i)vec; +} +NPY_FINLINE __m128 npyv__set_f32(float i0, float i1, float i2, float i3) +{ + __m128 vec = {i0, i1, i2, i3}; + return vec; +} +NPY_FINLINE __m128d npyv__set_f64(double i0, double i1) +{ + __m128d vec = {i0, i1}; + return vec; +} +#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) 
npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +NPY_FINLINE __m128i npyv_select_u8(__m128i mask, __m128i a, __m128i b) +{ + return __lsx_vbitsel_v(b, a, mask); +} + +NPY_FINLINE __m128 npyv_select_f32(__m128i mask, __m128 a, __m128 b) +{ + return (__m128)__lsx_vbitsel_v((__m128i)b, (__m128i)a, mask); +} +NPY_FINLINE __m128d npyv_select_f64(__m128i mask, __m128d a, __m128d b) +{ + return (__m128d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, mask); +} + +#define npyv_select_s8 npyv_select_u8 +#define npyv_select_u16 npyv_select_u8 +#define npyv_select_s16 npyv_select_u8 +#define npyv_select_u32 npyv_select_u8 +#define npyv_select_s32 npyv_select_u8 +#define npyv_select_u64 npyv_select_u8 +#define npyv_select_s64 npyv_select_u8 + +// extract the first vector's lane +#define npyv_extract0_u8(A) ((npy_uint8)__lsx_vpickve2gr_bu(A, 0)) +#define npyv_extract0_s8(A) ((npy_int8)__lsx_vpickve2gr_b(A, 0)) +#define npyv_extract0_u16(A) ((npy_uint16)__lsx_vpickve2gr_hu(A, 0)) +#define npyv_extract0_s16(A) ((npy_int16)__lsx_vpickve2gr_h(A, 0)) +#define npyv_extract0_u32(A) ((npy_uint32)__lsx_vpickve2gr_wu(A, 0)) +#define npyv_extract0_s32(A) ((npy_int32)__lsx_vpickve2gr_w(A, 0)) +#define npyv_extract0_u64(A) ((npy_uint64)__lsx_vpickve2gr_du(A, 0)) +#define npyv_extract0_s64(A) ((npy_int64)__lsx_vpickve2gr_d(A, 0)) +#define npyv_extract0_f32(A) A[0] +#define npyv_extract0_f64(A) A[0] + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +#define npyv_reinterpret_u8_s8(X) X +#define npyv_reinterpret_u8_u16(X) X +#define npyv_reinterpret_u8_s16(X) X +#define npyv_reinterpret_u8_u32(X) X +#define npyv_reinterpret_u8_s32(X) X +#define npyv_reinterpret_u8_u64(X) X +#define npyv_reinterpret_u8_s64(X) X +#define npyv_reinterpret_u8_f32(X) (__m128i)X +#define npyv_reinterpret_u8_f64(X) (__m128i)X + +#define npyv_reinterpret_s8_s8(X) X +#define npyv_reinterpret_s8_u8(X) X +#define npyv_reinterpret_s8_u16(X) X +#define npyv_reinterpret_s8_s16(X) X +#define npyv_reinterpret_s8_u32(X) X +#define npyv_reinterpret_s8_s32(X) X +#define npyv_reinterpret_s8_u64(X) X +#define npyv_reinterpret_s8_s64(X) X +#define npyv_reinterpret_s8_f32(X) (__m128i)X +#define npyv_reinterpret_s8_f64(X) (__m128i)X + +#define npyv_reinterpret_u16_u16(X) X +#define npyv_reinterpret_u16_u8(X) X +#define npyv_reinterpret_u16_s8(X) X +#define npyv_reinterpret_u16_s16(X) X +#define npyv_reinterpret_u16_u32(X) X +#define npyv_reinterpret_u16_s32(X) X +#define npyv_reinterpret_u16_u64(X) X +#define npyv_reinterpret_u16_s64(X) X +#define npyv_reinterpret_u16_f32(X) (__m128i)X +#define npyv_reinterpret_u16_f64(X) (__m128i)X + +#define npyv_reinterpret_s16_s16(X) X +#define npyv_reinterpret_s16_u8(X) X +#define npyv_reinterpret_s16_s8(X) X +#define npyv_reinterpret_s16_u16(X) X +#define npyv_reinterpret_s16_u32(X) X +#define npyv_reinterpret_s16_s32(X) X +#define npyv_reinterpret_s16_u64(X) X +#define npyv_reinterpret_s16_s64(X) X +#define npyv_reinterpret_s16_f32(X) (__m128i)X +#define npyv_reinterpret_s16_f64(X) (__m128i)X + +#define npyv_reinterpret_u32_u32(X) X +#define npyv_reinterpret_u32_u8(X) X +#define npyv_reinterpret_u32_s8(X) X +#define npyv_reinterpret_u32_u16(X) X +#define npyv_reinterpret_u32_s16(X) X +#define npyv_reinterpret_u32_s32(X) X +#define npyv_reinterpret_u32_u64(X) X +#define npyv_reinterpret_u32_s64(X) X +#define npyv_reinterpret_u32_f32(X) (__m128i)X +#define npyv_reinterpret_u32_f64(X) (__m128i)X + +#define npyv_reinterpret_s32_s32(X) X +#define 
npyv_reinterpret_s32_u8(X) X +#define npyv_reinterpret_s32_s8(X) X +#define npyv_reinterpret_s32_u16(X) X +#define npyv_reinterpret_s32_s16(X) X +#define npyv_reinterpret_s32_u32(X) X +#define npyv_reinterpret_s32_u64(X) X +#define npyv_reinterpret_s32_s64(X) X +#define npyv_reinterpret_s32_f32(X) (__m128i)X +#define npyv_reinterpret_s32_f64(X) (__m128i)X + +#define npyv_reinterpret_u64_u64(X) X +#define npyv_reinterpret_u64_u8(X) X +#define npyv_reinterpret_u64_s8(X) X +#define npyv_reinterpret_u64_u16(X) X +#define npyv_reinterpret_u64_s16(X) X +#define npyv_reinterpret_u64_u32(X) X +#define npyv_reinterpret_u64_s32(X) X +#define npyv_reinterpret_u64_s64(X) X +#define npyv_reinterpret_u64_f32(X) (__m128i)X +#define npyv_reinterpret_u64_f64(X) (__m128i)X + +#define npyv_reinterpret_s64_s64(X) X +#define npyv_reinterpret_s64_u8(X) X +#define npyv_reinterpret_s64_s8(X) X +#define npyv_reinterpret_s64_u16(X) X +#define npyv_reinterpret_s64_s16(X) X +#define npyv_reinterpret_s64_u32(X) X +#define npyv_reinterpret_s64_s32(X) X +#define npyv_reinterpret_s64_u64(X) X +#define npyv_reinterpret_s64_f32(X) (__m128i)X +#define npyv_reinterpret_s64_f64(X) (__m128i)X + +#define npyv_reinterpret_f32_f32(X) X +#define npyv_reinterpret_f32_u8(X) (__m128)X +#define npyv_reinterpret_f32_s8(X) (__m128)X +#define npyv_reinterpret_f32_u16(X) (__m128)X +#define npyv_reinterpret_f32_s16(X) (__m128)X +#define npyv_reinterpret_f32_u32(X) (__m128)X +#define npyv_reinterpret_f32_s32(X) (__m128)X +#define npyv_reinterpret_f32_u64(X) (__m128)X +#define npyv_reinterpret_f32_s64(X) (__m128)X +#define npyv_reinterpret_f32_f64(X) (__m128)X + +#define npyv_reinterpret_f64_f64(X) X +#define npyv_reinterpret_f64_u8(X) (__m128d)X +#define npyv_reinterpret_f64_s8(X) (__m128d)X +#define npyv_reinterpret_f64_u16(X) (__m128d)X +#define npyv_reinterpret_f64_s16(X) (__m128d)X +#define npyv_reinterpret_f64_u32(X) (__m128d)X +#define npyv_reinterpret_f64_s32(X) (__m128d)X +#define npyv_reinterpret_f64_u64(X) (__m128d)X +#define npyv_reinterpret_f64_s64(X) (__m128d)X +#define npyv_reinterpret_f64_f32(X) (__m128d)X + +// Only required by AVX2/AVX512 +#define npyv_cleanup() ((void)0) + +#endif diff --git a/numpy/_core/src/common/simd/lsx/operators.h b/numpy/_core/src/common/simd/lsx/operators.h new file mode 100644 index 000000000000..f2af02d52632 --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/operators.h @@ -0,0 +1,263 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_OPERATORS_H +#define _NPY_SIMD_LSX_OPERATORS_H + +/*************************** + * Shifting + ***************************/ + +// left +#define npyv_shl_u16(A, C) __lsx_vsll_h(A, npyv_setall_s16(C)) +#define npyv_shl_s16(A, C) __lsx_vsll_h(A, npyv_setall_s16(C)) +#define npyv_shl_u32(A, C) __lsx_vsll_w(A, npyv_setall_s32(C)) +#define npyv_shl_s32(A, C) __lsx_vsll_w(A, npyv_setall_s32(C)) +#define npyv_shl_u64(A, C) __lsx_vsll_d(A, npyv_setall_s64(C)) +#define npyv_shl_s64(A, C) __lsx_vsll_d(A, npyv_setall_s64(C)) + +// left by an immediate constant +#define npyv_shli_u16 __lsx_vslli_h +#define npyv_shli_s16 __lsx_vslli_h +#define npyv_shli_u32 __lsx_vslli_w +#define npyv_shli_s32 __lsx_vslli_w +#define npyv_shli_u64 __lsx_vslli_d +#define npyv_shli_s64 __lsx_vslli_d + +// right +#define npyv_shr_u16(A, C) __lsx_vsrl_h(A, npyv_setall_u16(C)) +#define npyv_shr_s16(A, C) __lsx_vsra_h(A, npyv_setall_u16(C)) +#define npyv_shr_u32(A, C) __lsx_vsrl_w(A, npyv_setall_u32(C)) +#define npyv_shr_s32(A, C) __lsx_vsra_w(A, npyv_setall_u32(C)) 
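
Reviewer note, not part of the patch: the shift wrappers above broadcast the scalar count with npyv_setall_* and then apply the element-wise shift intrinsics, so unsigned lanes get a logical right shift (__lsx_vsrl_*) while signed lanes get an arithmetic one (__lsx_vsra_*); the 64-bit variants continue just below. A minimal scalar sketch of the intended per-lane behaviour for the 16-bit case, assuming the usual arithmetic behaviour of >> on signed values:

/* Illustrative only; function names and lane count are assumptions of this
 * sketch, nothing here exists in the NumPy sources. */
#include <stdint.h>
#include <stdio.h>

#define NLANES_16 8   /* a 128-bit LSX register holds 8 x 16-bit lanes */

static void ref_shr_u16(uint16_t r[NLANES_16], const uint16_t a[NLANES_16], int c)
{
    for (int i = 0; i < NLANES_16; ++i)
        r[i] = (uint16_t)(a[i] >> c);   /* logical shift, as __lsx_vsrl_h */
}
static void ref_shr_s16(int16_t r[NLANES_16], const int16_t a[NLANES_16], int c)
{
    for (int i = 0; i < NLANES_16; ++i)
        r[i] = (int16_t)(a[i] >> c);    /* arithmetic shift, as __lsx_vsra_h */
}

int main(void)
{
    const uint16_t u[NLANES_16] = {0x8000, 2, 4, 8, 16, 32, 64, 128};
    const int16_t  s[NLANES_16] = {-32768, 2, 4, 8, 16, 32, 64, 128};
    uint16_t ru[NLANES_16];
    int16_t  rs[NLANES_16];
    ref_shr_u16(ru, u, 1);   /* 0x8000 -> 0x4000 */
    ref_shr_s16(rs, s, 1);   /* -32768 -> -16384, sign preserved */
    printf("u: %#x  s: %d\n", (unsigned)ru[0], rs[0]);
    return 0;
}
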
+#define npyv_shr_u64(A, C) __lsx_vsrl_d(A, npyv_setall_u64(C)) +#define npyv_shr_s64(A, C) __lsx_vsra_d(A, npyv_setall_u64(C)) + +// Right by an immediate constant +#define npyv_shri_u16 __lsx_vsrli_h +#define npyv_shri_s16 __lsx_vsrai_h +#define npyv_shri_u32 __lsx_vsrli_w +#define npyv_shri_s32 __lsx_vsrai_w +#define npyv_shri_u64 __lsx_vsrli_d +#define npyv_shri_s64 __lsx_vsrai_d + +/*************************** + * Logical + ***************************/ + +// AND +#define npyv_and_u8 __lsx_vand_v +#define npyv_and_s8 __lsx_vand_v +#define npyv_and_u16 __lsx_vand_v +#define npyv_and_s16 __lsx_vand_v +#define npyv_and_u32 __lsx_vand_v +#define npyv_and_s32 __lsx_vand_v +#define npyv_and_u64 __lsx_vand_v +#define npyv_and_s64 __lsx_vand_v +#define npyv_and_f32(A, B) \ + (__m128)__lsx_vand_v((__m128i)A, (__m128i)B) +#define npyv_and_f64(A, B) \ + (__m128d)__lsx_vand_v((__m128i)A, (__m128i)B) +#define npyv_and_b8 __lsx_vand_v +#define npyv_and_b16 __lsx_vand_v +#define npyv_and_b32 __lsx_vand_v +#define npyv_and_b64 __lsx_vand_v + +// OR +#define npyv_or_u8 __lsx_vor_v +#define npyv_or_s8 __lsx_vor_v +#define npyv_or_u16 __lsx_vor_v +#define npyv_or_s16 __lsx_vor_v +#define npyv_or_u32 __lsx_vor_v +#define npyv_or_s32 __lsx_vor_v +#define npyv_or_u64 __lsx_vor_v +#define npyv_or_s64 __lsx_vor_v +#define npyv_or_f32(A, B) \ + (__m128)__lsx_vor_v((__m128i)A, (__m128i)B) +#define npyv_or_f64(A, B) \ + (__m128d)__lsx_vor_v((__m128i)A, (__m128i)B) +#define npyv_or_b8 __lsx_vor_v +#define npyv_or_b16 __lsx_vor_v +#define npyv_or_b32 __lsx_vor_v +#define npyv_or_b64 __lsx_vor_v + +// XOR +#define npyv_xor_u8 __lsx_vxor_v +#define npyv_xor_s8 __lsx_vxor_v +#define npyv_xor_u16 __lsx_vxor_v +#define npyv_xor_s16 __lsx_vxor_v +#define npyv_xor_u32 __lsx_vxor_v +#define npyv_xor_s32 __lsx_vxor_v +#define npyv_xor_u64 __lsx_vxor_v +#define npyv_xor_s64 __lsx_vxor_v +#define npyv_xor_f32(A, B) \ + (__m128)__lsx_vxor_v((__m128i)A, (__m128i)B) +#define npyv_xor_f64(A, B) \ + (__m128d)__lsx_vxor_v((__m128i)A, (__m128i)B) +#define npyv_xor_b8 __lsx_vxor_v +#define npyv_xor_b16 __lsx_vxor_v +#define npyv_xor_b32 __lsx_vxor_v +#define npyv_xor_b64 __lsx_vxor_v + +// NOT +#define npyv_not_u8(A) __lsx_vxori_b((__m128i)A, 0xff) +#define npyv_not_s8 npyv_not_u8 +#define npyv_not_u16 npyv_not_u8 +#define npyv_not_s16 npyv_not_u8 +#define npyv_not_u32 npyv_not_u8 +#define npyv_not_s32 npyv_not_u8 +#define npyv_not_u64 npyv_not_u8 +#define npyv_not_s64 npyv_not_u8 +#define npyv_not_f32 (__m128)npyv_not_u8 +#define npyv_not_f64 (__m128d)npyv_not_u8 +#define npyv_not_b8 npyv_not_u8 +#define npyv_not_b16 npyv_not_u8 +#define npyv_not_b32 npyv_not_u8 +#define npyv_not_b64 npyv_not_u8 + +// ANDC, ORC and XNOR +#define npyv_andc_u8(A, B) __lsx_vandn_v(B, A) +#define npyv_andc_b8(A, B) __lsx_vandn_v(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 __lsx_vseq_b + +/*************************** + * Comparison + ***************************/ + +// Int Equal +#define npyv_cmpeq_u8 __lsx_vseq_b +#define npyv_cmpeq_s8 __lsx_vseq_b +#define npyv_cmpeq_u16 __lsx_vseq_h +#define npyv_cmpeq_s16 __lsx_vseq_h +#define npyv_cmpeq_u32 __lsx_vseq_w +#define npyv_cmpeq_s32 __lsx_vseq_w +#define npyv_cmpeq_u64 __lsx_vseq_d +#define npyv_cmpeq_s64 __lsx_vseq_d + +// Int Not Equal +#define npyv_cmpneq_u8(A, B) npyv_not_u8(npyv_cmpeq_u8(A, B)) +#define npyv_cmpneq_u16(A, B) npyv_not_u16(npyv_cmpeq_u16(A, B)) +#define npyv_cmpneq_u32(A, B) npyv_not_u32(npyv_cmpeq_u32(A, B)) +#define npyv_cmpneq_u64(A, B) 
npyv_not_u64(npyv_cmpeq_u64(A, B)) +#define npyv_cmpneq_s8 npyv_cmpneq_u8 +#define npyv_cmpneq_s16 npyv_cmpneq_u16 +#define npyv_cmpneq_s32 npyv_cmpneq_u32 +#define npyv_cmpneq_s64 npyv_cmpneq_u64 + +// signed greater than +#define npyv_cmpgt_s8(A, B) __lsx_vslt_b(B, A) +#define npyv_cmpgt_s16(A, B) __lsx_vslt_h(B, A) +#define npyv_cmpgt_s32(A, B) __lsx_vslt_w(B, A) +#define npyv_cmpgt_s64(A, B) __lsx_vslt_d(B, A) + +// signed greater than or equal +#define npyv_cmpge_s8(A, B) __lsx_vsle_b(B, A) +#define npyv_cmpge_s16(A, B) __lsx_vsle_h(B, A) +#define npyv_cmpge_s32(A, B) __lsx_vsle_w(B, A) +#define npyv_cmpge_s64(A, B) __lsx_vsle_d(B, A) + +// unsigned greater than +#define npyv_cmpgt_u8(A, B) __lsx_vslt_bu(B, A) +#define npyv_cmpgt_u16(A, B) __lsx_vslt_hu(B, A) +#define npyv_cmpgt_u32(A, B) __lsx_vslt_wu(B, A) +#define npyv_cmpgt_u64(A, B) __lsx_vslt_du(B, A) + +// unsigned greater than or equal +#define npyv_cmpge_u8(A, B) __lsx_vsle_bu(B, A) +#define npyv_cmpge_u16(A, B) __lsx_vsle_hu(B, A) +#define npyv_cmpge_u32(A, B) __lsx_vsle_wu(B, A) +#define npyv_cmpge_u64(A, B) __lsx_vsle_du(B, A) + +// less than +#define npyv_cmplt_u8 __lsx_vslt_bu +#define npyv_cmplt_s8 __lsx_vslt_b +#define npyv_cmplt_u16 __lsx_vslt_hu +#define npyv_cmplt_s16 __lsx_vslt_h +#define npyv_cmplt_u32 __lsx_vslt_wu +#define npyv_cmplt_s32 __lsx_vslt_w +#define npyv_cmplt_u64 __lsx_vslt_du +#define npyv_cmplt_s64 __lsx_vslt_d + +// less than or equal +#define npyv_cmple_u8 __lsx_vsle_bu +#define npyv_cmple_s8 __lsx_vsle_b +#define npyv_cmple_u16 __lsx_vsle_hu +#define npyv_cmple_s16 __lsx_vsle_h +#define npyv_cmple_u32 __lsx_vsle_wu +#define npyv_cmple_s32 __lsx_vsle_w +#define npyv_cmple_u64 __lsx_vsle_du +#define npyv_cmple_s64 __lsx_vsle_d + +// precision comparison +#define npyv_cmpeq_f32 __lsx_vfcmp_ceq_s +#define npyv_cmpeq_f64 __lsx_vfcmp_ceq_d +#define npyv_cmpneq_f32 __lsx_vfcmp_cune_s +#define npyv_cmpneq_f64 __lsx_vfcmp_cune_d +#define npyv_cmplt_f32 __lsx_vfcmp_clt_s +#define npyv_cmplt_f64 __lsx_vfcmp_clt_d +#define npyv_cmple_f32 __lsx_vfcmp_cle_s +#define npyv_cmple_f64 __lsx_vfcmp_cle_d +#define npyv_cmpgt_f32(A, B) npyv_cmplt_f32(B, A) +#define npyv_cmpgt_f64(A, B) npyv_cmplt_f64(B, A) +#define npyv_cmpge_f32(A, B) npyv_cmple_f32(B, A) +#define npyv_cmpge_f64(A, B) npyv_cmple_f64(B, A) + +// check special cases +NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) +{ return __lsx_vfcmp_cor_s(a, a); } //!nan,return:ffffffff +NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) +{ return __lsx_vfcmp_cor_d(a, a); } + +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +#define NPYV_IMPL_LSX_ANYALL(SFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { return __lsx_vmsknz_b((__m128i)a)[0] != 0; } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return __lsx_vmsknz_b((__m128i)a)[0] == 0xffff; } +NPYV_IMPL_LSX_ANYALL(b8) +NPYV_IMPL_LSX_ANYALL(b16) +NPYV_IMPL_LSX_ANYALL(b32) +NPYV_IMPL_LSX_ANYALL(b64) +#undef NPYV_IMPL_LSX_ANYALL + +#define NPYV_IMPL_LSX_ANYALL(SFX, TSFX, MASK) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { \ + return __lsx_vmsknz_b(a)[0] != 0; \ + } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { \ + return __lsx_vmsknz_b( \ + __lsx_vseq_##TSFX(a, npyv_zero_##SFX()) \ + )[0] == 0; \ + } +NPYV_IMPL_LSX_ANYALL(u8, b, 0xffff) +NPYV_IMPL_LSX_ANYALL(s8, b, 0xffff) +NPYV_IMPL_LSX_ANYALL(u16, h, 0xffff) +NPYV_IMPL_LSX_ANYALL(s16, h, 0xffff) +NPYV_IMPL_LSX_ANYALL(u32, 
w, 0xffff) +NPYV_IMPL_LSX_ANYALL(s32, w, 0xffff) +NPYV_IMPL_LSX_ANYALL(u64, d, 0xffff) +NPYV_IMPL_LSX_ANYALL(s64, d, 0xffff) +#undef NPYV_IMPL_LSX_ANYALL + +NPY_FINLINE bool npyv_any_f32(npyv_f32 a) +{ + return __lsx_vmsknz_b(__lsx_vfcmp_ceq_s(a, npyv_zero_f32()))[0] != 0xffff; +} +NPY_FINLINE bool npyv_all_f32(npyv_f32 a) +{ + return __lsx_vmsknz_b(__lsx_vfcmp_ceq_s(a, npyv_zero_f32()))[0] == 0; +} +NPY_FINLINE bool npyv_any_f64(npyv_f64 a) +{ + return __lsx_vmsknz_b(__lsx_vfcmp_ceq_d(a, npyv_zero_f64()))[0] != 0xffff; +} +NPY_FINLINE bool npyv_all_f64(npyv_f64 a) +{ + return __lsx_vmsknz_b(__lsx_vfcmp_ceq_d(a, npyv_zero_f64()))[0] == 0; +} +#endif // _NPY_SIMD_LSX_OPERATORS_H diff --git a/numpy/_core/src/common/simd/lsx/reorder.h b/numpy/_core/src/common/simd/lsx/reorder.h new file mode 100644 index 000000000000..0c8f07a8c207 --- /dev/null +++ b/numpy/_core/src/common/simd/lsx/reorder.h @@ -0,0 +1,186 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_LSX_REORDER_H +#define _NPY_SIMD_LSX_REORDER_H + +// combine lower part of two vectors +#define npyv_combinel_u8(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_s8(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_u16(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_s16(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_u32(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_s32(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_u64(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_s64(A, B) __lsx_vilvl_d(B, A) +#define npyv_combinel_f32(A, B) (__m128)(__lsx_vilvl_d((__m128i)B, (__m128i)A)) +#define npyv_combinel_f64(A, B) (__m128d)(__lsx_vilvl_d((__m128i)B, (__m128i)A)) + +// combine higher part of two vectors +#define npyv_combineh_u8(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_s8(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_u16(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_s16(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_u32(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_s32(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_u64(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_s64(A, B) __lsx_vilvh_d(B, A) +#define npyv_combineh_f32(A, B) (__m128)(__lsx_vilvh_d((__m128i)B, (__m128i)A)) +#define npyv_combineh_f64(A, B) (__m128d)(__lsx_vilvh_d((__m128i)B, (__m128i)A)) + +// combine two vectors from lower and higher parts of two other vectors +NPY_FINLINE npyv_s64x2 npyv__combine(__m128i a, __m128i b) +{ + npyv_s64x2 r; + r.val[0] = npyv_combinel_u8(a, b); + r.val[1] = npyv_combineh_u8(a, b); + return r; +} +NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m128 a, __m128 b) +{ + npyv_f32x2 r; + r.val[0] = npyv_combinel_f32(a, b); + r.val[1] = npyv_combineh_f32(a, b); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m128d a, __m128d b) +{ + npyv_f64x2 r; + r.val[0] = npyv_combinel_f64(a, b); + r.val[1] = npyv_combineh_f64(a, b); + return r; +} +#define npyv_combine_u8 npyv__combine +#define npyv_combine_s8 npyv__combine +#define npyv_combine_u16 npyv__combine +#define npyv_combine_s16 npyv__combine +#define npyv_combine_u32 npyv__combine +#define npyv_combine_s32 npyv__combine +#define npyv_combine_u64 npyv__combine +#define npyv_combine_s64 npyv__combine + +// interleave two vectors +#define NPYV_IMPL_LSX_ZIP(T_VEC, SFX, INTR_SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = __lsx_vilvl_##INTR_SFX(b, a); \ + r.val[1] = __lsx_vilvh_##INTR_SFX(b, a); \ + return r; \ + } + +NPYV_IMPL_LSX_ZIP(npyv_u8, u8, b) +NPYV_IMPL_LSX_ZIP(npyv_s8, s8, b) 
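
Reviewer note, not part of the patch: npyv_zip_* interleaves the low and high halves of its two arguments (__lsx_vilvl_*/__lsx_vilvh_*) and npyv_unzip_* gathers the even and odd lanes back out (__lsx_vpickev_*/__lsx_vpickod_*), so unzip undoes zip; the remaining lane-width instantiations follow below. A scalar model of the u8 case under those assumptions:

/* Illustrative only; names and lane ordering are a sketch of the expected
 * semantics, not NumPy code. */
#include <stdint.h>
#include <string.h>
#include <assert.h>

#define NLANES_8 16   /* 128-bit register: 16 x 8-bit lanes */

static void ref_zip_u8(uint8_t lo[NLANES_8], uint8_t hi[NLANES_8],
                       const uint8_t a[NLANES_8], const uint8_t b[NLANES_8])
{
    for (int i = 0; i < NLANES_8 / 2; ++i) {
        lo[2*i]     = a[i];                  /* low halves:  __lsx_vilvl_b */
        lo[2*i + 1] = b[i];
        hi[2*i]     = a[i + NLANES_8/2];     /* high halves: __lsx_vilvh_b */
        hi[2*i + 1] = b[i + NLANES_8/2];
    }
}
static void ref_unzip_u8(uint8_t ev[NLANES_8], uint8_t od[NLANES_8],
                         const uint8_t lo[NLANES_8], const uint8_t hi[NLANES_8])
{
    for (int i = 0; i < NLANES_8 / 2; ++i) {
        ev[i]              = lo[2*i];        /* even lanes: __lsx_vpickev_b */
        ev[i + NLANES_8/2] = hi[2*i];
        od[i]              = lo[2*i + 1];    /* odd lanes:  __lsx_vpickod_b */
        od[i + NLANES_8/2] = hi[2*i + 1];
    }
}

int main(void)
{
    uint8_t a[NLANES_8], b[NLANES_8], lo[NLANES_8], hi[NLANES_8], ev[NLANES_8], od[NLANES_8];
    for (int i = 0; i < NLANES_8; ++i) { a[i] = (uint8_t)i; b[i] = (uint8_t)(i + 100); }
    ref_zip_u8(lo, hi, a, b);
    ref_unzip_u8(ev, od, lo, hi);
    assert(memcmp(ev, a, sizeof a) == 0);    /* even lanes recover a */
    assert(memcmp(od, b, sizeof b) == 0);    /* odd lanes recover b  */
    return 0;
}
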
+NPYV_IMPL_LSX_ZIP(npyv_u16, u16, h) +NPYV_IMPL_LSX_ZIP(npyv_s16, s16, h) +NPYV_IMPL_LSX_ZIP(npyv_u32, u32, w) +NPYV_IMPL_LSX_ZIP(npyv_s32, s32, w) +NPYV_IMPL_LSX_ZIP(npyv_u64, u64, d) +NPYV_IMPL_LSX_ZIP(npyv_s64, s64, d) + +NPY_FINLINE npyv_f32x2 npyv_zip_f32(__m128 a, __m128 b) +{ + npyv_f32x2 r; + r.val[0] = (__m128)(__lsx_vilvl_w((__m128i)b, (__m128i)a)); + r.val[1] = (__m128)(__lsx_vilvh_w((__m128i)b, (__m128i)a)); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m128d a, __m128d b) +{ + npyv_f64x2 r; + r.val[0] = (__m128d)(__lsx_vilvl_d((__m128i)b, (__m128i)a)); + r.val[1] = (__m128d)(__lsx_vilvh_d((__m128i)b, (__m128i)a)); + return r; +} + +// deinterleave two vectors +#define NPYV_IMPL_LSX_UNZIP(T_VEC, SFX, INTR_SFX) \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = __lsx_vpickev_##INTR_SFX(b, a); \ + r.val[1] = __lsx_vpickod_##INTR_SFX(b, a); \ + return r; \ + } + +NPYV_IMPL_LSX_UNZIP(npyv_u8, u8, b) +NPYV_IMPL_LSX_UNZIP(npyv_s8, s8, b) +NPYV_IMPL_LSX_UNZIP(npyv_u16, u16, h) +NPYV_IMPL_LSX_UNZIP(npyv_s16, s16, h) +NPYV_IMPL_LSX_UNZIP(npyv_u32, u32, w) +NPYV_IMPL_LSX_UNZIP(npyv_s32, s32, w) +NPYV_IMPL_LSX_UNZIP(npyv_u64, u64, d) +NPYV_IMPL_LSX_UNZIP(npyv_s64, s64, d) + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(__m128 a, __m128 b) +{ + npyv_f32x2 r; + r.val[0] = (__m128)(__lsx_vpickev_w((__m128i)b, (__m128i)a)); + r.val[1] = (__m128)(__lsx_vpickod_w((__m128i)b, (__m128i)a)); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(__m128d a, __m128d b) +{ + npyv_f64x2 r; + r.val[0] = (__m128d)(__lsx_vpickev_d((__m128i)b, (__m128i)a)); + r.val[1] = (__m128d)(__lsx_vpickod_d((__m128i)b, (__m128i)a)); + return r; +} + +// Reverse elements of each 64-bit lane +NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) +{ + v16u8 idx = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + return __lsx_vshuf_b(a, a, (__m128i)idx); +} + +#define npyv_rev64_s8 npyv_rev64_u8 + +NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) +{ + v8u16 idx = {3, 2, 1, 0, 7, 6, 5, 4}; + return __lsx_vshuf_h((__m128i)idx, a, a); +} + +#define npyv_rev64_s16 npyv_rev64_u16 + +NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a) +{ + v4u32 idx = {1, 0, 3, 2}; + return __lsx_vshuf_w((__m128i)idx, a, a); +} +#define npyv_rev64_s32 npyv_rev64_u32 + +NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) +{ + v4i32 idx = {1, 0, 3, 2}; + return (v4f32)__lsx_vshuf_w((__m128i)idx, (__m128i)a, (__m128i)a); +} + +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
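
Reviewer note, not part of the patch: the npyv_permi128_* macros defined immediately below select lanes by compile-time immediates, extracting each requested lane with __lsx_vpickve2gr_* and repacking it with npyv_set_*. A scalar sketch of the intended selection semantics for the u32 case:

/* Illustrative only; assumes result lane i takes source lane Ei. */
#include <stdint.h>
#include <assert.h>

static void ref_permi128_u32(uint32_t r[4], const uint32_t a[4],
                             int e0, int e1, int e2, int e3)
{
    r[0] = a[e0]; r[1] = a[e1]; r[2] = a[e2]; r[3] = a[e3];
}

int main(void)
{
    const uint32_t a[4] = {10, 11, 12, 13};
    uint32_t r[4];
    ref_permi128_u32(r, a, 3, 2, 1, 0);   /* reverse the four 32-bit lanes */
    assert(r[0] == 13 && r[3] == 10);
    return 0;
}
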
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+    npyv_set_u32( \
+        __lsx_vpickve2gr_wu(A, E0), __lsx_vpickve2gr_wu(A, E1), \
+        __lsx_vpickve2gr_wu(A, E2), __lsx_vpickve2gr_wu(A, E3) \
+    )
+#define npyv_permi128_s32(A, E0, E1, E2, E3) \
+    npyv_set_s32( \
+        __lsx_vpickve2gr_w(A, E0), __lsx_vpickve2gr_w(A, E1), \
+        __lsx_vpickve2gr_w(A, E2), __lsx_vpickve2gr_w(A, E3) \
+    )
+#define npyv_permi128_u64(A, E0, E1) \
+    npyv_set_u64( \
+        __lsx_vpickve2gr_du(A, E0), __lsx_vpickve2gr_du(A, E1) \
+    )
+#define npyv_permi128_s64(A, E0, E1) \
+    npyv_set_s64( \
+        __lsx_vpickve2gr_d(A, E0), __lsx_vpickve2gr_d(A, E1) \
+    )
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+    (__m128)__lsx_vshuf_w((__m128i)(v4u32){E0, E1, E2, E3}, (__m128i)A, (__m128i)A)
+
+#define npyv_permi128_f64(A, E0, E1) \
+    (__m128d)__lsx_vshuf_d((__m128i){E0, E1}, (__m128i)A, (__m128i)A)
+#endif // _NPY_SIMD_LSX_REORDER_H
diff --git a/numpy/_core/src/common/simd/simd.h b/numpy/_core/src/common/simd/simd.h
index 706229af0a62..fe4ca4da92f5 100644
--- a/numpy/_core/src/common/simd/simd.h
+++ b/numpy/_core/src/common/simd/simd.h
@@ -87,6 +87,10 @@ typedef double npyv_lanetype_f64;
     #include "neon/neon.h"
 #endif
 
+#ifdef NPY_HAVE_LSX
+    #include "lsx/lsx.h"
+#endif
+
 #ifndef NPY_SIMD
     /// SIMD width in bits or 0 if there's no SIMD extension available.
     #define NPY_SIMD 0
diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src
index 16cb6ecb21ac..d330c21695d5 100644
--- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src
@@ -36,7 +36,7 @@
 *  q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
 ********************************************************************************/
 
-#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)
     // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
     // may not perform well on both neon and up to VSX3 compared to scalar
     // division.
@@ -452,7 +452,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
 * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
 * Power10(VSX4) is an exception here since it has native support for integer vector division.
 */
-#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX))
 #undef TO_SIMD_SFX
 #endif
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
diff --git a/numpy/_core/tests/test_cpu_dispatcher.py b/numpy/_core/tests/test_cpu_dispatcher.py
index b86f28a32121..c52cd418a08b 100644
--- a/numpy/_core/tests/test_cpu_dispatcher.py
+++ b/numpy/_core/tests/test_cpu_dispatcher.py
@@ -12,7 +12,7 @@ def test_dispatcher():
         "SSE2", "SSE41", "AVX2",
         "VSX", "VSX2", "VSX3",
         "NEON", "ASIMD", "ASIMDHP",
-        "VX", "VXE"
+        "VX", "VXE", "LSX"
     )
     highest_sfx = "" # no suffix for the baseline
     all_sfx = []
diff --git a/numpy/_core/tests/test_cpu_features.py b/numpy/_core/tests/test_cpu_features.py
index 7807f0b01a8b..32a9825b1d7a 100644
--- a/numpy/_core/tests/test_cpu_features.py
+++ b/numpy/_core/tests/test_cpu_features.py
@@ -420,3 +420,12 @@ def load_flags(self):
     # if the kernel reports any one of the following ARM8 features.
"ASIMD": ("AES", "SHA1", "SHA2", "PMULL", "CRC32") } + + +is_loongarch = re.match("^(loongarch)", machine, re.IGNORECASE) +@pytest.mark.skipif(not is_linux or not is_loongarch, reason="Only for Linux and LoongArch") +class Test_LOONGARCH_Features(AbstractTest): + features = ["LSX"] + + def load_flags(self): + self.load_flags_cpuinfo("Features") diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index b1a6fa36061c..4dea2f9b1da1 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -325,7 +325,7 @@ class _Config: ## ARMv8.2 dot product ASIMDDP = dict(interest=6, implies="ASIMD"), ## ARMv8.2 Single & half-precision Multiply - ASIMDFHM = dict(interest=7, implies="ASIMDHP"), + ASIMDFHM = dict(interest=7, implies="ASIMDHP") ) def conf_features_partial(self): """Return a dictionary of supported CPU features by the platform, diff --git a/numpy/distutils/checks/cpu_lsx.c b/numpy/distutils/checks/cpu_lsx.c new file mode 100644 index 000000000000..5993c93a5f86 --- /dev/null +++ b/numpy/distutils/checks/cpu_lsx.c @@ -0,0 +1,11 @@ +#ifndef __loongarch_sx +#error "HOST/ARCH doesn't support LSX" +#endif + +#include + +int main(void) +{ + __m128i a = __lsx_vadd_d(__lsx_vldi(0), __lsx_vldi(0)); + return __lsx_vpickve2gr_w(a, 0); +}