numpy · loongson-zn · Nov 21, 2023
diff --git a/meson_cpu/loongarch/meson.build b/meson_cpu/loongarch/meson.build
@@ -0,0 +1,7 @@
+# LoongArch CPU features; the compile check reuses the distutils test source.
+mod_features = import('features')
+source_root = meson.project_source_root()
+LSX = mod_features.new('LSX', 1, args: ['-mlsx'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_lsx.c')[0])
+# Feature table merged into CPU_FEATURES by meson_cpu/meson.build.
+LOONGARCH_FEATURES = {'LSX': LSX}
diff --git a/meson_cpu/main_config.h.in b/meson_cpu/main_config.h.in
@@ -385,4 +385,8 @@
 #ifdef @P@HAVE_NEON
    #include <arm_neon.h>
 #endif
+
+#ifdef @P@HAVE_LSX
+    #include <lsxintrin.h>
+#endif
 #endif // @P@_CPU_DISPATCHER_CONF_H_
diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build
@@ -75,12 +75,14 @@ subdir('x86')
 subdir('ppc64')
 subdir('s390x')
 subdir('arm')
+subdir('loongarch')

 CPU_FEATURES = {}
 CPU_FEATURES += ARM_FEATURES
 CPU_FEATURES += X86_FEATURES
 CPU_FEATURES += PPC64_FEATURES
 CPU_FEATURES += S390X_FEATURES
+CPU_FEATURES += LOONGARCH_FEATURES

 # Parse the requested baseline (CPU_CONF_BASELINE) and dispatch features
 # (CPU_CONF_DISPATCH).
@@ -92,7 +94,8 @@ min_features = {
  'ppc64': [],
  's390x': [],
  'arm': [],
-  'aarch64': [ASIMD]
+  'aarch64': [ASIMD],
+  'loongarch64': [LSX]
 }.get(cpu_family, [])
 if host_machine.endian() == 'little' and cpu_family == 'ppc64'
  min_features = [VSX2]
@@ -106,6 +109,7 @@ max_features_dict = {
  's390x': S390X_FEATURES,
  'arm': ARM_FEATURES,
  'aarch64': ARM_FEATURES,
+  'loongarch64': LOONGARCH_FEATURES,
 }.get(cpu_family, {})
 max_features = []
 foreach fet_name, fet_obj : max_features_dict

diff --git a/meson_options.txt b/meson_options.txt
@@ -31,6 +31,7 @@ option('test-simd', type: 'array',
          'VSX', 'VSX2', 'VSX3', 'VSX4',
          'NEON', 'ASIMD',
          'VX', 'VXE', 'VXE2',
+          'LSX',
        ],
        description: 'Specify a list of CPU features to be tested against NumPy SIMD interface')
 option('test-simd-args', type: 'string', value: '',

diff --git a/numpy/_core/src/common/npy_cpu_features.c b/numpy/_core/src/common/npy_cpu_features.c
@@ -118,7 +118,8 @@ static struct {
                {NPY_CPU_FEATURE_FPHP, "FPHP"},
                {NPY_CPU_FEATURE_ASIMDHP, "ASIMDHP"},
                {NPY_CPU_FEATURE_ASIMDDP, "ASIMDDP"},
-                {NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"}};
+                {NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"},
+                {NPY_CPU_FEATURE_LSX, "LSX"}};


 NPY_VISIBILITY_HIDDEN PyObject *
@@ -653,6 +654,27 @@ npy__cpu_init_features(void)
    npy__cpu_have[NPY_CPU_FEATURE_VX]  = 1;
 }

+/***************** LoongArch ******************/
+
+#elif defined(__loongarch__)
+
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+/* Reset npy__cpu_have, then flag LSX when AT_HWCAP reports the LSX bit. */
+static void
+npy__cpu_init_features(void)
+{
+   memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+   // getauxval() returns unsigned long; keep its full width.
+   unsigned long hwcap = getauxval(AT_HWCAP);
+
+   // Test the named capability bit directly rather than comparing the
+   // masked value against a literal that duplicates the macro's value.
+   if (hwcap & HWCAP_LOONGARCH_LSX) {
+      npy__cpu_have[NPY_CPU_FEATURE_LSX]  = 1;
+   }
+}

 /***************** ARM ******************/


diff --git a/numpy/_core/src/common/npy_cpu_features.h b/numpy/_core/src/common/npy_cpu_features.h
@@ -96,6 +96,9 @@ enum npy_cpu_features
    // Vector-Enhancements Facility 2
    NPY_CPU_FEATURE_VXE2              = 352,

+    // LOONGARCH
+    NPY_CPU_FEATURE_LSX              = 400,
+
    NPY_CPU_FEATURE_MAX
 };


diff --git a/numpy/_core/src/common/simd/intdiv.h b/numpy/_core/src/common/simd/intdiv.h
@@ -216,6 +216,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
    divisor.val[0] = npyv_setall_u8(m);
    divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
    divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[0] = npyv_setall_u16(m);
+    divisor.val[1] = npyv_setall_u8(sh1);
+    divisor.val[2] = npyv_setall_u8(sh2);
 #else
    #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -225,7 +229,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
 NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d);
 NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
 {
-#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+#if defined NPY_HAVE_SSE2 || defined(NPY_HAVE_LSX)// SSE/AVX2/AVX512/LSX
    npyv_s16x3 p = npyv_divisor_s16(d);
    npyv_s8x3 r;
    r.val[0] = npyv_reinterpret_s8_s16(p.val[0]);
@@ -291,6 +295,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
 #elif defined(NPY_HAVE_NEON)
    divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1));
    divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2));
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_u16(sh1);
+    divisor.val[2] = npyv_setall_u16(sh2);
 #else
    #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -321,6 +328,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
    divisor.val[1] = npyv_setall_s16(sh);
 #elif defined(NPY_HAVE_NEON)
    divisor.val[1] = npyv_setall_s16(-sh);
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_s16(sh);
 #else
    #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -358,6 +367,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
 #elif defined(NPY_HAVE_NEON)
    divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1));
    divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2));
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_u32(sh1);
+    divisor.val[2] = npyv_setall_u32(sh2);
 #else
    #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -393,6 +405,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
    divisor.val[1] = npyv_setall_s32(sh);
 #elif defined(NPY_HAVE_NEON)
    divisor.val[1] = npyv_setall_s32(-sh);
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_s32(sh);
 #else
    #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -427,6 +441,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
    #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
        divisor.val[1] = npyv_set_u64(sh1);
        divisor.val[2] = npyv_set_u64(sh2);
+    #elif defined(NPY_HAVE_LSX)
+        divisor.val[1] = npyv_setall_u64(sh1);
+        divisor.val[2] = npyv_setall_u64(sh2);
    #else
        #error "please initialize the shifting operand for the new architecture"
    #endif
@@ -465,6 +482,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
    divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0);  // sign of divisor
    #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
    divisor.val[1] = npyv_set_s64(sh);
+    #elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_s64(sh);
    #else
        #error "please initialize the shifting operand for the new architecture"
    #endif