Improvements to vectorization on ARM (openmm#3555)

peastman · web-flow · commit 99a9fdc92659 · 2022-04-08T09:05:17.000-07:00
diff --git a/openmmapi/include/openmm/internal/vectorize_neon.h b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -9,7 +9,7 @@
  * Biological Structures at Stanford, funded under the NIH Roadmap for        *
  * Medical Research, grant U54 GM072970. See https://simtk.org.               *
  *                                                                            *
- * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2022 Stanford University and the Authors.      *
  * Authors: Mateus Lima, Peter Eastman                                        *
  * Contributors:                                                              *
  *                                                                            *
@@ -130,13 +130,7 @@ class fvec4 {
         return vmulq_f32(val, other);
     }
     fvec4 operator/(fvec4 other) const {
-        // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
-
-        float32x4_t reciprocal = vrecpeq_f32(other);
-        reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
-        reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
-        fvec4 result = vmulq_f32(val,reciprocal);
-        return result;
+        return vdivq_f32(val, other);
     }
     void operator+=(fvec4 other) {
         val = vaddq_f32(val, other);
@@ -337,16 +331,11 @@ static inline float dot3(fvec4 v1, fvec4 v2) {
 }
 
 static inline float dot4(fvec4 v1, fvec4 v2) {
-    fvec4 result = v1*v2;
-    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
+    return vaddvq_f32(v1*v2);
 }
 
 static inline float reduceAdd(fvec4 v) {
-#ifdef __ARM64__
     return vaddvq_f32(v);
-#else
-    return dot4(v, fvec4(1.0f));
-#endif
 }
 
 static inline fvec4 cross(fvec4 v1, fvec4 v2) {
@@ -397,11 +386,7 @@ static inline ivec4 abs(ivec4 v) {
 }
 
 static inline bool any(ivec4 v) {
-#ifdef __ARM64__
     return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
-#else
-    return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
-#endif
 }
 
 // Mathematical operators involving a scalar and a vector.
@@ -439,19 +424,15 @@ static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
 // These are at the end since they involve other functions defined above.
 
 static inline fvec4 round(fvec4 v) {
-    fvec4 shift(0x1.0p23f);
-    fvec4 absResult = (abs(v)+shift)-shift;
-    return blend(v, absResult, ivec4(0x7FFFFFFF));
+    return vrndnq_f32(v);
 }
 
 static inline fvec4 floor(fvec4 v) {
-    fvec4 rounded = round(v);
-    return rounded + blend(0.0f, -1.0f, rounded>v);
+    return vrndmq_f32(v);
 }
 
 static inline fvec4 ceil(fvec4 v) {
-    fvec4 rounded = round(v);
-    return rounded + blend(0.0f, 1.0f, rounded<v);
+    return vrndpq_f32(v);
 }
 
 /* Given a table of floating-point values and a set of indexes, perform a gather read into a pair