Skip to content

Commit 99a9fdc

Browse files
authored
Improvements to vectorization on ARM (openmm#3555)
1 parent fd13a65 commit 99a9fdc

File tree

1 file changed

+6
-25
lines changed

1 file changed

+6
-25
lines changed

openmmapi/include/openmm/internal/vectorize_neon.h

Lines changed: 6 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
* Biological Structures at Stanford, funded under the NIH Roadmap for *
1010
* Medical Research, grant U54 GM072970. See https://simtk.org. *
1111
* *
12-
* Portions copyright (c) 2013-2014 Stanford University and the Authors. *
12+
* Portions copyright (c) 2013-2022 Stanford University and the Authors. *
1313
* Authors: Mateus Lima, Peter Eastman *
1414
* Contributors: *
1515
* *
@@ -130,13 +130,7 @@ class fvec4 {
130130
return vmulq_f32(val, other);
131131
}
132132
fvec4 operator/(fvec4 other) const {
133-
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
134-
135-
float32x4_t reciprocal = vrecpeq_f32(other);
136-
reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
137-
reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
138-
fvec4 result = vmulq_f32(val,reciprocal);
139-
return result;
133+
return vdivq_f32(val, other);
140134
}
141135
void operator+=(fvec4 other) {
142136
val = vaddq_f32(val, other);
@@ -337,16 +331,11 @@ static inline float dot3(fvec4 v1, fvec4 v2) {
337331
}
338332

339333
static inline float dot4(fvec4 v1, fvec4 v2) {
340-
fvec4 result = v1*v2;
341-
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
334+
return vaddvq_f32(v1*v2);
342335
}
343336

344337
static inline float reduceAdd(fvec4 v) {
345-
#ifdef __ARM64__
346338
return vaddvq_f32(v);
347-
#else
348-
return dot4(v, fvec4(1.0f));
349-
#endif
350339
}
351340

352341
static inline fvec4 cross(fvec4 v1, fvec4 v2) {
@@ -397,11 +386,7 @@ static inline ivec4 abs(ivec4 v) {
397386
}
398387

399388
static inline bool any(ivec4 v) {
400-
#ifdef __ARM64__
401389
return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
402-
#else
403-
return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
404-
#endif
405390
}
406391

407392
// Mathematical operators involving a scalar and a vector.
@@ -439,19 +424,15 @@ static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
439424
// These are at the end since they involve other functions defined above.
440425

441426
static inline fvec4 round(fvec4 v) {
442-
fvec4 shift(0x1.0p23f);
443-
fvec4 absResult = (abs(v)+shift)-shift;
444-
return blend(v, absResult, ivec4(0x7FFFFFFF));
427+
return vrndnq_f32(v);
445428
}
446429

447430
static inline fvec4 floor(fvec4 v) {
448-
fvec4 rounded = round(v);
449-
return rounded + blend(0.0f, -1.0f, rounded>v);
431+
return vrndmq_f32(v);
450432
}
451433

452434
static inline fvec4 ceil(fvec4 v) {
453-
fvec4 rounded = round(v);
454-
return rounded + blend(0.0f, 1.0f, rounded<v);
435+
return vrndpq_f32(v);
455436
}
456437

457438
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair

0 commit comments

Comments
 (0)