|
9 | 9 | * Biological Structures at Stanford, funded under the NIH Roadmap for * |
10 | 10 | * Medical Research, grant U54 GM072970. See https://simtk.org. * |
11 | 11 | * * |
12 | | - * Portions copyright (c) 2013-2014 Stanford University and the Authors. * |
| 12 | + * Portions copyright (c) 2013-2022 Stanford University and the Authors. * |
13 | 13 | * Authors: Mateus Lima, Peter Eastman * |
14 | 14 | * Contributors: * |
15 | 15 | * * |
@@ -130,13 +130,7 @@ class fvec4 { |
130 | 130 | return vmulq_f32(val, other); |
131 | 131 | } |
132 | 132 | fvec4 operator/(fvec4 other) const { |
133 | | - // NEON does not have a divide float-point operator, so we get the reciprocal and multiply. |
134 | | - |
135 | | - float32x4_t reciprocal = vrecpeq_f32(other); |
136 | | - reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal); |
137 | | - reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal); |
138 | | - fvec4 result = vmulq_f32(val,reciprocal); |
139 | | - return result; |
| 133 | + return vdivq_f32(val, other); |
140 | 134 | } |
141 | 135 | void operator+=(fvec4 other) { |
142 | 136 | val = vaddq_f32(val, other); |
@@ -337,16 +331,11 @@ static inline float dot3(fvec4 v1, fvec4 v2) { |
337 | 331 | } |
338 | 332 |
|
339 | 333 | static inline float dot4(fvec4 v1, fvec4 v2) { |
340 | | - fvec4 result = v1*v2; |
341 | | - return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3); |
| 334 | + return vaddvq_f32(v1*v2); |
342 | 335 | } |
343 | 336 |
|
344 | 337 | static inline float reduceAdd(fvec4 v) { |
345 | | -#ifdef __ARM64__ |
346 | 338 | return vaddvq_f32(v); |
347 | | -#else |
348 | | - return dot4(v, fvec4(1.0f)); |
349 | | -#endif |
350 | 339 | } |
351 | 340 |
|
352 | 341 | static inline fvec4 cross(fvec4 v1, fvec4 v2) { |
@@ -397,11 +386,7 @@ static inline ivec4 abs(ivec4 v) { |
397 | 386 | } |
398 | 387 |
|
399 | 388 | static inline bool any(ivec4 v) { |
400 | | -#ifdef __ARM64__ |
401 | 389 | return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0); |
402 | | -#else |
403 | | - return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0); |
404 | | -#endif |
405 | 390 | } |
406 | 391 |
|
407 | 392 | // Mathematical operators involving a scalar and a vector. |
@@ -439,19 +424,15 @@ static inline ivec4 blendZero(ivec4 v, ivec4 mask) { |
439 | 424 | // These are at the end since they involve other functions defined above. |
440 | 425 |
|
441 | 426 | static inline fvec4 round(fvec4 v) { |
442 | | - fvec4 shift(0x1.0p23f); |
443 | | - fvec4 absResult = (abs(v)+shift)-shift; |
444 | | - return blend(v, absResult, ivec4(0x7FFFFFFF)); |
| 427 | + return vrndnq_f32(v); |
445 | 428 | } |
446 | 429 |
|
447 | 430 | static inline fvec4 floor(fvec4 v) { |
448 | | - fvec4 rounded = round(v); |
449 | | - return rounded + blend(0.0f, -1.0f, rounded>v); |
| 431 | + return vrndmq_f32(v); |
450 | 432 | } |
451 | 433 |
|
452 | 434 | static inline fvec4 ceil(fvec4 v) { |
453 | | - fvec4 rounded = round(v); |
454 | | - return rounded + blend(0.0f, 1.0f, rounded<v); |
| 435 | + return vrndpq_f32(v); |
455 | 436 | } |
456 | 437 |
|
457 | 438 | /* Given a table of floating-point values and a set of indexes, perform a gather read into a pair |
|
0 commit comments