Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -196,18 +196,25 @@ public static Vector128<double> Invoke(Vector128<double> x)

// r = x - (dn * (ln(2) / 64))
// where ln(2) / 64 is split into Head and Tail values
Vector128<double> r = x - (dn * Vector128.Create(V_LN2_HEAD)) - (dn * Vector128.Create(V_LN2_TAIL));
Vector128<double> r;
r = MultiplyAddEstimateOperator<double>.Invoke(dn, Vector128.Create(-V_LN2_HEAD), x);
r = MultiplyAddEstimateOperator<double>.Invoke(dn, Vector128.Create(-V_LN2_TAIL), r);

// POLY_EVAL_ODD_9
Vector128<double> r2 = r * r;
Vector128<double> r4 = r2 * r2;
Vector128<double> r8 = r4 * r4;

// Compute polynomial
Vector128<double> poly = ((Vector128.Create(C12) * r + Vector128.Create(C11)) * r2 +
Vector128.Create(C10) * r + Vector128.Create(C9)) * r8 +
((Vector128.Create(C8) * r + Vector128.Create(C7)) * r2 +
(Vector128.Create(C6) * r + Vector128.Create(C5))) * r4 +
((Vector128.Create(C4) * r + Vector128.Create(C3)) * r2 + (r + Vector128<double>.One));
Vector128<double> a0 = MultiplyAddEstimateOperator<double>.Invoke(Vector128.Create(C12), r, Vector128.Create(C11));
Vector128<double> a1 = MultiplyAddEstimateOperator<double>.Invoke(Vector128.Create(C10), r, Vector128.Create(C9));
Vector128<double> a2 = MultiplyAddEstimateOperator<double>.Invoke(Vector128.Create(C8), r, Vector128.Create(C7));
Vector128<double> a3 = MultiplyAddEstimateOperator<double>.Invoke(Vector128.Create(C6), r, Vector128.Create(C5));
Vector128<double> a4 = MultiplyAddEstimateOperator<double>.Invoke(Vector128.Create(C4), r, Vector128.Create(C3));
Vector128<double> a5 = MultiplyAddEstimateOperator<double>.Invoke(a0, r2, a1);
Vector128<double> a6 = MultiplyAddEstimateOperator<double>.Invoke(a2, r2, a3);
Vector128<double> a7 = MultiplyAddEstimateOperator<double>.Invoke(a4, r2, r + Vector128<double>.One);
Comment on lines +207 to +214
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: This many locals isn't really "good" for the JIT and can hinder some optimizations.

It can also make it a little bit harder to follow the intended computation ordering, which can itself limit other optimizations or reorderings the JIT would be allowed to do.

We could reduce the local count significantly if we broke it up like this, and then it's also equivalent in codegen to having no locals at all:

Vector128<double> t1 = MultiplyAddEstimate(
    MultiplyAddEstimate(Vector128.Create(C12), r, Vector128.Create(C11)),
    r2,
    MultiplyAddEstimate(Vector128.Create(C10), r, Vector128.Create(C9))
);

Vector128<double> t2 = MultiplyAddEstimate(
   MultiplyAddEstimate(Vector128.Create(C8),  r, Vector128.Create(C7)),
   r2,
   MultiplyAddEstimate(Vector128.Create(C6),  r, Vector128.Create(C5))
);

t1 = MultiplyAddEstimate(t1, r8, t2);
t2 = MultiplyAddEstimate(Vector128.Create(C4),  r, Vector128.Create(C3));
t2 = MultiplyAddEstimate(t2, r2, r + Vector128<double>.One);

MultiplyAddEstimate(t1, r4, t2);

You can see it results in smaller codegen, lets the initial multiplies be parallel dispatched up front, keeps the total register usage lower, etc: https://sharplab.io/#v2:EYLgxg9gTgpgtADwGwBYA0AXEBDAzgWwB8ABABgAJiBGAOgCUBXAOwwEt8YaBJFqVp3KzC4A3AFgAUGUq1GLdpx4Y+AobhoANABxJxUgMyUATOQDC5AN6TyN8gAc+AN2wYY5SAIzkAJhAbAAGzdTQwBeclIaAFZSWLj4uKotPVt7Jxc3D1wvX38gsxRycMiqJDLyivKqAHYU2wdWZ1d3CE8fP0DgqKKImlIUUsrylABOI2ra63r05qycjvzTJB7I2K19Dc3NqKix6qM6mwamzNbs9rzg6pW+0ip9LUenkaiqIyMUfsO0xoyWttynTMWhuiRGWgGRiQIxQ+ihE1GUW+xz+cwuQNMIx6RhoKC0dzxSCiRnu1RQeP0AFE4KQkVMjjNTgCFsEqBRwjjqjtqkh3lE4VFYaQjFSabp6T8Tv9zoDFlQqNiaDy4UYRnDwUYYi8UNTSJMJKkUbMzvNLmY3orRjD9P19FQXlootUqLrkhKJcRDAA1GBgDDQN5aAA8spgAD5yABZAAUPr9AaMwdDEagAEpLBLUnH/VBAyGWSmTOEoOQAFTkKDfLO+nN55MVwrFkzlqAHTO2bMJpMFisg4uFFsoKsdmtd/OXCPYdlRhgBNh2AIATwAgt5vJTsuwMrHR7nEzRTLBt6Y3qm0BXz5291oD0fXNGT1RU6nhzYr3We9gFeFI7P50vV3XTd8G3d990PGBjzZM8L3IMCbwg48RmfV84N3D8J3IbAixnOdWAXFc1w3NgQPveDb0g+9TC0GCoEvdDwLvGAH2qFD2zfBju0w7Awlw/9CKAkjQM4ijjyQWj6Pja9RKoqI2INVIRykjDOknRs+PwgCiOA4TlMYyjmNMFAJLQvSEKYh99HkxSlNrRNx1UrDuh/P9NIE4it3vKdz1bc8vxfdjTLsrjHOwZYXLwgjAI80jmOwnyjD8qzUPghygkna4Iv46KdK89AK0SityAAaiCsdkxoAB5JgYAChSbMCgB6Rq7AgJcemwboWxBUqwrLBsSqw/UbI4sy0vDew2sXHpf0irTBM8uKkB8/LsFY1DiGuWbsu0oSvKiHytHPVqlzq1IAF93XqyhvU48aIwAOR3Mb6zTDNrurF6e1bHoSxbFK7te9SfpbNsPts8rvr7AbBwBr7MIwb8NKi3bFujQLUm2tycr25jyMQqjTx8yTgpkwz5WfNAMdsXzqZsLGUYW2LntJgnydIEz8YszFn0CurAtS+sMBwhn5pi7c6dF9zcrxkS2YfGjz1grmDJYym6dp8HFKlnG0ZVsSYOVuXubk1M+b0QLEZm1zGfF+9EcO89hbOkbhetubpdxlmuzJh9jKVuiyuk+WQms12RZtsWZejYWEp8wbBYLKqav5rXKC2yPPbRh2GydowXZsS7rolI03ETzCddR5ny8coIADMMBJyHML4ABzAALRug5U9KsLXGAmG8M3rqsNPWDr8howAMRA7hcAAZQYOxWqgVwh8C0eRsUzbyBn7AaEr7xo3rru287vz+8HgvFKLrfAp34+YAb/qz4wdNesv7xviL86gA=

//poly = a5 * r8 + a6 * r4 + a7;
Vector128<double> poly = MultiplyAddEstimateOperator<double>.Invoke(a6, r4, a7);
poly = MultiplyAddEstimateOperator<double>.Invoke(a5, r8, poly);

// m = (n - j) / 64
// result = polynomial * 2^m
Expand Down Expand Up @@ -247,18 +254,25 @@ public static Vector256<double> Invoke(Vector256<double> x)

// r = x - (dn * (ln(2) / 64))
// where ln(2) / 64 is split into Head and Tail values
Vector256<double> r = x - (dn * Vector256.Create(V_LN2_HEAD)) - (dn * Vector256.Create(V_LN2_TAIL));
Vector256<double> r;
r = MultiplyAddEstimateOperator<double>.Invoke(dn, Vector256.Create(-V_LN2_HEAD), x);
r = MultiplyAddEstimateOperator<double>.Invoke(dn, Vector256.Create(-V_LN2_TAIL), r);

// POLY_EVAL_ODD_9
Vector256<double> r2 = r * r;
Vector256<double> r4 = r2 * r2;
Vector256<double> r8 = r4 * r4;

// Compute polynomial
Vector256<double> poly = ((Vector256.Create(C12) * r + Vector256.Create(C11)) * r2 +
Vector256.Create(C10) * r + Vector256.Create(C9)) * r8 +
((Vector256.Create(C8) * r + Vector256.Create(C7)) * r2 +
(Vector256.Create(C6) * r + Vector256.Create(C5))) * r4 +
((Vector256.Create(C4) * r + Vector256.Create(C3)) * r2 + (r + Vector256<double>.One));
Vector256<double> a0 = MultiplyAddEstimateOperator<double>.Invoke(Vector256.Create(C12), r, Vector256.Create(C11));
Vector256<double> a1 = MultiplyAddEstimateOperator<double>.Invoke(Vector256.Create(C10), r, Vector256.Create(C9));
Vector256<double> a2 = MultiplyAddEstimateOperator<double>.Invoke(Vector256.Create(C8), r, Vector256.Create(C7));
Vector256<double> a3 = MultiplyAddEstimateOperator<double>.Invoke(Vector256.Create(C6), r, Vector256.Create(C5));
Vector256<double> a4 = MultiplyAddEstimateOperator<double>.Invoke(Vector256.Create(C4), r, Vector256.Create(C3));
Vector256<double> a5 = MultiplyAddEstimateOperator<double>.Invoke(a0, r2, a1);
Vector256<double> a6 = MultiplyAddEstimateOperator<double>.Invoke(a2, r2, a3);
Vector256<double> a7 = MultiplyAddEstimateOperator<double>.Invoke(a4, r2, r + Vector256<double>.One);
//poly = a5 * r8 + a6 * r4 + a7;
Vector256<double> poly = MultiplyAddEstimateOperator<double>.Invoke(a6, r4, a7);
poly = MultiplyAddEstimateOperator<double>.Invoke(a5, r8, poly);

// m = (n - j) / 64
// result = polynomial * 2^m
Expand Down Expand Up @@ -298,18 +312,25 @@ public static Vector512<double> Invoke(Vector512<double> x)

// r = x - (dn * (ln(2) / 64))
// where ln(2) / 64 is split into Head and Tail values
Vector512<double> r = x - (dn * Vector512.Create(V_LN2_HEAD)) - (dn * Vector512.Create(V_LN2_TAIL));
Vector512<double> r;
r = MultiplyAddEstimateOperator<double>.Invoke(dn, Vector512.Create(-V_LN2_HEAD), x);
r = MultiplyAddEstimateOperator<double>.Invoke(dn, Vector512.Create(-V_LN2_TAIL), r);

// POLY_EVAL_ODD_9
Vector512<double> r2 = r * r;
Vector512<double> r4 = r2 * r2;
Vector512<double> r8 = r4 * r4;

// Compute polynomial
Vector512<double> poly = ((Vector512.Create(C12) * r + Vector512.Create(C11)) * r2 +
Vector512.Create(C10) * r + Vector512.Create(C9)) * r8 +
((Vector512.Create(C8) * r + Vector512.Create(C7)) * r2 +
(Vector512.Create(C6) * r + Vector512.Create(C5))) * r4 +
((Vector512.Create(C4) * r + Vector512.Create(C3)) * r2 + (r + Vector512<double>.One));
Vector512<double> a0 = MultiplyAddEstimateOperator<double>.Invoke(Vector512.Create(C12), r, Vector512.Create(C11));
Vector512<double> a1 = MultiplyAddEstimateOperator<double>.Invoke(Vector512.Create(C10), r, Vector512.Create(C9));
Vector512<double> a2 = MultiplyAddEstimateOperator<double>.Invoke(Vector512.Create(C8), r, Vector512.Create(C7));
Vector512<double> a3 = MultiplyAddEstimateOperator<double>.Invoke(Vector512.Create(C6), r, Vector512.Create(C5));
Vector512<double> a4 = MultiplyAddEstimateOperator<double>.Invoke(Vector512.Create(C4), r, Vector512.Create(C3));
Vector512<double> a5 = MultiplyAddEstimateOperator<double>.Invoke(a0, r2, a1);
Vector512<double> a6 = MultiplyAddEstimateOperator<double>.Invoke(a2, r2, a3);
Vector512<double> a7 = MultiplyAddEstimateOperator<double>.Invoke(a4, r2, r + Vector512<double>.One);
//poly = a5 * r8 + a6 * r4 + a7;
Vector512<double> poly = MultiplyAddEstimateOperator<double>.Invoke(a6, r4, a7);
poly = MultiplyAddEstimateOperator<double>.Invoke(a5, r8, poly);

// m = (n - j) / 64
// result = polynomial * 2^m
Expand Down Expand Up @@ -411,31 +432,32 @@ public static Vector128<float> Invoke(Vector128<float> x)
dnu -= v_expf_huge;

// r = z - dn
Vector128<double> rl = zl - dnl;
Vector128<double> ru = zu - dnu;

// POLY_EVAL_6
Vector128<double> c1 = Vector128.Create(C1);
Vector128<double> c2 = Vector128.Create(C2);
Vector128<double> c3 = Vector128.Create(C3);
Vector128<double> c4 = Vector128.Create(C4);
Vector128<double> c5 = Vector128.Create(C5);
Vector128<double> c6 = Vector128.Create(C6);

Vector128<double> rl = zl - dnl;

Vector128<double> rl2 = rl * rl;
Vector128<double> rl4 = rl2 * rl2;

Vector128<double> polyl = (c4 * rl + c3) * rl2
+ ((c6 * rl + c5) * rl4
+ (c2 * rl + c1));


Vector128<double> ru = zu - dnu;
Vector128<double> al0 = MultiplyAddEstimateOperator<double>.Invoke(c4, rl, c3);
Vector128<double> al1 = MultiplyAddEstimateOperator<double>.Invoke(c6, rl, c5);
Vector128<double> al2 = MultiplyAddEstimateOperator<double>.Invoke(c2, rl, c1);
Vector128<double> polyl = MultiplyAddEstimateOperator<double>.Invoke(al1, rl4, al2);
polyl = MultiplyAddEstimateOperator<double>.Invoke(al0, rl2, polyl);

Vector128<double> ru2 = ru * ru;
Vector128<double> ru4 = ru2 * ru2;

Vector128<double> polyu = (c4 * ru + c3) * ru2
+ ((c6 * ru + c5) * ru4
+ (c2 * ru + c1));
Vector128<double> au0 = MultiplyAddEstimateOperator<double>.Invoke(c4, ru, c3);
Vector128<double> au1 = MultiplyAddEstimateOperator<double>.Invoke(c6, ru, c5);
Vector128<double> au2 = MultiplyAddEstimateOperator<double>.Invoke(c2, ru, c1);
Vector128<double> polyu = MultiplyAddEstimateOperator<double>.Invoke(au1, ru4, au2);
polyu = MultiplyAddEstimateOperator<double>.Invoke(au0, ru2, polyu);

// result = (float)(poly + (n << 52))
Vector128<float> ret = Vector128.Narrow(
Expand Down Expand Up @@ -487,31 +509,32 @@ public static Vector256<float> Invoke(Vector256<float> x)
dnu -= v_expf_huge;

// r = z - dn
Vector256<double> rl = zl - dnl;
Vector256<double> ru = zu - dnu;

// POLY_EVAL_6
Vector256<double> c1 = Vector256.Create(C1);
Vector256<double> c2 = Vector256.Create(C2);
Vector256<double> c3 = Vector256.Create(C3);
Vector256<double> c4 = Vector256.Create(C4);
Vector256<double> c5 = Vector256.Create(C5);
Vector256<double> c6 = Vector256.Create(C6);

Vector256<double> rl = zl - dnl;

Vector256<double> rl2 = rl * rl;
Vector256<double> rl4 = rl2 * rl2;

Vector256<double> polyl = (c4 * rl + c3) * rl2
+ ((c6 * rl + c5) * rl4
+ (c2 * rl + c1));


Vector256<double> ru = zu - dnu;
Vector256<double> al0 = MultiplyAddEstimateOperator<double>.Invoke(c4, rl, c3);
Vector256<double> al1 = MultiplyAddEstimateOperator<double>.Invoke(c6, rl, c5);
Vector256<double> al2 = MultiplyAddEstimateOperator<double>.Invoke(c2, rl, c1);
Vector256<double> polyl = MultiplyAddEstimateOperator<double>.Invoke(al1, rl4, al2);
polyl = MultiplyAddEstimateOperator<double>.Invoke(al0, rl2, polyl);

Vector256<double> ru2 = ru * ru;
Vector256<double> ru4 = ru2 * ru2;

Vector256<double> polyu = (c4 * ru + c3) * ru2
+ ((c6 * ru + c5) * ru4
+ (c2 * ru + c1));
Vector256<double> au0 = MultiplyAddEstimateOperator<double>.Invoke(c4, ru, c3);
Vector256<double> au1 = MultiplyAddEstimateOperator<double>.Invoke(c6, ru, c5);
Vector256<double> au2 = MultiplyAddEstimateOperator<double>.Invoke(c2, ru, c1);
Vector256<double> polyu = MultiplyAddEstimateOperator<double>.Invoke(au1, ru4, au2);
polyu = MultiplyAddEstimateOperator<double>.Invoke(au0, ru2, polyu);

// result = (float)(poly + (n << 52))
Vector256<float> ret = Vector256.Narrow(
Expand Down Expand Up @@ -563,31 +586,32 @@ public static Vector512<float> Invoke(Vector512<float> x)
dnu -= v_expf_huge;

// r = z - dn
Vector512<double> rl = zl - dnl;
Vector512<double> ru = zu - dnu;

// POLY_EVAL_6
Vector512<double> c1 = Vector512.Create(C1);
Vector512<double> c2 = Vector512.Create(C2);
Vector512<double> c3 = Vector512.Create(C3);
Vector512<double> c4 = Vector512.Create(C4);
Vector512<double> c5 = Vector512.Create(C5);
Vector512<double> c6 = Vector512.Create(C6);

Vector512<double> rl = zl - dnl;

Vector512<double> rl2 = rl * rl;
Vector512<double> rl4 = rl2 * rl2;

Vector512<double> polyl = (c4 * rl + c3) * rl2
+ ((c6 * rl + c5) * rl4
+ (c2 * rl + c1));


Vector512<double> ru = zu - dnu;
Vector512<double> al0 = MultiplyAddEstimateOperator<double>.Invoke(c4, rl, c3);
Vector512<double> al1 = MultiplyAddEstimateOperator<double>.Invoke(c6, rl, c5);
Vector512<double> al2 = MultiplyAddEstimateOperator<double>.Invoke(c2, rl, c1);
Vector512<double> polyl = MultiplyAddEstimateOperator<double>.Invoke(al1, rl4, al2);
polyl = MultiplyAddEstimateOperator<double>.Invoke(al0, rl2, polyl);

Vector512<double> ru2 = ru * ru;
Vector512<double> ru4 = ru2 * ru2;

Vector512<double> polyu = (c4 * ru + c3) * ru2
+ ((c6 * ru + c5) * ru4
+ (c2 * ru + c1));
Vector512<double> au0 = MultiplyAddEstimateOperator<double>.Invoke(c4, ru, c3);
Vector512<double> au1 = MultiplyAddEstimateOperator<double>.Invoke(c6, ru, c5);
Vector512<double> au2 = MultiplyAddEstimateOperator<double>.Invoke(c2, ru, c1);
Vector512<double> polyu = MultiplyAddEstimateOperator<double>.Invoke(au1, ru4, au2);
polyu = MultiplyAddEstimateOperator<double>.Invoke(au0, ru2, polyu);

// result = (float)(poly + (n << 52))
Vector512<float> ret = Vector512.Narrow(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,15 +374,14 @@ public static IEnumerable<object[]> SpanDestinationFunctionsToTest()
yield return Create(TensorPrimitives.DegreesToRadians, T.DegreesToRadians);
yield return Create(TensorPrimitives.Exp, T.Exp);
// TODO https://github.com/dotnet/runtime/issues/98861
yield return Create(TensorPrimitives.Exp2, T.Exp2, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-6, floatTolerance: 1e-5f));
yield return Create(TensorPrimitives.Exp2, T.Exp2, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-14, floatTolerance: 1e-5f));
// TODO https://github.com/dotnet/runtime/issues/98861
yield return Create(TensorPrimitives.Exp10, T.Exp10, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-6, floatTolerance: 1e-5f));
yield return Create(TensorPrimitives.Exp10, T.Exp10, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-13, floatTolerance: 1e-5f));
yield return Create(TensorPrimitives.ExpM1, T.ExpM1);
// TODO https://github.com/dotnet/runtime/issues/98861
yield return Create(TensorPrimitives.ExpM1, T.ExpM1, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-6));
yield return Create(TensorPrimitives.Exp2M1, T.Exp2M1, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-14, floatTolerance: 1e-5f));
// TODO https://github.com/dotnet/runtime/issues/98861
yield return Create(TensorPrimitives.Exp2M1, T.Exp2M1, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-6, floatTolerance: 1e-5f));
// TODO https://github.com/dotnet/runtime/issues/98861
yield return Create(TensorPrimitives.Exp10M1, T.Exp10M1, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-6, floatTolerance: 1e-5f));
yield return Create(TensorPrimitives.Exp10M1, T.Exp10M1, Helpers.DetermineTolerance<T>(doubleTolerance: 1e-13, floatTolerance: 1e-5f));
yield return Create(TensorPrimitives.Floor, T.Floor);
yield return Create(TensorPrimitives.Log, T.Log);
yield return Create(TensorPrimitives.Log2, T.Log2);
Expand Down