using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Environments;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// Job under test: BenchmarkDotNet's default job, pinned to CoreRuntime.Core90.
var core90Job = Job.Default.WithRuntime(CoreRuntime.Core90);

// Start from the default configuration and register that job on top of it.
var config = DefaultConfig.Instance.AddJob(core90Job);

// Hand the command-line arguments and the configuration to the benchmark switcher
// for every benchmark type found in this assembly.
BenchmarkSwitcher
    .FromAssembly(typeof(Program).Assembly)
    .Run(args, config);
/// <summary>
/// Micro-benchmark comparing two ways of summing an <c>int[]</c>: ordinary array
/// indexing versus raw reference arithmetic via <c>Unsafe.Add</c> over
/// <c>MemoryMarshal.GetArrayDataReference</c>.
/// </summary>
/// <remarks>
/// This is a JIT code-generation repro: the two loops are deliberately identical apart
/// from how the element is read, so keep their shape (including the <c>nint</c> index)
/// unchanged when editing.
/// </remarks>
public class Benchmark
{
// Input array; allocated in GlobalSetup (hence the null-forgiving initializer).
// It is never written after allocation, so every element keeps its default value of 0.
private int[] _arr = null!;
/// <summary>Number of elements in the array; the benchmarks run once per listed value.</summary>
[Params(100, 1000, 10000)]
public int Count { get; set; }
/// <summary>Allocates the input array with <see cref="Count"/> elements.</summary>
[GlobalSetup]
public void GlobalSetup()
{
_arr = new int[Count];
}
/// <summary>
/// Benchmark for the indexer-based loop (<c>Sum_bounds_check</c>).
/// </summary>
/// <remarks>
/// The computed sum is discarded. NOTE(review): the results table in the accompanying
/// report labels this row with the helper name rather than <c>Sum2</c>.
/// </remarks>
[Benchmark]
public void Sum2()
{
Sum_bounds_check(_arr);
}
/// <summary>
/// Benchmark for the reference-arithmetic loop (<c>Sum_no_bounds_check</c>).
/// </summary>
/// <remarks>
/// The computed sum is discarded. NOTE(review): the results table in the accompanying
/// report labels this row with the helper name rather than <c>Sum3</c>.
/// </remarks>
[Benchmark]
public void Sum3()
{
Sum_no_bounds_check(_arr);
}
/// <summary>Sums all elements of <paramref name="arr"/> using the array indexer.</summary>
/// <param name="arr">Array to sum.</param>
/// <returns>The (unchecked) 32-bit sum of the elements.</returns>
private static int Sum_bounds_check(int[] arr)
{
int sum = 0;
// The index is a native-sized integer (nint), not int. According to the accompanying
// report (#104659), the JIT keeps the bounds check on arr[i] in this loop.
for (nint i = 0; i < arr.Length; i++)
{
sum += arr[i];
}
return sum;
}
/// <summary>
/// Sums all elements of <paramref name="arr"/> by offsetting a reference to the first
/// element, so no per-element range check is performed.
/// </summary>
/// <param name="arr">Array to sum.</param>
/// <returns>The (unchecked) 32-bit sum of the elements.</returns>
private static int Sum_no_bounds_check(int[] arr)
{
int sum = 0;
// Same loop shape as Sum_bounds_check; only the element read differs.
for (nint i = 0; i < arr.Length; i++)
{
// Unsafe.Add does no range validation; staying in bounds relies solely on the
// loop condition above.
sum += Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(arr), i);
}
return sum;
}
}
In this benchmark, `Sum_bounds_check` (benchmarked as `Sum2`) has a bounds check in it (#104659). That makes the loop body big enough for loop alignment to kick in, which aligns it so that the loop starts at 0x20.
On the other hand, `Sum_no_bounds_check` (benchmarked as `Sum3`) is not aligned by the JIT, and its loop ends up starting at 0xD. Overall, this seems to cause `Sum_bounds_check` to be faster than `Sum_no_bounds_check`, even though it has the extra bounds check:
| Method              | Count |        Mean |    Error |   StdDev |
|---------------------|------:|------------:|---------:|---------:|
| Sum_bounds_check    |   100 |    27.81 ns | 0.172 ns | 0.143 ns |
| Sum_no_bounds_check |   100 |    29.53 ns | 0.156 ns | 0.146 ns |
| Sum_bounds_check    |  1000 |   219.52 ns | 2.484 ns | 2.202 ns |
| Sum_no_bounds_check |  1000 |   239.76 ns | 1.713 ns | 1.518 ns |
| Sum_bounds_check    | 10000 | 2,163.16 ns | 3.656 ns | 3.420 ns |
| Sum_no_bounds_check | 10000 | 2,232.11 ns | 7.284 ns | 6.813 ns |
(from my 5950X).
cc @kunalspathak
In this benchmark, `Sum_bounds_check` has a bounds check in it (#104659). That makes the loop body big enough for loop alignment to kick in, which aligns it so that the loop starts at 0x20.
On the other hand, `Sum_no_bounds_check` is not aligned by the JIT, and its loop ends up starting at 0xD. Overall, this seems to cause `Sum_bounds_check` to be faster than `Sum_no_bounds_check`, even though it has the extra bounds check (results from my 5950X).
cc @kunalspathak