From 0ab5908f763a17fa5cf7e2fffd6cf2645b3456a3 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 8 Feb 2026 14:10:14 -0500 Subject: [PATCH 1/9] feature: add selection, filtering, primitive comparison, formatting and limited casting --- Apache.Arrow.sln | 4 + .../Apache.Arrow.Operations.csproj | 16 + src/Apache.Arrow.Operations/Comparison.cs | 374 ++++++++++ src/Apache.Arrow.Operations/Conversion.cs | 427 +++++++++++ src/Apache.Arrow.Operations/Select.cs | 687 ++++++++++++++++++ src/Apache.Arrow.Operations/Text.cs | 220 ++++++ .../Apache.Arrow.Operations.Tests.csproj | 41 ++ .../TestOperations.cs | 61 ++ 8 files changed, 1830 insertions(+) create mode 100644 src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj create mode 100644 src/Apache.Arrow.Operations/Comparison.cs create mode 100644 src/Apache.Arrow.Operations/Conversion.cs create mode 100644 src/Apache.Arrow.Operations/Select.cs create mode 100644 src/Apache.Arrow.Operations/Text.cs create mode 100644 test/Apache.Arrow.Operations.Tests/Apache.Arrow.Operations.Tests.csproj create mode 100644 test/Apache.Arrow.Operations.Tests/TestOperations.cs diff --git a/Apache.Arrow.sln b/Apache.Arrow.sln index 0dd6853a..3481f174 100644 --- a/Apache.Arrow.sln +++ b/Apache.Arrow.sln @@ -29,6 +29,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Apache.Arrow.Flight.Integra EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.IntegrationTest", "test\Apache.Arrow.IntegrationTest\Apache.Arrow.IntegrationTest.csproj", "{E8264B7F-B680-4A55-939B-85DB628164BB}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.Operations", "src\Apache.Arrow.Operations\Apache.Arrow.Operations.csproj", "{BA6B2B0D-EAAE-4183-8A39-1B9CF571F71F}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Apache.Arrow.Operations.Tests", "test\Apache.Arrow.Operations.Tests\Apache.Arrow.Operations.Tests.csproj", "{BA6B2B0D-EAAE-4183-8A39-1B9CF571F71F}" +EndProject Global 
GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU diff --git a/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj b/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj new file mode 100644 index 00000000..efe7b036 --- /dev/null +++ b/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj @@ -0,0 +1,16 @@ + + + + + + + net8.0 + enable + enable + + + + + + + \ No newline at end of file diff --git a/src/Apache.Arrow.Operations/Comparison.cs b/src/Apache.Arrow.Operations/Comparison.cs new file mode 100644 index 00000000..8eeecae4 --- /dev/null +++ b/src/Apache.Arrow.Operations/Comparison.cs @@ -0,0 +1,374 @@ +using System; +using System.Numerics; +using Apache.Arrow; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; + +namespace Apache.Arrow.Operations; + +public static class Comparison +{ + /// + /// Negate a boolean array, flipping true to false, false to true. Nulls remain null + /// + /// + /// + /// + public static BooleanArray Invert(BooleanArray mask, MemoryAllocator? allocator = null) + { + var builder = new BooleanArray.Builder(); + builder.Reserve(mask.Length); + foreach (var val in mask) + { + if (val != null) + { + builder.Append(!(bool)val); + } + else + { + builder.AppendNull(); + } + } + return builder.Build(allocator); + } + + /// + /// Perform a pairwise boolean AND operation. + /// + /// + /// + /// + /// + /// + public static BooleanArray And(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? 
allocator = null) + { + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + var builder = new BooleanArray.Builder(); + builder.Reserve(lhs.Length); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + if (a != null && b != null) + { + builder.Append((bool)a && (bool)b); + } + else + { + builder.AppendNull(); + } + } + return builder.Build(allocator); + } + + /// + /// Perform a pairwise boolean OR operation. + /// + /// + /// + /// + /// + /// + public static BooleanArray Or(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) + { + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + var builder = new BooleanArray.Builder(); + builder.Reserve(lhs.Length); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + if (a != null && b != null) + { + builder.Append((bool)a || (bool)b); + } + else + { + builder.AppendNull(); + } + } + return builder.Build(allocator); + } + + /// + /// Perform a pairwise boolean XOR operation. + /// + /// + /// + /// + /// + /// + public static BooleanArray Xor(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) + { + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + var builder = new BooleanArray.Builder(); + builder.Reserve(lhs.Length); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + if (a != null && b != null) + { + builder.Append((bool)a ^ (bool)b); + } + else + { + builder.AppendNull(); + } + } + return builder.Build(allocator); + } + + /// + /// Compare each value in `lhs` to a scalar `rhs`, returning boolean mask + /// + /// + /// + /// + /// + /// + public static BooleanArray Equal(PrimitiveArray lhs, T rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var flag = a == rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + /// + /// Perform a pairwise comparison between each position in `lhs` and `rhs`, returning a boolean mask + /// + /// + /// + /// + /// + /// + /// + public static BooleanArray Equal(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + var flag = a == b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + /// + /// Compare each value in `lhs` to a scalar `rhs`, returning boolean mask + /// + /// + /// + /// + /// + public static BooleanArray Equal(StringArray lhs, string rhs, MemoryAllocator? allocator = null) + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var flag = a == rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + /// + /// Perform a pairwise comparison between each position in `lhs` and `rhs`, returning a boolean mask + /// + /// + /// + /// + /// + /// + public static BooleanArray Equal(StringArray lhs, StringArray rhs, MemoryAllocator? 
allocator = null) + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var b = rhs.GetString(i); + var flag = a == b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + /// + /// Compare each value in `lhs` to a scalar `rhs`, returning boolean mask + /// + /// + /// + /// + /// + public static BooleanArray Equal(LargeStringArray lhs, string rhs, MemoryAllocator? allocator = null) + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var flag = a == rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + /// + /// Perform a pairwise comparison between each position in `lhs` and `rhs`, returning a boolean mask + /// + /// + /// + /// + /// + /// + public static BooleanArray Equal(LargeStringArray lhs, LargeStringArray rhs, MemoryAllocator? allocator = null) + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var b = rhs.GetString(i); + var flag = a == b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + /// + /// A dispatching comparison between a string array and a single string. If the `lhs` is not some flavor + /// of string array, an exception is thrown. + /// + /// + /// + /// + /// + /// + public static BooleanArray Equal(IArrowArray lhs, string rhs, MemoryAllocator? 
allocator = null) + { + switch (lhs.Data.DataType.TypeId) + { + case ArrowTypeId.String: + return Equal((StringArray)lhs, rhs, allocator); + case ArrowTypeId.LargeString: + return Equal((LargeStringArray)lhs, rhs, allocator); + default: + throw new InvalidDataException("Unsupported data type " + lhs.Data.DataType.Name); + } + } + + public static BooleanArray GreaterThan(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var flag = a > rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray GreaterThan(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + var flag = a > b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray LessThan(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var flag = a < rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray LessThan(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + var flag = a < b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var flag = a >= rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + var flag = a >= b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray LessThanOrEqual(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var flag = a <= rhs; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + + public static BooleanArray LessThanOrEqual(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber + { + var cmp = new BooleanArray.Builder(); + if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + var flag = a <= b; + cmp.Append(flag); + } + return cmp.Build(allocator); + } + +} + diff --git a/src/Apache.Arrow.Operations/Conversion.cs b/src/Apache.Arrow.Operations/Conversion.cs new file mode 100644 index 00000000..8714d1fb --- /dev/null +++ b/src/Apache.Arrow.Operations/Conversion.cs @@ -0,0 +1,427 @@ +using System.Numerics; + +using Apache.Arrow; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; + +namespace Apache.Arrow.Operations; + +public class ArrowCompatibilityVisitor : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor +{ + public IArrowArray? Result = null; + + public static IArrowArray Convert(IArrowArray array) + { + var visitor = new ArrowCompatibilityVisitor(); + visitor.Visit(array); + if (visitor.Result == null) throw new InvalidOperationException(); + return visitor.Result; + } + + public StructArray HandleStruct(StructArray array) + { + var dtype = (StructType)array.Data.DataType; + var newFields = new List(); + var newVals = new List(); + int size = 0; + foreach (var (field, arr) in dtype.Fields.Zip(array.Fields)) + { + var visitor = new ArrowCompatibilityVisitor(); + visitor.Visit(arr); + if (visitor.Result == null) throw new InvalidOperationException(); + newFields.Add(new Field(field.Name, visitor.Result.Data.DataType, field.IsNullable)); + newVals.Add(visitor.Result); + if (size != 0 && visitor.Result.Length != 0 && visitor.Result.Length != size) throw new InvalidDataException(); + size = visitor.Result.Length; + } + var result = new StructArray(new StructType(newFields), size, newVals, array.NullBitmapBuffer); + if (result.Fields.Count > 0) { } + return result; + } + + public void Visit(StructArray array) + { + 
Result = HandleStruct(array); + } + + public void Visit(IArrowArray array) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Struct: + { + Visit((StructArray)array); + break; + } + case ArrowTypeId.LargeList: + { + Visit((LargeListArray)array); + break; + } + case ArrowTypeId.LargeString: + { + Visit((LargeStringArray)array); + break; + } + case ArrowTypeId.LargeBinary: + { + Visit((LargeBinaryArray)array); + break; + } + default: + { + Result = array; + break; + } + } + } + + public void Visit(LargeListArray array) + { + ArrowCompatibilityVisitor visitor = new(); + visitor.Visit(array.Values); + var offsetsBuffer = new ArrowBuffer.Builder(); + foreach (var v in array.ValueOffsets) + { + offsetsBuffer.Append((int)v); + } + if (visitor.Result == null) throw new InvalidOperationException(); + Result = new ListArray( + new ListType(((LargeListType)array.Data.DataType).ValueDataType), + array.Length, + offsetsBuffer.Build(), + visitor.Result, + array.NullBitmapBuffer, + array.NullCount, + array.Offset + ); + } + + public void Visit(LargeStringArray array) + { + var offsetsBuffer = new ArrowBuffer.Builder(); + foreach (var v in array.ValueOffsets) + { + offsetsBuffer.Append((int)v); + } + Result = new StringArray( + array.Length, + offsetsBuffer.Build(), + array.ValueBuffer, + array.NullBitmapBuffer, + array.NullCount, + array.Offset + ); + } + + public void Visit(LargeBinaryArray type) + { + throw new NotImplementedException(); + } +} + + +/// +/// Specifies how null values should be handled in aggregate computations. +/// +public enum NullHandling +{ + /// + /// Skip null values when computing the result. + /// Returns null only if the array is empty or all values are null. + /// + Skip, + + /// + /// Propagate null: if any value in the array is null, return null. + /// + Propagate +} + + +/// +/// Copy primitive arrays between types to explicitly known numerical types. When the type already +/// matches, no copy is performed. 
+/// +public static class Conversion +{ + static void NullToZero(PrimitiveArray array, IArrowArrayBuilder, TBuilder> accumulator) + where T : struct, INumber where TBuilder : IArrowArrayBuilder> + { + accumulator.Reserve(array.Length); + foreach (var value in array) + { + accumulator.Append(value == null ? T.Zero : (T)value); + } + } + + public static Array NullToZero(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + { + var builder = new DoubleArray.Builder(); + NullToZero((DoubleArray)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.Float: + { + var builder = new FloatArray.Builder(); + NullToZero((FloatArray)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.Int32: + { + var builder = new Int32Array.Builder(); + NullToZero((Int32Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.Int64: + { + var builder = new Int64Array.Builder(); + NullToZero((Int64Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.UInt32: + { + var builder = new UInt32Array.Builder(); + NullToZero((UInt32Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.UInt64: + { + var builder = new UInt64Array.Builder(); + NullToZero((UInt64Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.Int16: + { + var builder = new Int16Array.Builder(); + NullToZero((Int16Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.Int8: + { + var builder = new Int8Array.Builder(); + NullToZero((Int8Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + case ArrowTypeId.UInt16: + { + var builder = new UInt16Array.Builder(); + NullToZero((UInt16Array)(IArrowArray)array, builder); + return builder.Build(allocator); 
+ } + case ArrowTypeId.UInt8: + { + var builder = new UInt8Array.Builder(); + NullToZero((UInt8Array)(IArrowArray)array, builder); + return builder.Build(allocator); + } + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static Int64Array CastInt64(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int64Array.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(long.CreateChecked((T)val)); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + + public static Int32Array CastInt32(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int32Array.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(int.CreateChecked((T)val)); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + + public static FloatArray CastFloat(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new FloatArray.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(float.CreateChecked((T)val)); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + + public static DoubleArray CastDouble(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new DoubleArray.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(double.CreateChecked((T)val)); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + + public static DoubleArray CastDouble(IList array, MemoryAllocator? 
allocator = null) where T : struct, INumber + { + var builder = new DoubleArray.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(double.CreateChecked(val)); + return builder.Build(allocator); + } + + public static FloatArray CastFloat(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new FloatArray.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(float.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int32Array CastInt32(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int32Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(int.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int64Array CastInt64(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int64Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(long.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int64Array CastInt64(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastInt64((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastInt64((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastInt64((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return (Int64Array)array; + case ArrowTypeId.UInt32: + return CastInt64((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastInt64((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastInt64((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastInt64((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastInt64((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastInt64((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static Int32Array CastInt32(IArrowArray array, MemoryAllocator? allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastInt32((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastInt32((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return (Int32Array)array; + case ArrowTypeId.Int64: + return CastInt32((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastInt32((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastInt32((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastInt32((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastInt32((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastInt32((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastInt32((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static FloatArray CastFloat(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastFloat((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return (FloatArray)array; + case ArrowTypeId.Int32: + return CastFloat((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastFloat((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastFloat((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastFloat((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastFloat((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastFloat((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastFloat((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastFloat((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static DoubleArray CastDouble(IArrowArray array, MemoryAllocator? allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return (DoubleArray)array; + case ArrowTypeId.Float: + return CastDouble((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastDouble((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastDouble((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastDouble((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastDouble((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastDouble((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastDouble((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastDouble((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastDouble((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } +} + diff --git a/src/Apache.Arrow.Operations/Select.cs 
b/src/Apache.Arrow.Operations/Select.cs new file mode 100644 index 00000000..468c3897 --- /dev/null +++ b/src/Apache.Arrow.Operations/Select.cs @@ -0,0 +1,687 @@ +using System.Numerics; + +using Apache.Arrow; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; + +namespace Apache.Arrow.Operations; + +public static class Select +{ + /// + /// Returns a copy of the positions in the array where the mask is true. All other values in the array will be + /// excluded. + /// + /// This internally reduces to building a true-value run index map and calling `Take` + /// + /// The array to select from + /// The mask defining which values to keep or exclude + /// The memory allocator to build the new array from + /// + /// If the mask and the array are not of equal size + public static Array Filter(Array array, BooleanArray mask, MemoryAllocator? allocator = null) + { + if (array.Length != mask.Length) throw new InvalidOperationException("Array and mask must have the same length"); + List<(int, int)> spans = new(); + int? start = null; + for (int i = 0; i < mask.Length; i++) + { + var v = mask.GetValue(i); + if (v != null && (bool)v) + { + if (start != null) { } + else start = i; + } + else if (v != null && !(bool)v) + { + if (start != null) + { + // Slices in Take include the trailing index + spans.Add(((int)start, i - 1)); + start = null; + } + else { } + } + } + if (start != null) + { + spans.Add(((int)start, mask.Length - 1)); + } + return Take(array, spans, allocator); + } + + /// + /// Returns a copy of the positions in the array included in the provided start-end spans. All other values in the array will be + /// excluded. + /// + /// The array to select from + /// The index ranges to select + /// The memory allocator to build the new array from + /// + /// + public static Array Take(Array array, IList<(int, int)> spans, MemoryAllocator? 
allocator = null) + { + if (spans.Count == 0) + { + return array.Slice(0, 0); + } + List chunks = new(); + foreach (var (start, end) in spans) + { + if (end < start || end < 0 || start < 0) throw new InvalidOperationException(string.Format("Invalid span: {0} {1}", start, end)); + chunks.Add(array.Slice(start, end - start + 1)); + } + return (Array)ArrowArrayConcatenator.Concatenate(chunks, allocator); + } + + /// + /// Returns a copy of the positions in the array included in the provided indices list. All other values in the array will be + /// excluded. + /// + /// The array to select from + /// The indices to select + /// The memory allocator to build the new array from + /// + /// + public static Array Take(Array array, IList indices, MemoryAllocator? allocator = null) + { + if (indices.Count == 0) + { + return array.Slice(0, 0); + } + List chunks = new(); + for (var i = 0; i < indices.Count; i++) + { + chunks.Add(array.Slice(indices[i], 1)); + } + return (Array)ArrowArrayConcatenator.Concatenate(chunks, allocator); + } + + /// + /// Apply `Take` to each array in `batch` using the same `indices` + /// + /// + /// + /// + /// + public static List Take(List batch, IList indices, MemoryAllocator? allocator = null) + { + return batch.Select(arr => Take(arr, indices, allocator)).ToList(); + } + + /// + /// Apply `Filter` to each array in `batch` using the same `mask` + /// + /// + /// + /// + /// + public static List Filter(List batch, BooleanArray mask, MemoryAllocator? allocator = null) + { + return batch.Select(arr => Filter(arr, mask, allocator)).ToList(); + } + + /// + /// Apply `Take` to each array in `batch` using the same `indices` + /// + /// + /// + /// + /// + public static Dictionary Take(Dictionary batch, IList indices, MemoryAllocator? 
allocator = null) where T : notnull + { + Dictionary result = new(); + foreach (var kv in batch) + { + result[kv.Key] = Take(kv.Value, indices, allocator); + } + return result; + } + + /// + /// Apply `Filter` to each array in `batch` using the same `mask` + /// + /// + /// + /// + /// + public static Dictionary Filter(Dictionary batch, BooleanArray mask, MemoryAllocator? allocator = null) where T : notnull + { + Dictionary result = new(); + foreach (var kv in batch) + { + result[kv.Key] = Filter(kv.Value, mask, allocator); + } + return result; + } + + /// + /// Apply `Filter` to each array in `batch` using the same `mask` + /// + /// + /// + /// + /// + public static RecordBatch Filter(RecordBatch batch, BooleanArray mask, MemoryAllocator? allocator = null) + { + if (batch.Length != mask.Length) throw new InvalidOperationException("Array and mask must have the same length"); + List<(int, int)> spans = new(); + int? start = null; + for (int i = 0; i < mask.Length; i++) + { + var v = mask.GetValue(i); + if (v != null && (bool)v) + { + if (start != null) { } + else start = i; + } + else if (v != null && !(bool)v) + { + if (start != null) + { + // Slices in Take include the trailing index + spans.Add(((int)start, i - 1)); + start = null; + } + else { } + } + } + if (start != null) + { + spans.Add(((int)start, mask.Length - 1)); + } + return Take(batch, spans, allocator); + } + + /// + /// Apply `Take` to each array in `batch` using the same `indices` + /// + /// + /// + /// + /// + public static RecordBatch Take(RecordBatch batch, IList<(int, int)> spans, MemoryAllocator? 
allocator = null) + { + if (spans.Count == 0) + { + return batch.Slice(0, 0); + } + List columns = new(); + var size = 0; + foreach (var col in batch.Arrays) + { + columns.Add(Take((Array)col, spans, allocator)); + size = columns.Last().Length; + } + return new RecordBatch(batch.Schema, columns, size); + } + + /// + /// Apply `Take` to each array in `batch` using the same `indices` + /// + /// + /// + /// + /// + public static RecordBatch Take(RecordBatch batch, IList indices, MemoryAllocator? allocator = null) + { + var spans = IndicesToSpans(indices); + return Take(batch, spans, allocator); + } + + /// + /// Convert a list of indices into a list of index start-end spans for ease-of selection + /// + /// + /// + /// + public static List<(T, T)> IndicesToSpans(IList indices) where T : struct, INumber + { + List<(T, T)> acc = new(); + T? start = null; + T? last = null; + foreach (var i in indices) + { + if (last == null) + { + start = i; + last = i; + } + else + { + if (i - last == T.One) + { + last = i; + } + else if (start != null) + { + acc.Add(((T)start, (T)last)); + start = i; + last = i; + } + } + } + if (start != null && last != null) + { + acc.Add(((T)start, indices.Last())); + } + return acc; + } +} + + +public static class Aggregate +{ + + /// + /// Returns the minimum value in the array. + /// + /// The numeric type of array elements. + /// The input array. + /// How to handle null values. + /// The minimum value, or null if the array is empty, all values are null, + /// or nullHandling is Propagate and any null exists. + public static T? Min(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + where T : struct, INumber + { + if (array.Length == 0) + return null; + + T? 
min = null; + for (int i = 0; i < array.Length; i++) + { + var value = array.GetValue(i); + if (value == null) + { + if (nullHandling == NullHandling.Propagate) + return null; + continue; + } + + if (min == null || (T)value < min) + min = value; + } + return min; + } + + /// + /// Returns the minimum value in the array. + /// + /// The input array. + /// How to handle null values. + /// The minimum value, or null if the array is empty, all values are null, + /// or nullHandling is Propagate and any null exists. + public static double? Min(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return Min((DoubleArray)array, nullHandling); + case ArrowTypeId.Float: + return Min((FloatArray)array, nullHandling); + case ArrowTypeId.Int32: + return Min((Int32Array)array, nullHandling); + case ArrowTypeId.Int64: + return Min((Int64Array)array, nullHandling); + case ArrowTypeId.UInt32: + return Min((UInt32Array)array, nullHandling); + case ArrowTypeId.UInt64: + return Min((UInt64Array)array, nullHandling); + case ArrowTypeId.Int16: + return Min((Int16Array)array, nullHandling); + case ArrowTypeId.Int8: + return Min((Int8Array)array, nullHandling); + case ArrowTypeId.UInt16: + return Min((UInt16Array)array, nullHandling); + case ArrowTypeId.UInt8: + return Min((UInt8Array)array, nullHandling); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + /// + /// Returns the maximum value in the array. + /// + /// The numeric type of array elements. + /// The input array. + /// How to handle null values. + /// The maximum value, or null if the array is empty, all values are null, + /// or nullHandling is Propagate and any null exists. + public static T? Max(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + where T : struct, INumber + { + if (array.Length == 0) + return null; + + T? 
/// <summary>
/// Find the largest non-null value in <paramref name="array"/>.
/// </summary>
/// <typeparam name="T">The numeric type of array elements.</typeparam>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">
/// With <see cref="NullHandling.Propagate"/> the first null slot makes the result null;
/// otherwise null slots are ignored.
/// </param>
/// <returns>The maximum, or null when no value was seen or a null was propagated.</returns>
public static T? Max<T>(PrimitiveArray<T> array, NullHandling nullHandling = NullHandling.Skip)
    where T : struct, INumber<T>
{
    T best = default;
    bool found = false;
    int length = array.Length;
    for (int index = 0; index < length; index++)
    {
        if (array.GetValue(index) is not T current)
        {
            if (nullHandling == NullHandling.Propagate)
            {
                return null;
            }
            continue;
        }

        // Strict comparison: the earliest of several equal maxima is kept.
        if (!found || current > best)
        {
            best = current;
            found = true;
        }
    }
    return found ? best : null;
}

/// <summary>
/// Find the largest non-null value of a numeric array, dispatching on the array's
/// <see cref="ArrowTypeId"/> and widening the result to <c>double?</c>.
/// </summary>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">How to handle null values.</param>
/// <returns>The maximum as a double, or null as described on the generic overload.</returns>
/// <exception cref="InvalidDataException">The array's type id is not one of the listed numeric ids.</exception>
public static double? Max(IArrowArray array, NullHandling nullHandling = NullHandling.Skip)
{
    return array.Data.DataType.TypeId switch
    {
        ArrowTypeId.Double => Max((DoubleArray)array, nullHandling),
        ArrowTypeId.Float => Max((FloatArray)array, nullHandling),
        ArrowTypeId.Int32 => Max((Int32Array)array, nullHandling),
        ArrowTypeId.Int64 => Max((Int64Array)array, nullHandling),
        ArrowTypeId.UInt32 => Max((UInt32Array)array, nullHandling),
        ArrowTypeId.UInt64 => Max((UInt64Array)array, nullHandling),
        ArrowTypeId.Int16 => Max((Int16Array)array, nullHandling),
        ArrowTypeId.Int8 => Max((Int8Array)array, nullHandling),
        ArrowTypeId.UInt16 => Max((UInt16Array)array, nullHandling),
        ArrowTypeId.UInt8 => Max((UInt8Array)array, nullHandling),
        _ => throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name),
    };
}
/// <summary>
/// Locate the position of the smallest non-null value in <paramref name="array"/>.
/// When the minimum occurs more than once, the lowest index is returned.
/// </summary>
/// <typeparam name="T">The numeric type of array elements.</typeparam>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">
/// With <see cref="NullHandling.Propagate"/> the first null slot makes the result null;
/// otherwise null slots are ignored.
/// </param>
/// <returns>The index of the minimum, or null when no value was seen or a null was propagated.</returns>
public static int? ArgMin<T>(PrimitiveArray<T> array, NullHandling nullHandling = NullHandling.Skip)
    where T : struct, INumber<T>
{
    T best = default;
    int? bestIndex = null;
    int length = array.Length;
    for (int index = 0; index < length; index++)
    {
        if (array.GetValue(index) is not T current)
        {
            if (nullHandling == NullHandling.Propagate)
            {
                return null;
            }
            continue;
        }

        // bestIndex doubles as the "have we seen a value yet" flag.
        if (bestIndex is null || current < best)
        {
            best = current;
            bestIndex = index;
        }
    }
    return bestIndex;
}

/// <summary>
/// Locate the position of the smallest non-null value of a numeric array, dispatching on
/// the array's <see cref="ArrowTypeId"/>.
/// </summary>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">How to handle null values.</param>
/// <returns>The index of the minimum, or null as described on the generic overload.</returns>
/// <exception cref="InvalidDataException">The array's type id is not one of the listed numeric ids.</exception>
public static int? ArgMin(IArrowArray array, NullHandling nullHandling = NullHandling.Skip)
{
    return array.Data.DataType.TypeId switch
    {
        ArrowTypeId.Double => ArgMin((DoubleArray)array, nullHandling),
        ArrowTypeId.Float => ArgMin((FloatArray)array, nullHandling),
        ArrowTypeId.Int32 => ArgMin((Int32Array)array, nullHandling),
        ArrowTypeId.Int64 => ArgMin((Int64Array)array, nullHandling),
        ArrowTypeId.UInt32 => ArgMin((UInt32Array)array, nullHandling),
        ArrowTypeId.UInt64 => ArgMin((UInt64Array)array, nullHandling),
        ArrowTypeId.Int16 => ArgMin((Int16Array)array, nullHandling),
        ArrowTypeId.Int8 => ArgMin((Int8Array)array, nullHandling),
        ArrowTypeId.UInt16 => ArgMin((UInt16Array)array, nullHandling),
        ArrowTypeId.UInt8 => ArgMin((UInt8Array)array, nullHandling),
        _ => throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name),
    };
}
/// <summary>
/// Locate the position of the largest non-null value in <paramref name="array"/>.
/// When the maximum occurs more than once, the lowest index is returned.
/// </summary>
/// <typeparam name="T">The numeric type of array elements.</typeparam>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">
/// With <see cref="NullHandling.Propagate"/> the first null slot makes the result null;
/// otherwise null slots are ignored.
/// </param>
/// <returns>The index of the maximum, or null when no value was seen or a null was propagated.</returns>
public static int? ArgMax<T>(PrimitiveArray<T> array, NullHandling nullHandling = NullHandling.Skip)
    where T : struct, INumber<T>
{
    T best = default;
    int? bestIndex = null;
    int length = array.Length;
    for (int index = 0; index < length; index++)
    {
        if (array.GetValue(index) is not T current)
        {
            if (nullHandling == NullHandling.Propagate)
            {
                return null;
            }
            continue;
        }

        // bestIndex doubles as the "have we seen a value yet" flag.
        if (bestIndex is null || current > best)
        {
            best = current;
            bestIndex = index;
        }
    }
    return bestIndex;
}

/// <summary>
/// Locate the position of the largest non-null value of a numeric array, dispatching on
/// the array's <see cref="ArrowTypeId"/>.
/// </summary>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">How to handle null values.</param>
/// <returns>The index of the maximum, or null as described on the generic overload.</returns>
/// <exception cref="InvalidDataException">The array's type id is not one of the listed numeric ids.</exception>
public static int? ArgMax(IArrowArray array, NullHandling nullHandling = NullHandling.Skip)
{
    return array.Data.DataType.TypeId switch
    {
        ArrowTypeId.Double => ArgMax((DoubleArray)array, nullHandling),
        ArrowTypeId.Float => ArgMax((FloatArray)array, nullHandling),
        ArrowTypeId.Int32 => ArgMax((Int32Array)array, nullHandling),
        ArrowTypeId.Int64 => ArgMax((Int64Array)array, nullHandling),
        ArrowTypeId.UInt32 => ArgMax((UInt32Array)array, nullHandling),
        ArrowTypeId.UInt64 => ArgMax((UInt64Array)array, nullHandling),
        ArrowTypeId.Int16 => ArgMax((Int16Array)array, nullHandling),
        ArrowTypeId.Int8 => ArgMax((Int8Array)array, nullHandling),
        ArrowTypeId.UInt16 => ArgMax((UInt16Array)array, nullHandling),
        ArrowTypeId.UInt8 => ArgMax((UInt8Array)array, nullHandling),
        _ => throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name),
    };
}
/// <summary>
/// Add up every non-null value in <paramref name="array"/>. The running total is kept in
/// <typeparamref name="T"/> itself.
/// </summary>
/// <remarks>
/// NOTE(review): because the total is a <typeparamref name="T"/>, narrow integer types can
/// presumably exceed their range on long inputs - confirm whether widening is wanted.
/// </remarks>
/// <typeparam name="T">The numeric type of array elements.</typeparam>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">
/// With <see cref="NullHandling.Propagate"/> the first null slot makes the result null;
/// otherwise null slots are ignored.
/// </param>
/// <returns>The sum, or null when no value was seen or a null was propagated.</returns>
public static T? Sum<T>(PrimitiveArray<T> array, NullHandling nullHandling = NullHandling.Skip)
    where T : struct, INumber<T>
{
    T total = T.Zero;
    bool sawValue = false;
    int length = array.Length;
    for (int index = 0; index < length; index++)
    {
        if (array.GetValue(index) is T current)
        {
            total += current;
            sawValue = true;
        }
        else if (nullHandling == NullHandling.Propagate)
        {
            return null;
        }
    }
    return sawValue ? total : null;
}

/// <summary>
/// Add up every non-null value of a numeric array, dispatching on the array's
/// <see cref="ArrowTypeId"/>. The sum is computed in the element type and then widened to
/// <c>double?</c>.
/// </summary>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">How to handle null values.</param>
/// <returns>The sum as a double, or null as described on the generic overload.</returns>
/// <exception cref="InvalidDataException">The array's type id is not one of the listed numeric ids.</exception>
public static double? Sum(IArrowArray array, NullHandling nullHandling = NullHandling.Skip)
{
    return array.Data.DataType.TypeId switch
    {
        ArrowTypeId.Double => Sum((DoubleArray)array, nullHandling),
        ArrowTypeId.Float => Sum((FloatArray)array, nullHandling),
        ArrowTypeId.Int32 => Sum((Int32Array)array, nullHandling),
        ArrowTypeId.Int64 => Sum((Int64Array)array, nullHandling),
        ArrowTypeId.UInt32 => Sum((UInt32Array)array, nullHandling),
        ArrowTypeId.UInt64 => Sum((UInt64Array)array, nullHandling),
        ArrowTypeId.Int16 => Sum((Int16Array)array, nullHandling),
        ArrowTypeId.Int8 => Sum((Int8Array)array, nullHandling),
        ArrowTypeId.UInt16 => Sum((UInt16Array)array, nullHandling),
        ArrowTypeId.UInt8 => Sum((UInt8Array)array, nullHandling),
        _ => throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name),
    };
}
/// <summary>
/// Compute the arithmetic mean of every non-null value in <paramref name="array"/>.
/// </summary>
/// <remarks>
/// Each element is converted to <see cref="double"/> before it is added, so the running
/// total cannot wrap around for narrow integer element types (for example two
/// <see cref="sbyte"/> values of 100 average to 100, not to a wrapped negative number).
/// </remarks>
/// <typeparam name="T">The numeric type of array elements.</typeparam>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">
/// With <see cref="NullHandling.Propagate"/> the first null slot makes the result null;
/// otherwise null slots are ignored and do not count towards the divisor.
/// </param>
/// <returns>The mean as a double, or null when no value was seen or a null was propagated.</returns>
public static double? Mean<T>(PrimitiveArray<T> array, NullHandling nullHandling = NullHandling.Skip)
    where T : struct, INumber<T>
{
    double sum = 0.0;
    long count = 0;
    for (int i = 0; i < array.Length; i++)
    {
        var value = array.GetValue(i);
        if (value == null)
        {
            if (nullHandling == NullHandling.Propagate)
            {
                return null;
            }
            continue;
        }

        // Widen before accumulating; summing in T would overflow small integer types.
        sum += double.CreateChecked((T)value);
        count++;
    }
    return count > 0 ? sum / count : null;
}

/// <summary>
/// Compute the arithmetic mean of a numeric array, dispatching on the array's
/// <see cref="ArrowTypeId"/>.
/// </summary>
/// <param name="array">The input array.</param>
/// <param name="nullHandling">How to handle null values.</param>
/// <returns>The mean as a double, or null as described on the generic overload.</returns>
/// <exception cref="InvalidDataException">The array's type id is not one of the listed numeric ids.</exception>
public static double? Mean(IArrowArray array, NullHandling nullHandling = NullHandling.Skip)
{
    switch (array.Data.DataType.TypeId)
    {
        case ArrowTypeId.Double:
            return Mean((DoubleArray)array, nullHandling);
        case ArrowTypeId.Float:
            return Mean((FloatArray)array, nullHandling);
        case ArrowTypeId.Int32:
            return Mean((Int32Array)array, nullHandling);
        case ArrowTypeId.Int64:
            return Mean((Int64Array)array, nullHandling);
        case ArrowTypeId.UInt32:
            return Mean((UInt32Array)array, nullHandling);
        case ArrowTypeId.UInt64:
            return Mean((UInt64Array)array, nullHandling);
        case ArrowTypeId.Int16:
            return Mean((Int16Array)array, nullHandling);
        case ArrowTypeId.Int8:
            return Mean((Int8Array)array, nullHandling);
        case ArrowTypeId.UInt16:
            return Mean((UInt16Array)array, nullHandling);
        case ArrowTypeId.UInt8:
            return Mean((UInt8Array)array, nullHandling);
        default:
            throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name);
    }
}
/// <summary>
/// Write a bracketed, one-value-per-line rendering of <paramref name="array"/> to
/// <paramref name="stream"/>. List elements and struct fields are rendered by recursive
/// calls with <paramref name="indent"/> increased by one.
/// </summary>
/// <param name="array">The array to render.</param>
/// <param name="stream">Destination writer; it is neither flushed nor closed here.</param>
/// <param name="indent">How many copies of <paramref name="indenter"/> prefix the brackets.</param>
/// <param name="indenter">The text repeated to form one level of indentation.</param>
/// <exception cref="NotImplementedException">
/// The array's type id has no case below. The opening bracket has already been written
/// when this is thrown.
/// </exception>
public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int indent = 0, string indenter = " ")
{
    string indentString = string.Concat(Enumerable.Repeat(indenter, indent));
    string pad = indentString + indenter;

    // Every primitive/boolean case prints each slot on its own padded line;
    // a null slot formats as an empty string, leaving only the padding.
    void WriteValues<T>(IEnumerable<T?> values) where T : struct
    {
        foreach (T? v in values)
        {
            stream.WriteLine($"{pad}{v}");
        }
    }

    stream.WriteLine($"{indentString}[");
    switch (array.Data.DataType.TypeId)
    {
        case ArrowTypeId.Float:
            WriteValues<float>((FloatArray)array);
            break;
        case ArrowTypeId.Double:
            WriteValues<double>((DoubleArray)array);
            break;
        case ArrowTypeId.Int32:
            WriteValues<int>((Int32Array)array);
            break;
        case ArrowTypeId.Int64:
            WriteValues<long>((Int64Array)array);
            break;
        case ArrowTypeId.Int16:
            WriteValues<short>((Int16Array)array);
            break;
        case ArrowTypeId.Int8:
            WriteValues<sbyte>((Int8Array)array);
            break;
        case ArrowTypeId.UInt8:
            WriteValues<byte>((UInt8Array)array);
            break;
        case ArrowTypeId.UInt16:
            WriteValues<ushort>((UInt16Array)array);
            break;
        case ArrowTypeId.UInt32:
            WriteValues<uint>((UInt32Array)array);
            break;
        case ArrowTypeId.UInt64:
            WriteValues<ulong>((UInt64Array)array);
            break;
        case ArrowTypeId.Boolean:
            WriteValues<bool>((BooleanArray)array);
            break;
        case ArrowTypeId.HalfFloat:
            WriteValues<Half>((HalfFloatArray)array);
            break;
        case ArrowTypeId.List:
        {
            var lists = (ListArray)array;
            for (var row = 0; row < lists.Length; row++)
            {
                if (!lists.IsNull(row))
                {
                    PrettyPrintFormat(lists.GetSlicedValues(row), stream, indent + 1, indenter);
                    continue;
                }
                // A null list is shown as a line holding only the padding.
                stream.WriteLine(pad);
            }
            break;
        }
        case ArrowTypeId.String:
        {
            var strings = (StringArray)array;
            for (var row = 0; row < strings.Length; row++)
            {
                // A null string is shown as a line holding only the padding.
                stream.WriteLine(strings.IsNull(row) ? pad : $"{pad}{strings.GetString(row)}");
            }
            break;
        }
        case ArrowTypeId.Struct:
        {
            var structType = (StructType)array.Data.DataType;
            var structArray = (StructArray)array;
            foreach (var (field, column) in structType.Fields.Zip(structArray.Fields))
            {
                // Field header sits at the bracket's indentation, the column one level deeper.
                stream.WriteLine($"{indentString}{field.Name}: {field.DataType.Name}");
                PrettyPrintFormat(column, stream, indent + 1, indenter);
            }
            break;
        }
        default:
            throw new NotImplementedException($"{array.Data.DataType.Name}");
    }
    stream.WriteLine($"{indentString}]");
}

/// <summary>
/// Render <paramref name="array"/> through the writer-based overload into an in-memory
/// buffer and return the buffered text.
/// </summary>
/// <param name="array">The array to render.</param>
/// <param name="indent">How many copies of <paramref name="indenter"/> prefix the brackets.</param>
/// <param name="indenter">The text repeated to form one level of indentation.</param>
/// <returns>Everything the writer-based overload wrote.</returns>
public static string PrettyPrintFormat(IArrowArray array, int indent = 0, string indenter = " ")
{
    using var bufferStream = new MemoryStream();
    var writer = new StreamWriter(bufferStream);
    PrettyPrintFormat(array, writer, indent, indenter);

    // Push buffered text into the stream, then rewind so it can be read back.
    writer.Flush();
    bufferStream.Seek(0, SeekOrigin.Begin);
    return new StreamReader(bufferStream).ReadToEnd();
}
/// <summary>
/// Render <paramref name="array"/> with <c>PrettyPrintFormat</c> and write the resulting
/// text to standard output via <see cref="Console.WriteLine(string)"/>. Call
/// <c>PrettyPrintFormat</c> directly to choose a different destination.
/// </summary>
/// <param name="array">The array to render.</param>
/// <param name="indent">How many copies of <paramref name="indenter"/> prefix the brackets.</param>
/// <param name="indenter">The text repeated to form one level of indentation.</param>
public static void PrettyPrint(IArrowArray array, int indent = 0, string indenter = " ")
    => Console.WriteLine(PrettyPrintFormat(array, indent, indenter));
/// <summary>
/// Smoke tests for the conversion, selection and filtering helpers.
/// </summary>
public class ArrowOperationsTests
{
    /// <summary>
    /// Casting a list of longs to double, then to float and to int32, keeps each value.
    /// </summary>
    [Fact]
    public void TestConversion()
    {
        // xUnit's Assert.Equal takes (expected, actual); keeping that order makes
        // failure messages report the right side as "Expected".
        var vals = Conversion.CastDouble([50L, 52L, 510L]);
        Assert.Equal(50.0, vals.GetValue(0));
        Assert.Equal(52.0, vals.GetValue(1));
        Assert.Equal(510.0, vals.GetValue(2));

        var valsF = Conversion.CastFloat(vals);
        Assert.Equal(50.0f, valsF.GetValue(0));
        Assert.Equal(52.0f, valsF.GetValue(1));
        Assert.Equal(510.0f, valsF.GetValue(2));

        var valsI = Conversion.CastInt32(vals);
        Assert.Equal(50, valsI.GetValue(0));
        Assert.Equal(52, valsI.GetValue(1));
        Assert.Equal(510, valsI.GetValue(2));
    }

    /// <summary>
    /// Taking index 1 returns the second element.
    /// </summary>
    [Fact]
    public void TestSelectionTakeIndex()
    {
        var vals = Conversion.CastInt64([50L, 52L, 510L]);
        var items = (Int64Array)Select.Take(vals, [1]);
        Assert.Equal(52, items.GetValue(0));
    }

    /// <summary>
    /// Filtering with an equality mask keeps the matching element.
    /// </summary>
    [Fact]
    public void TestSelectionFilterMask()
    {
        var vals = Conversion.CastInt64([50L, 52L, 510L]);
        var mask = Comparison.Equal(vals, 52L);
        var items = (Int64Array)Select.Filter(vals, mask);
        Assert.Equal(52, items.GetValue(0));
    }
}
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + using System; using System.Numerics; using Apache.Arrow; diff --git a/src/Apache.Arrow.Operations/Conversion.cs b/src/Apache.Arrow.Operations/Conversion.cs index 8714d1fb..b4601bba 100644 --- a/src/Apache.Arrow.Operations/Conversion.cs +++ b/src/Apache.Arrow.Operations/Conversion.cs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + using System.Numerics; using Apache.Arrow; diff --git a/src/Apache.Arrow.Operations/Select.cs b/src/Apache.Arrow.Operations/Select.cs index 468c3897..6741dc78 100644 --- a/src/Apache.Arrow.Operations/Select.cs +++ b/src/Apache.Arrow.Operations/Select.cs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + using System.Numerics; using Apache.Arrow; diff --git a/src/Apache.Arrow.Operations/Text.cs b/src/Apache.Arrow.Operations/Text.cs index 8b774968..45d263b8 100644 --- a/src/Apache.Arrow.Operations/Text.cs +++ b/src/Apache.Arrow.Operations/Text.cs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
/// <summary>
/// Create a builder whose value bitmap and validity bitmap are both constructed with
/// the given <paramref name="capacity"/>.
/// </summary>
/// <param name="capacity">Capacity handed to each <see cref="ArrowBuffer.BitmapBuilder"/>.</param>
public Builder(int capacity)
{
    ValidityBuffer = new ArrowBuffer.BitmapBuilder(capacity);
    ValueBuffer = new ArrowBuffer.BitmapBuilder(capacity);
}
/// <summary>
/// Enumerate the array one slot at a time, yielding <c>GetSlicedValues(index)</c> for
/// every index from 0 up to <c>Length</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the generic argument is not visible in this view of the patch;
/// <c>IArrowArray</c> is assumed from the value being yielded - confirm.
/// </remarks>
IEnumerator<IArrowArray> IEnumerable<IArrowArray>.GetEnumerator()
{
    int index = 0;
    while (index < Length)
    {
        yield return GetSlicedValues(index);
        index++;
    }
}

/// <summary>
/// Non-generic enumeration; forwards to the generic enumerator.
/// </summary>
IEnumerator IEnumerable.GetEnumerator()
{
    return ((IEnumerable<IArrowArray>)this).GetEnumerator();
}
/// <summary>
/// Enumerate the list-view array one slot at a time. A slot for which <c>IsNull</c> is
/// true yields null; every other slot yields <c>GetSlicedValues(index)</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the generic argument is not visible in this view of the patch;
/// <c>IArrowArray</c> is assumed from the value being yielded - confirm.
/// </remarks>
IEnumerator<IArrowArray> IEnumerable<IArrowArray>.GetEnumerator()
{
    int index = 0;
    while (index < Length)
    {
        yield return IsNull(index) ? null : GetSlicedValues(index);
        index++;
    }
}

/// <summary>
/// Non-generic enumeration; forwards to the generic enumerator.
/// </summary>
IEnumerator IEnumerable.GetEnumerator()
{
    return ((IEnumerable<IArrowArray>)this).GetEnumerator();
}
/// <summary>
/// Produce a new buffer in which every byte of <paramref name="buffer"/> is bitwise
/// inverted. Bytes are processed in 64-, 32-, 16- and 8-byte vector chunks, largest
/// first, and any remaining bytes one at a time.
/// </summary>
/// <param name="buffer">The buffer to invert; it is only read.</param>
/// <returns>The buffer built from the inverted bytes.</returns>
public static ArrowBuffer OnesComplement(ArrowBuffer buffer)
{
    var input = buffer.Span;
    var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8);
    var output = builder.Span;
    int total = input.Length;
    int pos = 0;

    // After this loop fewer than 64 bytes remain, so each narrower width
    // below can apply at most once.
    for (; total - pos >= 64; pos += 64)
    {
        Vector512<byte> chunk = Vector512.Create(input.Slice(pos, 64));
        (~chunk).CopyTo(output.Slice(pos, 64));
    }
    if (total - pos >= 32)
    {
        Vector256<byte> chunk = Vector256.Create(input.Slice(pos, 32));
        (~chunk).CopyTo(output.Slice(pos, 32));
        pos += 32;
    }
    if (total - pos >= 16)
    {
        Vector128<byte> chunk = Vector128.Create(input.Slice(pos, 16));
        (~chunk).CopyTo(output.Slice(pos, 16));
        pos += 16;
    }
    if (total - pos >= 8)
    {
        Vector64<byte> chunk = Vector64.Create(input.Slice(pos, 8));
        (~chunk).CopyTo(output.Slice(pos, 8));
        pos += 8;
    }

    // Scalar tail: fewer than 8 bytes left.
    for (; pos < total; pos++)
    {
        output[pos] = (byte)~input[pos];
    }
    return builder.Build();
}
static ArrowBuffer And(ArrowBuffer buffer, ArrowBuffer buffer2) + { + var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); + var store = builder.Span; + int offset = 0; + int size = buffer.Span.Length; + + while ((size - offset) >= 8) + { + if ((size - offset) >= 64) + { + var part = buffer.Span.Slice(offset, 64); + Vector512 vector = Vector512.Create(part); + part = buffer2.Span.Slice(offset, 64); + Vector512 vector2 = Vector512.Create(part); + vector = vector & vector2; + vector.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + else if ((size - offset) >= 32) + { + var part = buffer.Span.Slice(offset, 32); + Vector256 vector = Vector256.Create(part); + part = buffer2.Span.Slice(offset, 32); + Vector256 vector2 = Vector256.Create(part); + vector = vector & vector2; + vector.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + else if ((size - offset) >= 16) + { + var part = buffer.Span.Slice(offset, 16); + Vector128 vector = Vector128.Create(part); + part = buffer2.Span.Slice(offset, 16); + Vector128 vector2 = Vector128.Create(part); + vector = vector & vector2; + vector = Vector128.OnesComplement(vector); + vector.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + else if ((size - offset) >= 8) + { + var part = buffer.Span.Slice(offset, 8); + Vector64 vector = Vector64.Create(part); + part = buffer2.Span.Slice(offset, 8); + Vector64 vector2 = Vector64.Create(part); + vector = vector & vector2; + vector.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + else break; + } + + for (var i = offset; i < size; i++) + { + store[i] = (byte)(buffer.Span[i] & buffer2.Span[i]); + } + return builder.Build(); + } + + public static ArrowBuffer Or(ArrowBuffer buffer, ArrowBuffer buffer2) + { + var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); + var store = builder.Span; + int offset = 0; + int size = buffer.Span.Length; + + while ((size - offset) >= 8) + { + if ((size - offset) >= 64) + { + var part = buffer.Span.Slice(offset, 64); + 
Vector512 vector = Vector512.Create(part); + part = buffer2.Span.Slice(offset, 64); + Vector512 vector2 = Vector512.Create(part); + vector = vector | vector2; + vector.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + else if ((size - offset) >= 32) + { + var part = buffer.Span.Slice(offset, 32); + Vector256 vector = Vector256.Create(part); + part = buffer2.Span.Slice(offset, 32); + Vector256 vector2 = Vector256.Create(part); + vector = vector | vector2; + vector.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + else if ((size - offset) >= 16) + { + var part = buffer.Span.Slice(offset, 16); + Vector128 vector = Vector128.Create(part); + part = buffer2.Span.Slice(offset, 16); + Vector128 vector2 = Vector128.Create(part); + vector = vector | vector2; + vector = Vector128.OnesComplement(vector); + vector.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + else if ((size - offset) >= 8) + { + var part = buffer.Span.Slice(offset, 8); + Vector64 vector = Vector64.Create(part); + part = buffer2.Span.Slice(offset, 8); + Vector64 vector2 = Vector64.Create(part); + vector = vector | vector2; + vector.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + else break; + } + + for (var i = offset; i < size; i++) + { + store[i] = (byte)(buffer.Span[i] | buffer2.Span[i]); + } + return builder.Build(); + } + + public static ArrowBuffer Xor(ArrowBuffer buffer, ArrowBuffer buffer2) + { + var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); + var store = builder.Span; + int offset = 0; + int size = buffer.Span.Length; + + while ((size - offset) >= 8) + { + if ((size - offset) >= 64) + { + var part = buffer.Span.Slice(offset, 64); + Vector512 vector = Vector512.Create(part); + part = buffer2.Span.Slice(offset, 64); + Vector512 vector2 = Vector512.Create(part); + vector = vector ^ vector2; + vector.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + else if ((size - offset) >= 32) + { + var part = buffer.Span.Slice(offset, 32); + Vector256 vector = 
Vector256.Create(part); + part = buffer2.Span.Slice(offset, 32); + Vector256 vector2 = Vector256.Create(part); + vector = vector ^ vector2; + vector.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + else if ((size - offset) >= 16) + { + var part = buffer.Span.Slice(offset, 16); + Vector128 vector = Vector128.Create(part); + part = buffer2.Span.Slice(offset, 16); + Vector128 vector2 = Vector128.Create(part); + vector = vector ^ vector2; + vector = Vector128.OnesComplement(vector); + vector.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + else if ((size - offset) >= 8) + { + var part = buffer.Span.Slice(offset, 8); + Vector64 vector = Vector64.Create(part); + part = buffer2.Span.Slice(offset, 8); + Vector64 vector2 = Vector64.Create(part); + vector = vector ^ vector2; + vector.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + else break; + } + + for (var i = offset; i < size; i++) + { + store[i] = (byte)(buffer.Span[i] ^ buffer2.Span[i]); + } + return builder.Build(); + } +} + public static class Comparison { /// @@ -32,22 +267,19 @@ public static class Comparison /// public static BooleanArray Invert(BooleanArray mask, MemoryAllocator? allocator = null) { - var builder = new BooleanArray.Builder(); - builder.Reserve(mask.Length); - foreach (var val in mask) - { - if (val != null) - { - builder.Append(!(bool)val); - } - else - { - builder.AppendNull(); - } - } - return builder.Build(allocator); + var inverted = BitVectorOps.OnesComplement(mask.ValueBuffer); + var invertedmask = new BooleanArray(inverted, mask.NullBitmapBuffer.Clone(), mask.Length, mask.NullCount, 0); + return invertedmask; } + /// + /// An alias for that is idiomatic. + /// + /// + /// + /// + public static BooleanArray OnesComplement(BooleanArray mask, MemoryAllocator? allocator = null) => Invert(mask, allocator); + /// /// Perform a pairwise boolean AND operation. /// @@ -55,26 +287,14 @@ public static BooleanArray Invert(BooleanArray mask, MemoryAllocator? 
allocator /// /// /// - /// + /// public static BooleanArray And(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) { - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); - var builder = new BooleanArray.Builder(); - builder.Reserve(lhs.Length); - for (int i = 0; i < lhs.Length; i++) - { - var a = lhs.GetValue(i); - var b = rhs.GetValue(i); - if (a != null && b != null) - { - builder.Append((bool)a && (bool)b); - } - else - { - builder.AppendNull(); - } - } - return builder.Build(allocator); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var combined = BitVectorOps.And(lhs.ValueBuffer, rhs.ValueBuffer); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var nullCount = BitUtility.CountBits(combinedMask.Span); + return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } /// @@ -84,26 +304,31 @@ public static BooleanArray And(BooleanArray lhs, BooleanArray rhs, MemoryAllocat /// /// /// - /// + /// public static BooleanArray Or(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? 
allocator = null) { - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); - var builder = new BooleanArray.Builder(); - builder.Reserve(lhs.Length); - for (int i = 0; i < lhs.Length; i++) - { - var a = lhs.GetValue(i); - var b = rhs.GetValue(i); - if (a != null && b != null) - { - builder.Append((bool)a || (bool)b); - } - else - { - builder.AppendNull(); - } - } - return builder.Build(allocator); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var combined = BitVectorOps.Or(lhs.ValueBuffer, rhs.ValueBuffer); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var nullCount = BitUtility.CountBits(combinedMask.Span); + return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); + } + + /// + /// Performa a pairwise boolean equality operation. + /// + /// + /// + /// + /// + /// + public static BooleanArray Equals(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) + { + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var combined = BitVectorOps.OnesComplement(BitVectorOps.Xor(lhs.ValueBuffer, rhs.ValueBuffer)); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var nullCount = BitUtility.CountBits(combinedMask.Span); + return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } /// @@ -113,26 +338,14 @@ public static BooleanArray Or(BooleanArray lhs, BooleanArray rhs, MemoryAllocato /// /// /// - /// + /// public static BooleanArray Xor(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? 
allocator = null) { - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); - var builder = new BooleanArray.Builder(); - builder.Reserve(lhs.Length); - for (int i = 0; i < lhs.Length; i++) - { - var a = lhs.GetValue(i); - var b = rhs.GetValue(i); - if (a != null && b != null) - { - builder.Append((bool)a ^ (bool)b); - } - else - { - builder.AppendNull(); - } - } - return builder.Build(allocator); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var combined = BitVectorOps.Xor(lhs.ValueBuffer, rhs.ValueBuffer); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var nullCount = BitUtility.CountBits(combinedMask.Span); + return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } /// @@ -143,9 +356,13 @@ public static BooleanArray Xor(BooleanArray lhs, BooleanArray rhs, MemoryAllocat /// /// /// - public static BooleanArray Equal(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + public static BooleanArray Equal(PrimitiveArray lhs, T? rhs, MemoryAllocator? allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); + if (rhs == null) + { + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + } + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -163,11 +380,11 @@ public static BooleanArray Equal(PrimitiveArray lhs, T rhs, MemoryAllocato /// /// /// - /// + /// public static BooleanArray Equal(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -185,9 +402,13 @@ public static BooleanArray Equal(PrimitiveArray lhs, PrimitiveArray rhs /// /// /// - public static BooleanArray Equal(StringArray lhs, string rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(StringArray lhs, string? rhs, MemoryAllocator? allocator = null) { - var cmp = new BooleanArray.Builder(); + if (rhs == null) + { + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + } + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetString(i); @@ -204,11 +425,11 @@ public static BooleanArray Equal(StringArray lhs, string rhs, MemoryAllocator? a /// /// /// - /// + /// public static BooleanArray Equal(StringArray lhs, StringArray rhs, MemoryAllocator? allocator = null) { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetString(i); @@ -226,9 +447,13 @@ public static BooleanArray Equal(StringArray lhs, StringArray rhs, MemoryAllocat /// /// /// - public static BooleanArray Equal(LargeStringArray lhs, string rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(LargeStringArray lhs, string? rhs, MemoryAllocator? 
allocator = null) { - var cmp = new BooleanArray.Builder(); + if (rhs == null) + { + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + } + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetString(i); @@ -245,11 +470,11 @@ public static BooleanArray Equal(LargeStringArray lhs, string rhs, MemoryAllocat /// /// /// - /// + /// public static BooleanArray Equal(LargeStringArray lhs, LargeStringArray rhs, MemoryAllocator? allocator = null) { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetString(i); @@ -269,7 +494,7 @@ public static BooleanArray Equal(LargeStringArray lhs, LargeStringArray rhs, Mem /// /// /// - public static BooleanArray Equal(IArrowArray lhs, string rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(IArrowArray lhs, string? rhs, MemoryAllocator? allocator = null) { switch (lhs.Data.DataType.TypeId) { @@ -282,9 +507,9 @@ public static BooleanArray Equal(IArrowArray lhs, string rhs, MemoryAllocator? a } } - public static BooleanArray GreaterThan(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + public static BooleanArray GreaterThan(PrimitiveArray lhs, T? rhs, MemoryAllocator? allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -296,8 +521,8 @@ public static BooleanArray GreaterThan(PrimitiveArray lhs, T rhs, MemoryAl public static BooleanArray GreaterThan(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -308,9 +533,9 @@ public static BooleanArray GreaterThan(PrimitiveArray lhs, PrimitiveArray< return cmp.Build(allocator); } - public static BooleanArray LessThan(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + public static BooleanArray LessThan(PrimitiveArray lhs, T? rhs, MemoryAllocator? allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -322,8 +547,8 @@ public static BooleanArray LessThan(PrimitiveArray lhs, T rhs, MemoryAlloc public static BooleanArray LessThan(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -334,9 +559,9 @@ public static BooleanArray LessThan(PrimitiveArray lhs, PrimitiveArray return cmp.Build(allocator); } - public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, T? rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -348,8 +573,8 @@ public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, T rhs, M public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -360,9 +585,9 @@ public static BooleanArray GreaterThanOrEqual(PrimitiveArray lhs, Primitiv return cmp.Build(allocator); } - public static BooleanArray LessThanOrEqual(PrimitiveArray lhs, T rhs, MemoryAllocator? allocator = null) where T : struct, INumber + public static BooleanArray LessThanOrEqual(PrimitiveArray lhs, T? rhs, MemoryAllocator? allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); @@ -374,8 +599,8 @@ public static BooleanArray LessThanOrEqual(PrimitiveArray lhs, T rhs, Memo public static BooleanArray LessThanOrEqual(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? 
allocator = null) where T : struct, INumber { - var cmp = new BooleanArray.Builder(); - if (lhs.Length != rhs.Length) throw new InvalidOperationException("Arrays must have the same length"); + if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); + var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) { var a = lhs.GetValue(i); diff --git a/test/Apache.Arrow.Operations.Tests/TestOperations.cs b/test/Apache.Arrow.Operations.Tests/TestOperations.cs index 4fa7a713..4ca830d8 100644 --- a/test/Apache.Arrow.Operations.Tests/TestOperations.cs +++ b/test/Apache.Arrow.Operations.Tests/TestOperations.cs @@ -14,7 +14,7 @@ // limitations under the License. using System; -using Apache.Arrow.Ipc; +using System.Linq; using Xunit; @@ -58,4 +58,90 @@ public void TestSelectionFilterMask() var items = (Int64Array)Select.Filter(vals, mask); Assert.Equal(52, items.GetValue(0)); } +} + + +public class ArrowBooleanOperationsTests { + [Fact] + public void TestInvert() + { + var vals = Enumerable.Repeat(true, 5000); + var builder = new BooleanArray.Builder(5000); + builder.AppendRange(vals); + var array = builder.Build(); + Assert.True(array.All(v => v ?? false)); + + var inverted = BitVectorOps.OnesComplement(array.ValueBuffer); + var invertedArray = new BooleanArray(inverted, array.NullBitmapBuffer.Clone(), array.Length, array.NullCount, 0); + Assert.Equal(array.Length, invertedArray.Length); + Assert.False(invertedArray.All(v => v ?? false)); + } + + [Fact] + public void TesAnd() + { + var vals = Enumerable.Repeat(true, 5000); + var builder = new BooleanArray.Builder(5000); + builder.AppendRange(vals); + var array = builder.Build(); + Assert.True(array.All(v => v ?? false)); + + var result = Comparison.And(array, array); + Assert.True(result.All(v => v ?? 
false)); + + vals = Enumerable.Repeat(false, 5000); + builder = new BooleanArray.Builder(5000); + builder.AppendRange(vals); + var inverted = builder.Build(); + + result = Comparison.And(array, inverted); + Assert.Equal(result.Length, inverted.Length); + Assert.False(result.All(v => v ?? false)); + } + + [Fact] + public void TestOr() + { + var vals = Enumerable.Repeat(true, 5000); + var builder = new BooleanArray.Builder(5000); + builder.AppendRange(vals); + var array = builder.Build(); + Assert.True(array.All(v => v ?? false)); + + var result = Comparison.Or(array, array); + Assert.True(result.All(v => v ?? false)); + + vals = Enumerable.Repeat(false, 5000); + builder = new BooleanArray.Builder(5000); + builder.AppendRange(vals); + var inverted = builder.Build(); + + result = Comparison.Or(array, inverted); + Assert.Equal(result.Length, inverted.Length); + Assert.True(result.All(v => v ?? false)); + } + + [Fact] + public void TestXor() + { + var vals = Enumerable.Repeat(true, 2500); + var builder = new BooleanArray.Builder(5000); + builder.AppendRange(vals); + vals = Enumerable.Repeat(false, 2500); + builder.AppendRange(vals); + var array = builder.Build(); + + Assert.Equal(2500, array.Count(s => s ?? false)); + + builder = new BooleanArray.Builder(5000); + vals = Enumerable.Repeat(true, 2500); + builder.AppendRange(vals); + vals = Enumerable.Repeat(true, 2500); + builder.AppendRange(vals); + var array2 = builder.Build(); + + var result = Comparison.Xor(array, array2); + Assert.Equal(2500, result.Count(s => s ?? false)); + Assert.Equal(0, ((BooleanArray)result.Slice(0, 2500)).Count(s => s ?? 
false)); + } } \ No newline at end of file From 8f14079d1c394363a9ebdd8d24c41a77a8962945 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sat, 28 Feb 2026 22:00:39 -0500 Subject: [PATCH 5/9] feature: rework more numerical casts --- src/Apache.Arrow.Operations/Conversion.cs | 484 ++++++++++++++++------ 1 file changed, 346 insertions(+), 138 deletions(-) diff --git a/src/Apache.Arrow.Operations/Conversion.cs b/src/Apache.Arrow.Operations/Conversion.cs index b4601bba..0857a8f8 100644 --- a/src/Apache.Arrow.Operations/Conversion.cs +++ b/src/Apache.Arrow.Operations/Conversion.cs @@ -16,126 +16,11 @@ using System.Numerics; -using Apache.Arrow; using Apache.Arrow.Memory; using Apache.Arrow.Types; namespace Apache.Arrow.Operations; -public class ArrowCompatibilityVisitor : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor -{ - public IArrowArray? Result = null; - - public static IArrowArray Convert(IArrowArray array) - { - var visitor = new ArrowCompatibilityVisitor(); - visitor.Visit(array); - if (visitor.Result == null) throw new InvalidOperationException(); - return visitor.Result; - } - - public StructArray HandleStruct(StructArray array) - { - var dtype = (StructType)array.Data.DataType; - var newFields = new List(); - var newVals = new List(); - int size = 0; - foreach (var (field, arr) in dtype.Fields.Zip(array.Fields)) - { - var visitor = new ArrowCompatibilityVisitor(); - visitor.Visit(arr); - if (visitor.Result == null) throw new InvalidOperationException(); - newFields.Add(new Field(field.Name, visitor.Result.Data.DataType, field.IsNullable)); - newVals.Add(visitor.Result); - if (size != 0 && visitor.Result.Length != 0 && visitor.Result.Length != size) throw new InvalidDataException(); - size = visitor.Result.Length; - } - var result = new StructArray(new StructType(newFields), size, newVals, array.NullBitmapBuffer); - if (result.Fields.Count > 0) { } - return result; - } - - public void Visit(StructArray array) - { - Result = 
HandleStruct(array); - } - - public void Visit(IArrowArray array) - { - switch (array.Data.DataType.TypeId) - { - case ArrowTypeId.Struct: - { - Visit((StructArray)array); - break; - } - case ArrowTypeId.LargeList: - { - Visit((LargeListArray)array); - break; - } - case ArrowTypeId.LargeString: - { - Visit((LargeStringArray)array); - break; - } - case ArrowTypeId.LargeBinary: - { - Visit((LargeBinaryArray)array); - break; - } - default: - { - Result = array; - break; - } - } - } - - public void Visit(LargeListArray array) - { - ArrowCompatibilityVisitor visitor = new(); - visitor.Visit(array.Values); - var offsetsBuffer = new ArrowBuffer.Builder(); - foreach (var v in array.ValueOffsets) - { - offsetsBuffer.Append((int)v); - } - if (visitor.Result == null) throw new InvalidOperationException(); - Result = new ListArray( - new ListType(((LargeListType)array.Data.DataType).ValueDataType), - array.Length, - offsetsBuffer.Build(), - visitor.Result, - array.NullBitmapBuffer, - array.NullCount, - array.Offset - ); - } - - public void Visit(LargeStringArray array) - { - var offsetsBuffer = new ArrowBuffer.Builder(); - foreach (var v in array.ValueOffsets) - { - offsetsBuffer.Append((int)v); - } - Result = new StringArray( - array.Length, - offsetsBuffer.Build(), - array.ValueBuffer, - array.NullBitmapBuffer, - array.NullCount, - array.Offset - ); - } - - public void Visit(LargeBinaryArray type) - { - throw new NotImplementedException(); - } -} - /// /// Specifies how null values should be handled in aggregate computations. @@ -159,7 +44,7 @@ public enum NullHandling /// Copy primitive arraays between types to explicitly known numerical types. When the type already /// matches, no copy is performed. 
/// -public static class Conversion +public static partial class Conversion { static void NullToZero(PrimitiveArray array, IArrowArrayBuilder, TBuilder> accumulator) where T : struct, INumber where TBuilder : IArrowArrayBuilder> @@ -240,6 +125,90 @@ public static Array NullToZero(PrimitiveArray array, MemoryAllocator? allo } } + public static DoubleArray CastDouble(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new DoubleArray.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(double.CreateChecked(val)); + return builder.Build(allocator); + } + + public static FloatArray CastFloat(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new FloatArray.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(float.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int32Array CastInt32(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int32Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(int.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int64Array CastInt64(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int64Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(long.CreateChecked(val)); + return builder.Build(allocator); + } + + public static UInt16Array CastUInt16(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new UInt16Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(ushort.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int16Array CastInt16(IList array, MemoryAllocator? 
allocator = null) where T : struct, INumber + { + var builder = new Int16Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(short.CreateChecked(val)); + return builder.Build(allocator); + } + + public static UInt8Array CastUInt8(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new UInt8Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(byte.CreateChecked(val)); + return builder.Build(allocator); + } + + public static Int8Array CastInt8(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new Int8Array.Builder(); + builder.Reserve(array.Count); + foreach (var val in array) + builder.Append(sbyte.CreateChecked(val)); + return builder.Build(allocator); + } + + public static BooleanArray CastBool(PrimitiveArray array, MemoryAllocator? allocator = null) where T: struct, INumber + { + var builder = new BooleanArray.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(val.Value != T.Zero); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + public static Int64Array CastInt64(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber { var builder = new Int64Array.Builder(); @@ -264,63 +233,99 @@ public static Int32Array CastInt32(PrimitiveArray array, MemoryAllocator? return builder.Build(allocator); } - public static FloatArray CastFloat(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + public static Int16Array CastInt16(PrimitiveArray array, MemoryAllocator? 
allocator = null) where T : struct, INumber { - var builder = new FloatArray.Builder(); + var builder = new Int16Array.Builder(); builder.Reserve(array.Length); foreach (var val in array) { - if (val != null) builder.Append(float.CreateChecked((T)val)); + if (val != null) builder.Append(short.CreateChecked((T)val)); else builder.AppendNull(); } return builder.Build(allocator); } - public static DoubleArray CastDouble(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + public static Int8Array CastInt8(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber { - var builder = new DoubleArray.Builder(); + var builder = new Int8Array.Builder(); builder.Reserve(array.Length); foreach (var val in array) { - if (val != null) builder.Append(double.CreateChecked((T)val)); + if (val != null) builder.Append(sbyte.CreateChecked((T)val)); else builder.AppendNull(); } return builder.Build(allocator); } - public static DoubleArray CastDouble(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + public static UInt64Array CastUInt64(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber { - var builder = new DoubleArray.Builder(); - builder.Reserve(array.Count); + var builder = new UInt64Array.Builder(); + builder.Reserve(array.Length); foreach (var val in array) - builder.Append(double.CreateChecked(val)); + { + if (val != null) builder.Append(ulong.CreateChecked((T)val)); + else builder.AppendNull(); + } return builder.Build(allocator); } - public static FloatArray CastFloat(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + public static UInt32Array CastUInt32(PrimitiveArray array, MemoryAllocator? 
allocator = null) where T : struct, INumber { - var builder = new FloatArray.Builder(); - builder.Reserve(array.Count); + var builder = new UInt32Array.Builder(); + builder.Reserve(array.Length); foreach (var val in array) - builder.Append(float.CreateChecked(val)); + { + if (val != null) builder.Append(uint.CreateChecked((T)val)); + else builder.AppendNull(); + } return builder.Build(allocator); } - public static Int32Array CastInt32(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + public static UInt16Array CastUInt16(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber { - var builder = new Int32Array.Builder(); - builder.Reserve(array.Count); + var builder = new UInt16Array.Builder(); + builder.Reserve(array.Length); foreach (var val in array) - builder.Append(int.CreateChecked(val)); + { + if (val != null) builder.Append(ushort.CreateChecked((T)val)); + else builder.AppendNull(); + } return builder.Build(allocator); } - public static Int64Array CastInt64(IList array, MemoryAllocator? allocator = null) where T : struct, INumber + public static UInt8Array CastUInt8(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber { - var builder = new Int64Array.Builder(); - builder.Reserve(array.Count); + var builder = new UInt8Array.Builder(); + builder.Reserve(array.Length); foreach (var val in array) - builder.Append(long.CreateChecked(val)); + { + if (val != null) builder.Append(byte.CreateChecked((T)val)); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + + public static FloatArray CastFloat(PrimitiveArray array, MemoryAllocator? 
allocator = null) where T : struct, INumber + { + var builder = new FloatArray.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(float.CreateChecked((T)val)); + else builder.AppendNull(); + } + return builder.Build(allocator); + } + + public static DoubleArray CastDouble(PrimitiveArray array, MemoryAllocator? allocator = null) where T : struct, INumber + { + var builder = new DoubleArray.Builder(); + builder.Reserve(array.Length); + foreach (var val in array) + { + if (val != null) builder.Append(double.CreateChecked((T)val)); + else builder.AppendNull(); + } return builder.Build(allocator); } @@ -382,6 +387,64 @@ public static Int32Array CastInt32(IArrowArray array, MemoryAllocator? allocator } } + public static Int16Array CastInt16(IArrowArray array, MemoryAllocator? allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastInt16((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastInt16((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastInt16((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastInt16((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastInt16((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastInt16((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastInt16((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastInt16((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastInt16((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastInt16((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static Int8Array CastInt8(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastInt8((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastInt8((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastInt8((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastInt8((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastInt8((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastInt8((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastInt8((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastInt8((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastInt8((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastInt8((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + public static FloatArray CastFloat(IArrowArray array, MemoryAllocator? allocator = null) { switch (array.Data.DataType.TypeId) @@ -439,5 +502,150 @@ public static DoubleArray CastDouble(IArrowArray array, MemoryAllocator? allocat throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); } } + + public static UInt64Array CastUInt64(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastUInt64((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastUInt64((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastUInt64((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastUInt64((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastUInt64((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastUInt64((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastUInt64((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastUInt64((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastUInt64((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastUInt64((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static UInt32Array CastUInt32(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastUInt32((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastUInt32((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastUInt32((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastUInt32((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastUInt32((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastUInt32((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastUInt32((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastUInt32((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastUInt32((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastUInt32((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static UInt16Array CastUInt16(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastUInt16((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastUInt16((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastUInt16((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastUInt16((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastUInt16((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastUInt16((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastUInt16((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastUInt16((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastUInt16((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastUInt16((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static UInt8Array CastUInt8(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastUInt8((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastUInt8((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastUInt8((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastUInt8((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastUInt8((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastUInt8((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastUInt8((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastUInt8((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastUInt8((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastUInt8((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } + + public static BooleanArray CastBool(IArrowArray array, MemoryAllocator? 
allocator = null) + { + switch (array.Data.DataType.TypeId) + { + case ArrowTypeId.Double: + return CastBool((DoubleArray)array, allocator); + case ArrowTypeId.Float: + return CastBool((FloatArray)array, allocator); + case ArrowTypeId.Int32: + return CastBool((Int32Array)array, allocator); + case ArrowTypeId.Int64: + return CastBool((Int64Array)array, allocator); + case ArrowTypeId.UInt32: + return CastBool((UInt32Array)array, allocator); + case ArrowTypeId.UInt64: + return CastBool((UInt64Array)array, allocator); + case ArrowTypeId.Int16: + return CastBool((Int16Array)array, allocator); + case ArrowTypeId.Int8: + return CastBool((Int8Array)array, allocator); + case ArrowTypeId.UInt16: + return CastBool((UInt16Array)array, allocator); + case ArrowTypeId.UInt8: + return CastBool((UInt8Array)array, allocator); + default: + throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); + } + } } From 33e651a1fee29674476c5624e93986b2c330b384 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sat, 28 Feb 2026 22:01:55 -0500 Subject: [PATCH 6/9] chore: use ArgumentException --- src/Apache.Arrow.Operations/Select.cs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Apache.Arrow.Operations/Select.cs b/src/Apache.Arrow.Operations/Select.cs index 6741dc78..306a7eaf 100644 --- a/src/Apache.Arrow.Operations/Select.cs +++ b/src/Apache.Arrow.Operations/Select.cs @@ -34,10 +34,10 @@ public static class Select /// The mask defining which values to keep or exclude /// The memory allocator to build the new array from /// - /// If the mask and the array are not of equal size + /// If the mask and the array are not of equal size public static Array Filter(Array array, BooleanArray mask, MemoryAllocator? 
allocator = null) { - if (array.Length != mask.Length) throw new InvalidOperationException("Array and mask must have the same length"); + if (array.Length != mask.Length) throw new ArgumentException("Array and mask must have the same length"); List<(int, int)> spans = new(); int? start = null; for (int i = 0; i < mask.Length; i++) @@ -98,7 +98,6 @@ public static Array Take(Array array, IList<(int, int)> spans, MemoryAllocator? /// The indices to select /// The memory allocator to build the new array from /// - /// public static Array Take(Array array, IList indices, MemoryAllocator? allocator = null) { if (indices.Count == 0) @@ -180,7 +179,7 @@ public static Dictionary Filter(Dictionary batch, Boolean /// public static RecordBatch Filter(RecordBatch batch, BooleanArray mask, MemoryAllocator? allocator = null) { - if (batch.Length != mask.Length) throw new InvalidOperationException("Array and mask must have the same length"); + if (batch.Length != mask.Length) throw new ArgumentException("Array and mask must have the same length"); List<(int, int)> spans = new(); int? 
start = null; for (int i = 0; i < mask.Length; i++) From 3b1ed327ed3db409a961e061e960fac37168fa46 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sat, 28 Feb 2026 22:39:36 -0500 Subject: [PATCH 7/9] feature: add sql-like null handling option in comparisons --- src/Apache.Arrow.Operations/Comparison.cs | 138 ++++++++++++++++++---- src/Apache.Arrow.Operations/Conversion.cs | 18 --- src/Apache.Arrow.Operations/Select.cs | 56 ++++++--- src/Apache.Arrow.Operations/Text.cs | 32 ++--- 4 files changed, 170 insertions(+), 74 deletions(-) diff --git a/src/Apache.Arrow.Operations/Comparison.cs b/src/Apache.Arrow.Operations/Comparison.cs index 81d8186c..2e2a7cd7 100644 --- a/src/Apache.Arrow.Operations/Comparison.cs +++ b/src/Apache.Arrow.Operations/Comparison.cs @@ -257,6 +257,22 @@ public static ArrowBuffer Xor(ArrowBuffer buffer, ArrowBuffer buffer2) } } +/// +/// Specifies how null values should be handled in comparison operations. +/// +public enum ComparisonNullHandling +{ + /// + /// If both values are null, they are equal. This is the default behavior in C# + /// + Equality, + + /// + /// Propagate null: if any value in the comparison is null, return null, as in SQL. + /// + Propagate, +} + public static class Comparison { /// @@ -355,6 +371,7 @@ public static BooleanArray Xor(BooleanArray lhs, BooleanArray rhs, MemoryAllocat /// /// /// + /// /// public static BooleanArray Equal(PrimitiveArray lhs, T? rhs, MemoryAllocator? allocator = null) where T : struct, INumber { @@ -379,18 +396,41 @@ public static BooleanArray Equal(PrimitiveArray lhs, T? rhs, MemoryAllocat /// /// /// + /// /// /// - public static BooleanArray Equal(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? allocator = null) where T : struct, INumber + public static BooleanArray Equal(PrimitiveArray lhs, PrimitiveArray rhs, MemoryAllocator? 
allocator = null, ComparisonNullHandling nullHandling = ComparisonNullHandling.Equality) where T : struct, INumber { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); var cmp = new BooleanArray.Builder(lhs.Length); - for (int i = 0; i < lhs.Length; i++) + switch (nullHandling) { - var a = lhs.GetValue(i); - var b = rhs.GetValue(i); - var flag = a == b; - cmp.Append(flag); + case ComparisonNullHandling.Equality: + { + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + var flag = a == b; + cmp.Append(flag); + } + break; + } + case ComparisonNullHandling.Propagate: + { + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetValue(i); + var b = rhs.GetValue(i); + if (a == null || b == null) + cmp.AppendNull(); + else + cmp.Append(a == b); + } + break; + } + default: + throw new NotImplementedException($"{nullHandling}"); } return cmp.Build(allocator); } @@ -401,12 +441,16 @@ public static BooleanArray Equal(PrimitiveArray lhs, PrimitiveArray rhs /// /// /// + /// /// - public static BooleanArray Equal(StringArray lhs, string? rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(StringArray lhs, string? rhs, MemoryAllocator? allocator = null, ComparisonNullHandling nullHandling = ComparisonNullHandling.Equality) { if (rhs == null) { - return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + if (nullHandling == ComparisonNullHandling.Equality) + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + else if (nullHandling == ComparisonNullHandling.Propagate) + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), lhs.NullBitmapBuffer.Clone(), lhs.Length, lhs.NullCount, 0); } var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) @@ -424,18 +468,41 @@ public static BooleanArray Equal(StringArray lhs, string? rhs, MemoryAllocator? 
/// /// /// + /// /// /// - public static BooleanArray Equal(StringArray lhs, StringArray rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(StringArray lhs, StringArray rhs, MemoryAllocator? allocator = null, ComparisonNullHandling nullHandling = ComparisonNullHandling.Equality) { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); var cmp = new BooleanArray.Builder(lhs.Length); - for (int i = 0; i < lhs.Length; i++) + switch (nullHandling) { - var a = lhs.GetString(i); - var b = rhs.GetString(i); - var flag = a == b; - cmp.Append(flag); + case ComparisonNullHandling.Equality: + { + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var b = rhs.GetString(i); + var flag = a == b; + cmp.Append(flag); + } + break; + } + case ComparisonNullHandling.Propagate: + { + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var b = rhs.GetString(i); + if (a == null || b == null) + cmp.AppendNull(); + else + cmp.Append(a == b); + } + break; + } + default: + throw new NotImplementedException($"{nullHandling}"); } return cmp.Build(allocator); } @@ -446,12 +513,16 @@ public static BooleanArray Equal(StringArray lhs, StringArray rhs, MemoryAllocat /// /// /// + /// /// - public static BooleanArray Equal(LargeStringArray lhs, string? rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(LargeStringArray lhs, string? rhs, MemoryAllocator? 
allocator = null, ComparisonNullHandling nullHandling = ComparisonNullHandling.Equality) { if (rhs == null) { - return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + if (nullHandling == ComparisonNullHandling.Equality) + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), ArrowBuffer.Empty, lhs.Length, 0, 0); + else if (nullHandling == ComparisonNullHandling.Propagate) + return new BooleanArray(lhs.NullBitmapBuffer.Clone(), lhs.NullBitmapBuffer.Clone(), lhs.Length, lhs.NullCount, 0); } var cmp = new BooleanArray.Builder(lhs.Length); for (int i = 0; i < lhs.Length; i++) @@ -469,18 +540,41 @@ public static BooleanArray Equal(LargeStringArray lhs, string? rhs, MemoryAlloca /// /// /// + /// /// /// - public static BooleanArray Equal(LargeStringArray lhs, LargeStringArray rhs, MemoryAllocator? allocator = null) + public static BooleanArray Equal(LargeStringArray lhs, LargeStringArray rhs, MemoryAllocator? allocator = null, ComparisonNullHandling nullHandling = ComparisonNullHandling.Equality) { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); var cmp = new BooleanArray.Builder(lhs.Length); - for (int i = 0; i < lhs.Length; i++) + switch (nullHandling) { - var a = lhs.GetString(i); - var b = rhs.GetString(i); - var flag = a == b; - cmp.Append(flag); + case ComparisonNullHandling.Equality: + { + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var b = rhs.GetString(i); + var flag = a == b; + cmp.Append(flag); + } + break; + } + case ComparisonNullHandling.Propagate: + { + for (int i = 0; i < lhs.Length; i++) + { + var a = lhs.GetString(i); + var b = rhs.GetString(i); + if (a == null || b == null) + cmp.AppendNull(); + else + cmp.Append(a == b); + } + break; + } + default: + throw new NotImplementedException($"{nullHandling}"); } return cmp.Build(allocator); } diff --git a/src/Apache.Arrow.Operations/Conversion.cs b/src/Apache.Arrow.Operations/Conversion.cs index 
0857a8f8..bf6ee987 100644 --- a/src/Apache.Arrow.Operations/Conversion.cs +++ b/src/Apache.Arrow.Operations/Conversion.cs @@ -22,24 +22,6 @@ namespace Apache.Arrow.Operations; -/// -/// Specifies how null values should be handled in aggregate computations. -/// -public enum NullHandling -{ - /// - /// Skip null values when computing the result. - /// Returns null only if the array is empty or all values are null. - /// - Skip, - - /// - /// Propagate null: if any value in the array is null, return null. - /// - Propagate -} - - /// /// Copy primitive arraays between types to explicitly known numerical types. When the type already /// matches, no copy is performed. diff --git a/src/Apache.Arrow.Operations/Select.cs b/src/Apache.Arrow.Operations/Select.cs index 306a7eaf..106b3885 100644 --- a/src/Apache.Arrow.Operations/Select.cs +++ b/src/Apache.Arrow.Operations/Select.cs @@ -22,6 +22,7 @@ namespace Apache.Arrow.Operations; + public static class Select { /// @@ -285,6 +286,25 @@ public static RecordBatch Take(RecordBatch batch, IList indices, MemoryAllo } + +/// +/// Specifies how null values should be handled in aggregate computations. +/// +public enum AggregateNullHandling +{ + /// + /// Skip null values when computing the result. + /// Returns null only if the array is empty or all values are null. + /// + Skip, + + /// + /// Propagate null: if any value in the array is null, return null. + /// + Propagate +} + + public static class Aggregate { @@ -296,7 +316,7 @@ public static class Aggregate /// How to handle null values. /// The minimum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static T? Min(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + public static T? 
Min(PrimitiveArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) where T : struct, INumber { if (array.Length == 0) @@ -308,7 +328,7 @@ public static class Aggregate var value = array.GetValue(i); if (value == null) { - if (nullHandling == NullHandling.Propagate) + if (nullHandling == AggregateNullHandling.Propagate) return null; continue; } @@ -326,7 +346,7 @@ public static class Aggregate /// How to handle null values. /// The minimum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static double? Min(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + public static double? Min(IArrowArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) { switch (array.Data.DataType.TypeId) { @@ -363,7 +383,7 @@ public static class Aggregate /// How to handle null values. /// The maximum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static T? Max(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + public static T? Max(PrimitiveArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) where T : struct, INumber { if (array.Length == 0) @@ -375,7 +395,7 @@ public static class Aggregate var value = array.GetValue(i); if (value == null) { - if (nullHandling == NullHandling.Propagate) + if (nullHandling == AggregateNullHandling.Propagate) return null; continue; } @@ -393,7 +413,7 @@ public static class Aggregate /// How to handle null values. /// The maximum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static double? Max(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + public static double? 
Max(IArrowArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) { switch (array.Data.DataType.TypeId) { @@ -430,7 +450,7 @@ public static class Aggregate /// How to handle null values. /// The index of the minimum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static int? ArgMin(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + public static int? ArgMin(PrimitiveArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) where T : struct, INumber { if (array.Length == 0) @@ -443,7 +463,7 @@ public static class Aggregate var value = array.GetValue(i); if (value == null) { - if (nullHandling == NullHandling.Propagate) + if (nullHandling == AggregateNullHandling.Propagate) return null; continue; } @@ -464,7 +484,7 @@ public static class Aggregate /// How to handle null values. /// The index of the minimum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static int? ArgMin(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + public static int? ArgMin(IArrowArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) { switch (array.Data.DataType.TypeId) { @@ -501,7 +521,7 @@ public static class Aggregate /// How to handle null values. /// The index of the maximum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static int? ArgMax(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + public static int? 
ArgMax(PrimitiveArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) where T : struct, INumber { if (array.Length == 0) @@ -514,7 +534,7 @@ public static class Aggregate var value = array.GetValue(i); if (value == null) { - if (nullHandling == NullHandling.Propagate) + if (nullHandling == AggregateNullHandling.Propagate) return null; continue; } @@ -535,7 +555,7 @@ public static class Aggregate /// How to handle null values. /// The index of the maximum value, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static int? ArgMax(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + public static int? ArgMax(IArrowArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) { switch (array.Data.DataType.TypeId) { @@ -572,7 +592,7 @@ public static class Aggregate /// How to handle null values. /// The sum of values, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static T? Sum(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + public static T? Sum(PrimitiveArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) where T : struct, INumber { if (array.Length == 0) @@ -585,7 +605,7 @@ public static class Aggregate var value = array.GetValue(i); if (value == null) { - if (nullHandling == NullHandling.Propagate) + if (nullHandling == AggregateNullHandling.Propagate) return null; continue; } @@ -603,7 +623,7 @@ public static class Aggregate /// How to handle null values. /// The sum of values, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static double? Sum(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + public static double? 
Sum(IArrowArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) { switch (array.Data.DataType.TypeId) { @@ -640,7 +660,7 @@ public static class Aggregate /// How to handle null values. /// The mean as a double, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static double? Mean(PrimitiveArray array, NullHandling nullHandling = NullHandling.Skip) + public static double? Mean(PrimitiveArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) where T : struct, INumber { if (array.Length == 0) @@ -653,7 +673,7 @@ public static class Aggregate var value = array.GetValue(i); if (value == null) { - if (nullHandling == NullHandling.Propagate) + if (nullHandling == AggregateNullHandling.Propagate) return null; continue; } @@ -671,7 +691,7 @@ public static class Aggregate /// How to handle null values. /// The mean as a double, or null if the array is empty, all values are null, /// or nullHandling is Propagate and any null exists. - public static double? Mean(IArrowArray array, NullHandling nullHandling = NullHandling.Skip) + public static double? 
Mean(IArrowArray array, AggregateNullHandling nullHandling = AggregateNullHandling.Skip) { switch (array.Data.DataType.TypeId) { diff --git a/src/Apache.Arrow.Operations/Text.cs b/src/Apache.Arrow.Operations/Text.cs index 45d263b8..8242c580 100644 --- a/src/Apache.Arrow.Operations/Text.cs +++ b/src/Apache.Arrow.Operations/Text.cs @@ -37,7 +37,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int List indenting = Enumerable.Repeat(indenter, indent).ToList(); string indentString = string.Concat(indenting); - stream.WriteLine($"{indentString}["); + stream.WriteLine($"{indentString}[ {array.Length} elements"); var pad = indentString + indenter; switch (array.Data.DataType.TypeId) { @@ -46,7 +46,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (FloatArray)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -55,7 +55,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (DoubleArray)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -64,7 +64,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (Int32Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -73,7 +73,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (Int64Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? 
"null" : v)}"); } break; } @@ -82,7 +82,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (Int16Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -91,7 +91,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (Int8Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -100,7 +100,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (UInt8Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -109,7 +109,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (UInt16Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -118,7 +118,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (UInt32Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -127,7 +127,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (UInt64Array)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -137,7 +137,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? 
"null" : v)}"); } break; } @@ -146,7 +146,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int var valArray = (HalfFloatArray)array; foreach (var v in valArray) { - stream.WriteLine($"{pad}{v}"); + stream.WriteLine($"{pad}{(v == null ? "null" : v)}"); } break; } @@ -157,7 +157,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int { if (valArray.IsNull(i)) { - stream.WriteLine($"{pad}{null}"); + stream.WriteLine($"{pad}null"); } else { @@ -174,7 +174,7 @@ public static void PrettyPrintFormat(IArrowArray array, StreamWriter stream, int { if (valArray.IsNull(i)) { - stream.WriteLine($"{pad}{null}"); + stream.WriteLine($"{pad}null"); } else { @@ -222,7 +222,7 @@ public static string PrettyPrintFormat(IArrowArray array, int indent = 0, string } /// - /// Pretty print `array` to `STDOUT` via `Console.WriteLine`. Prefer `PrettyPrintFormat` to control where the + /// Pretty print `array` via `Console.WriteLine`. Prefer `PrettyPrintFormat` to control where the /// writing happens. 
/// /// From ebd1afb3418c4dbac27da87ad8419894759f3f24 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 1 Mar 2026 13:19:35 -0500 Subject: [PATCH 8/9] chore: reorder import --- src/Apache.Arrow.Operations/Comparison.cs | 5 +++-- src/Apache.Arrow.Operations/Conversion.cs | 2 +- src/Apache.Arrow.Operations/Select.cs | 2 +- src/Apache.Arrow.Operations/Text.cs | 2 +- src/Apache.Arrow/Arrays/ListArray.cs | 4 ++-- test/Apache.Arrow.Operations.Tests/TestOperations.cs | 5 +++-- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/Apache.Arrow.Operations/Comparison.cs b/src/Apache.Arrow.Operations/Comparison.cs index 2e2a7cd7..7d4a31f4 100644 --- a/src/Apache.Arrow.Operations/Comparison.cs +++ b/src/Apache.Arrow.Operations/Comparison.cs @@ -16,10 +16,11 @@ using System; using System.Numerics; +using System.Runtime.Intrinsics; + using Apache.Arrow.Memory; using Apache.Arrow.Types; -using System.Runtime.Intrinsics; namespace Apache.Arrow.Operations; @@ -69,7 +70,7 @@ public static ArrowBuffer OnesComplement(ArrowBuffer buffer) else break; } - for(var i = offset; i < size; i++) + for (var i = offset; i < size; i++) { store[i] = (byte)~buffer.Span[i]; } diff --git a/src/Apache.Arrow.Operations/Conversion.cs b/src/Apache.Arrow.Operations/Conversion.cs index bf6ee987..1a15c91e 100644 --- a/src/Apache.Arrow.Operations/Conversion.cs +++ b/src/Apache.Arrow.Operations/Conversion.cs @@ -179,7 +179,7 @@ public static Int8Array CastInt8(IList array, MemoryAllocator? allocator = return builder.Build(allocator); } - public static BooleanArray CastBool(PrimitiveArray array, MemoryAllocator? allocator = null) where T: struct, INumber + public static BooleanArray CastBool(PrimitiveArray array, MemoryAllocator? 
allocator = null) where T : struct, INumber { var builder = new BooleanArray.Builder(); builder.Reserve(array.Length); diff --git a/src/Apache.Arrow.Operations/Select.cs b/src/Apache.Arrow.Operations/Select.cs index 106b3885..e707da41 100644 --- a/src/Apache.Arrow.Operations/Select.cs +++ b/src/Apache.Arrow.Operations/Select.cs @@ -719,4 +719,4 @@ public static class Aggregate throw new InvalidDataException("Unsupported data type " + array.Data.DataType.Name); } } -} \ No newline at end of file +} diff --git a/src/Apache.Arrow.Operations/Text.cs b/src/Apache.Arrow.Operations/Text.cs index 8242c580..5c7075ce 100644 --- a/src/Apache.Arrow.Operations/Text.cs +++ b/src/Apache.Arrow.Operations/Text.cs @@ -233,4 +233,4 @@ public static void PrettyPrint(IArrowArray array, int indent = 0, string indente var text = PrettyPrintFormat(array, indent, indenter); Console.WriteLine(text); } -} \ No newline at end of file +} diff --git a/src/Apache.Arrow/Arrays/ListArray.cs b/src/Apache.Arrow/Arrays/ListArray.cs index f245b49a..a9d9148c 100644 --- a/src/Apache.Arrow/Arrays/ListArray.cs +++ b/src/Apache.Arrow/Arrays/ListArray.cs @@ -14,10 +14,10 @@ // limitations under the License. using System; -using Apache.Arrow.Memory; -using Apache.Arrow.Types; using System.Collections; using System.Collections.Generic; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; namespace Apache.Arrow { diff --git a/test/Apache.Arrow.Operations.Tests/TestOperations.cs b/test/Apache.Arrow.Operations.Tests/TestOperations.cs index 4ca830d8..cb138cd4 100644 --- a/test/Apache.Arrow.Operations.Tests/TestOperations.cs +++ b/test/Apache.Arrow.Operations.Tests/TestOperations.cs @@ -61,7 +61,8 @@ public void TestSelectionFilterMask() } -public class ArrowBooleanOperationsTests { +public class ArrowBooleanOperationsTests +{ [Fact] public void TestInvert() { @@ -144,4 +145,4 @@ public void TestXor() Assert.Equal(2500, result.Count(s => s ?? 
false)); Assert.Equal(0, ((BooleanArray)result.Slice(0, 2500)).Count(s => s ?? false)); } -} \ No newline at end of file +} From 950dc55cc7fadb014fbaea5c294478f4b024c72c Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 20 Apr 2026 22:10:34 -0400 Subject: [PATCH 9/9] change: start handling review feedback --- .../Apache.Arrow.Operations.csproj | 6 +- src/Apache.Arrow.Operations/Bitops.cs | 305 ++++++++++++++++++ src/Apache.Arrow.Operations/Comparison.cs | 260 +-------------- .../Apache.Arrow.Operations.Tests.csproj | 20 +- 4 files changed, 320 insertions(+), 271 deletions(-) create mode 100644 src/Apache.Arrow.Operations/Bitops.cs diff --git a/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj b/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj index efe7b036..51796055 100644 --- a/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj +++ b/src/Apache.Arrow.Operations/Apache.Arrow.Operations.csproj @@ -1,8 +1,4 @@ - - - - net8.0 enable @@ -13,4 +9,4 @@ - \ No newline at end of file + diff --git a/src/Apache.Arrow.Operations/Bitops.cs b/src/Apache.Arrow.Operations/Bitops.cs new file mode 100644 index 00000000..12cf82ee --- /dev/null +++ b/src/Apache.Arrow.Operations/Bitops.cs @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +using System.Runtime.Intrinsics; + +using Apache.Arrow.Memory; + +namespace Apache.Arrow.Operations; + +internal static class BitVectorOps +{ + internal static ArrowBuffer AllOnes(int numBytes, MemoryAllocator? allocator = default) + { + var zeros = AllZeros(numBytes, allocator); + return OnesComplement(zeros, allocator); + } + + internal static ArrowBuffer AllZeros(int numBytes, MemoryAllocator? allocator = default) + { + // Exploit that this uses new byte[...] to allocate the memory which necessarily + // zeros out everything. + var builder = new ArrowBuffer.BitmapBuilder(numBytes * 8); + builder.Set(numBytes * 8 - 1, false); + return builder.Build(allocator); + } + + internal static ArrowBuffer OnesComplement(ArrowBuffer buffer, MemoryAllocator? allocator = default) + { + var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); + var store = builder.Span; + int offset = 0; + int size = buffer.Span.Length; + + if (Vector512.IsHardwareAccelerated) + { + while ((size - offset) >= 64) + { + var part = buffer.Span.Slice(offset, 64); + Vector512 vector = Vector512.Create(part); + vector = Vector512.OnesComplement(vector); + vector.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + } + if (Vector256.IsHardwareAccelerated) + { + while ((size - offset) >= 32) + { + var part = buffer.Span.Slice(offset, 32); + Vector256 vector = Vector256.Create(part); + vector = Vector256.OnesComplement(vector); + vector.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + } + while ((size - offset) >= 16) + { + var part = buffer.Span.Slice(offset, 16); + Vector128 vector = Vector128.Create(part); + vector = Vector128.OnesComplement(vector); + vector.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + while ((size - offset) >= 8) + { + var part = buffer.Span.Slice(offset, 8); + Vector64 vector = Vector64.Create(part); + vector = Vector64.OnesComplement(vector); + 
vector.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + for (var i = offset; i < size; i++) + { + store[i] = (byte)~buffer.Span[i]; + } + return builder.Build(allocator); + } + + internal static ArrowBuffer And(ArrowBuffer lhs, ArrowBuffer rhs, MemoryAllocator? allocator = default) + { + if (lhs.IsEmpty) + { + if (rhs.IsEmpty) + { + return ArrowBuffer.Empty; + } + else + { + return rhs; + } + } + else if (rhs.IsEmpty) return lhs; + + var builder = new ArrowBuffer.BitmapBuilder(lhs.Length * 8); + var store = builder.Span; + int offset = 0; + int size = lhs.Span.Length; + + if (Vector512.IsHardwareAccelerated) + { + while ((size - offset) >= 64) + { + var part = lhs.Span.Slice(offset, 64); + Vector512 vlhs = Vector512.Create(part); + part = rhs.Span.Slice(offset, 64); + Vector512 vrhs = Vector512.Create(part); + vlhs = vlhs & vrhs; + vlhs.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + } + if (Vector256.IsHardwareAccelerated) + { + while ((size - offset) >= 32) + { + var part = lhs.Span.Slice(offset, 32); + Vector256 vlhs = Vector256.Create(part); + part = rhs.Span.Slice(offset, 32); + Vector256 vrhs = Vector256.Create(part); + vlhs = vlhs & vrhs; + vlhs.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + } + while ((size - offset) >= 16) + { + var part = lhs.Span.Slice(offset, 16); + Vector128 vlhs = Vector128.Create(part); + part = rhs.Span.Slice(offset, 16); + Vector128 vrhs = Vector128.Create(part); + vlhs = vlhs & vrhs; + vlhs.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + while ((size - offset) >= 8) + { + var part = lhs.Span.Slice(offset, 8); + Vector64 vlhs = Vector64.Create(part); + part = rhs.Span.Slice(offset, 8); + Vector64 vrhs = Vector64.Create(part); + vlhs = vlhs & vrhs; + vlhs.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + for (var i = offset; i < size; i++) + { + store[i] = (byte)(lhs.Span[i] & rhs.Span[i]); + } + return builder.Build(allocator); + } + + internal static ArrowBuffer Or(ArrowBuffer lhs, ArrowBuffer rhs, 
MemoryAllocator? allocator = default) + { + if (lhs.IsEmpty) + { + return lhs; + } + else if (rhs.IsEmpty) return rhs; + + var builder = new ArrowBuffer.BitmapBuilder(lhs.Length * 8); + var store = builder.Span; + int offset = 0; + int size = lhs.Span.Length; + + if (Vector512.IsHardwareAccelerated) + { + while ((size - offset) >= 64) + { + var part = lhs.Span.Slice(offset, 64); + Vector512 vlhs = Vector512.Create(part); + part = rhs.Span.Slice(offset, 64); + Vector512 vrhs = Vector512.Create(part); + vlhs = vlhs | vrhs; + vlhs.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + } + if (Vector256.IsHardwareAccelerated) + { + while ((size - offset) >= 32) + { + var part = lhs.Span.Slice(offset, 32); + Vector256 vlhs = Vector256.Create(part); + part = rhs.Span.Slice(offset, 32); + Vector256 vrhs = Vector256.Create(part); + vlhs = vlhs | vrhs; + vlhs.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + } + while ((size - offset) >= 16) + { + var part = lhs.Span.Slice(offset, 16); + Vector128 vlhs = Vector128.Create(part); + part = rhs.Span.Slice(offset, 16); + Vector128 vrhs = Vector128.Create(part); + vlhs = vlhs | vrhs; + vlhs.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + while ((size - offset) >= 8) + { + var part = lhs.Span.Slice(offset, 8); + Vector64 vlhs = Vector64.Create(part); + part = rhs.Span.Slice(offset, 8); + Vector64 vrhs = Vector64.Create(part); + vlhs = vlhs | vrhs; + vlhs.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + for (var i = offset; i < size; i++) + { + store[i] = (byte)(lhs.Span[i] | rhs.Span[i]); + } + return builder.Build(allocator); + } + + internal static ArrowBuffer Xor(ArrowBuffer lhs, ArrowBuffer rhs, MemoryAllocator? 
allocator = default) + { + if (lhs.IsEmpty) + { + if (rhs.IsEmpty) + { + return ArrowBuffer.Empty; + } + else + { + return OnesComplement(rhs, allocator); + } + } + else if (rhs.IsEmpty) + { + return OnesComplement(lhs, allocator); + } + var builder = new ArrowBuffer.BitmapBuilder(lhs.Length * 8); + var store = builder.Span; + int offset = 0; + int size = lhs.Span.Length; + + if (Vector512.IsHardwareAccelerated) + { + while ((size - offset) >= 64) + { + var part = lhs.Span.Slice(offset, 64); + Vector512 vlhs = Vector512.Create(part); + part = rhs.Span.Slice(offset, 64); + Vector512 vrhs = Vector512.Create(part); + vlhs = vlhs ^ vrhs; + vlhs.CopyTo(store.Slice(offset, 64)); + offset += 64; + } + } + if (Vector256.IsHardwareAccelerated) + { + while ((size - offset) >= 32) + { + var part = lhs.Span.Slice(offset, 32); + Vector256 vlhs = Vector256.Create(part); + part = rhs.Span.Slice(offset, 32); + Vector256 vrhs = Vector256.Create(part); + vlhs = vlhs ^ vrhs; + vlhs.CopyTo(store.Slice(offset, 32)); + offset += 32; + } + } + while ((size - offset) >= 16) + { + var part = lhs.Span.Slice(offset, 16); + Vector128 vlhs = Vector128.Create(part); + part = rhs.Span.Slice(offset, 16); + Vector128 vrhs = Vector128.Create(part); + vlhs = vlhs ^ vrhs; + vlhs.CopyTo(store.Slice(offset, 16)); + offset += 16; + } + while ((size - offset) >= 8) + { + var part = lhs.Span.Slice(offset, 8); + Vector64 vlhs = Vector64.Create(part); + part = rhs.Span.Slice(offset, 8); + Vector64 vrhs = Vector64.Create(part); + vlhs = vlhs ^ vrhs; + vlhs.CopyTo(store.Slice(offset, 8)); + offset += 8; + } + + for (var i = offset; i < size; i++) + { + store[i] = (byte)(lhs.Span[i] ^ rhs.Span[i]); + } + return builder.Build(allocator); + } +} diff --git a/src/Apache.Arrow.Operations/Comparison.cs b/src/Apache.Arrow.Operations/Comparison.cs index 7d4a31f4..0de4d80f 100644 --- a/src/Apache.Arrow.Operations/Comparison.cs +++ b/src/Apache.Arrow.Operations/Comparison.cs @@ -13,250 +13,14 @@ // See the License for 
the specific language governing permissions and // limitations under the License. - using System; using System.Numerics; -using System.Runtime.Intrinsics; using Apache.Arrow.Memory; using Apache.Arrow.Types; - namespace Apache.Arrow.Operations; -public static class BitVectorOps -{ - public static ArrowBuffer OnesComplement(ArrowBuffer buffer) - { - var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); - var store = builder.Span; - int offset = 0; - int size = buffer.Span.Length; - - while ((size - offset) >= 8) - { - if ((size - offset) >= 64) - { - var part = buffer.Span.Slice(offset, 64); - Vector512 vector = Vector512.Create(part); - vector = Vector512.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 64)); - offset += 64; - } - else if ((size - offset) >= 32) - { - var part = buffer.Span.Slice(offset, 32); - Vector256 vector = Vector256.Create(part); - vector = Vector256.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 32)); - offset += 32; - } - else if ((size - offset) >= 16) - { - var part = buffer.Span.Slice(offset, 16); - Vector128 vector = Vector128.Create(part); - vector = Vector128.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 16)); - offset += 16; - } - else if ((size - offset) >= 8) - { - var part = buffer.Span.Slice(offset, 8); - Vector64 vector = Vector64.Create(part); - vector = Vector64.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 8)); - offset += 8; - } - else break; - } - - for (var i = offset; i < size; i++) - { - store[i] = (byte)~buffer.Span[i]; - } - return builder.Build(); - } - - public static ArrowBuffer And(ArrowBuffer buffer, ArrowBuffer buffer2) - { - var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); - var store = builder.Span; - int offset = 0; - int size = buffer.Span.Length; - - while ((size - offset) >= 8) - { - if ((size - offset) >= 64) - { - var part = buffer.Span.Slice(offset, 64); - Vector512 vector = Vector512.Create(part); - part = 
buffer2.Span.Slice(offset, 64); - Vector512 vector2 = Vector512.Create(part); - vector = vector & vector2; - vector.CopyTo(store.Slice(offset, 64)); - offset += 64; - } - else if ((size - offset) >= 32) - { - var part = buffer.Span.Slice(offset, 32); - Vector256 vector = Vector256.Create(part); - part = buffer2.Span.Slice(offset, 32); - Vector256 vector2 = Vector256.Create(part); - vector = vector & vector2; - vector.CopyTo(store.Slice(offset, 32)); - offset += 32; - } - else if ((size - offset) >= 16) - { - var part = buffer.Span.Slice(offset, 16); - Vector128 vector = Vector128.Create(part); - part = buffer2.Span.Slice(offset, 16); - Vector128 vector2 = Vector128.Create(part); - vector = vector & vector2; - vector = Vector128.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 16)); - offset += 16; - } - else if ((size - offset) >= 8) - { - var part = buffer.Span.Slice(offset, 8); - Vector64 vector = Vector64.Create(part); - part = buffer2.Span.Slice(offset, 8); - Vector64 vector2 = Vector64.Create(part); - vector = vector & vector2; - vector.CopyTo(store.Slice(offset, 8)); - offset += 8; - } - else break; - } - - for (var i = offset; i < size; i++) - { - store[i] = (byte)(buffer.Span[i] & buffer2.Span[i]); - } - return builder.Build(); - } - - public static ArrowBuffer Or(ArrowBuffer buffer, ArrowBuffer buffer2) - { - var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); - var store = builder.Span; - int offset = 0; - int size = buffer.Span.Length; - - while ((size - offset) >= 8) - { - if ((size - offset) >= 64) - { - var part = buffer.Span.Slice(offset, 64); - Vector512 vector = Vector512.Create(part); - part = buffer2.Span.Slice(offset, 64); - Vector512 vector2 = Vector512.Create(part); - vector = vector | vector2; - vector.CopyTo(store.Slice(offset, 64)); - offset += 64; - } - else if ((size - offset) >= 32) - { - var part = buffer.Span.Slice(offset, 32); - Vector256 vector = Vector256.Create(part); - part = buffer2.Span.Slice(offset, 32); 
- Vector256 vector2 = Vector256.Create(part); - vector = vector | vector2; - vector.CopyTo(store.Slice(offset, 32)); - offset += 32; - } - else if ((size - offset) >= 16) - { - var part = buffer.Span.Slice(offset, 16); - Vector128 vector = Vector128.Create(part); - part = buffer2.Span.Slice(offset, 16); - Vector128 vector2 = Vector128.Create(part); - vector = vector | vector2; - vector = Vector128.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 16)); - offset += 16; - } - else if ((size - offset) >= 8) - { - var part = buffer.Span.Slice(offset, 8); - Vector64 vector = Vector64.Create(part); - part = buffer2.Span.Slice(offset, 8); - Vector64 vector2 = Vector64.Create(part); - vector = vector | vector2; - vector.CopyTo(store.Slice(offset, 8)); - offset += 8; - } - else break; - } - - for (var i = offset; i < size; i++) - { - store[i] = (byte)(buffer.Span[i] | buffer2.Span[i]); - } - return builder.Build(); - } - - public static ArrowBuffer Xor(ArrowBuffer buffer, ArrowBuffer buffer2) - { - var builder = new ArrowBuffer.BitmapBuilder(buffer.Length * 8); - var store = builder.Span; - int offset = 0; - int size = buffer.Span.Length; - - while ((size - offset) >= 8) - { - if ((size - offset) >= 64) - { - var part = buffer.Span.Slice(offset, 64); - Vector512 vector = Vector512.Create(part); - part = buffer2.Span.Slice(offset, 64); - Vector512 vector2 = Vector512.Create(part); - vector = vector ^ vector2; - vector.CopyTo(store.Slice(offset, 64)); - offset += 64; - } - else if ((size - offset) >= 32) - { - var part = buffer.Span.Slice(offset, 32); - Vector256 vector = Vector256.Create(part); - part = buffer2.Span.Slice(offset, 32); - Vector256 vector2 = Vector256.Create(part); - vector = vector ^ vector2; - vector.CopyTo(store.Slice(offset, 32)); - offset += 32; - } - else if ((size - offset) >= 16) - { - var part = buffer.Span.Slice(offset, 16); - Vector128 vector = Vector128.Create(part); - part = buffer2.Span.Slice(offset, 16); - Vector128 vector2 = 
Vector128.Create(part); - vector = vector ^ vector2; - vector = Vector128.OnesComplement(vector); - vector.CopyTo(store.Slice(offset, 16)); - offset += 16; - } - else if ((size - offset) >= 8) - { - var part = buffer.Span.Slice(offset, 8); - Vector64 vector = Vector64.Create(part); - part = buffer2.Span.Slice(offset, 8); - Vector64 vector2 = Vector64.Create(part); - vector = vector ^ vector2; - vector.CopyTo(store.Slice(offset, 8)); - offset += 8; - } - else break; - } - - for (var i = offset; i < size; i++) - { - store[i] = (byte)(buffer.Span[i] ^ buffer2.Span[i]); - } - return builder.Build(); - } -} /// /// Specifies how null values should be handled in comparison operations. @@ -284,7 +48,7 @@ public static class Comparison /// public static BooleanArray Invert(BooleanArray mask, MemoryAllocator? allocator = null) { - var inverted = BitVectorOps.OnesComplement(mask.ValueBuffer); + var inverted = BitVectorOps.OnesComplement(mask.ValueBuffer, allocator); var invertedmask = new BooleanArray(inverted, mask.NullBitmapBuffer.Clone(), mask.Length, mask.NullCount, 0); return invertedmask; } @@ -308,14 +72,14 @@ public static BooleanArray Invert(BooleanArray mask, MemoryAllocator? allocator public static BooleanArray And(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); - var combined = BitVectorOps.And(lhs.ValueBuffer, rhs.ValueBuffer); - var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var combined = BitVectorOps.And(lhs.ValueBuffer, rhs.ValueBuffer, allocator); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer, allocator); var nullCount = BitUtility.CountBits(combinedMask.Span); return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } /// - /// Performa a pairwise boolean OR operation. + /// Perform a pairwise boolean OR operation. 
/// /// /// @@ -325,14 +89,14 @@ public static BooleanArray And(BooleanArray lhs, BooleanArray rhs, MemoryAllocat public static BooleanArray Or(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); - var combined = BitVectorOps.Or(lhs.ValueBuffer, rhs.ValueBuffer); - var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var combined = BitVectorOps.Or(lhs.ValueBuffer, rhs.ValueBuffer, allocator); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer, allocator); var nullCount = BitUtility.CountBits(combinedMask.Span); return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } /// - /// Performa a pairwise boolean equality operation. + /// Perform a pairwise boolean equality operation. /// /// /// @@ -342,14 +106,14 @@ public static BooleanArray Or(BooleanArray lhs, BooleanArray rhs, MemoryAllocato public static BooleanArray Equals(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); - var combined = BitVectorOps.OnesComplement(BitVectorOps.Xor(lhs.ValueBuffer, rhs.ValueBuffer)); - var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var combined = BitVectorOps.OnesComplement(BitVectorOps.Xor(lhs.ValueBuffer, rhs.ValueBuffer, allocator)); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer, allocator); var nullCount = BitUtility.CountBits(combinedMask.Span); return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } /// - /// Performa a pairwise boolean XOR operation. + /// Perform a pairwise boolean XOR operation. 
/// /// /// @@ -359,8 +123,8 @@ public static BooleanArray Equals(BooleanArray lhs, BooleanArray rhs, MemoryAllo public static BooleanArray Xor(BooleanArray lhs, BooleanArray rhs, MemoryAllocator? allocator = null) { if (lhs.Length != rhs.Length) throw new ArgumentException("Arrays must have the same length"); - var combined = BitVectorOps.Xor(lhs.ValueBuffer, rhs.ValueBuffer); - var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer); + var combined = BitVectorOps.Xor(lhs.ValueBuffer, rhs.ValueBuffer, allocator); + var combinedMask = BitVectorOps.And(lhs.NullBitmapBuffer, rhs.NullBitmapBuffer, allocator); var nullCount = BitUtility.CountBits(combinedMask.Span); return new BooleanArray(combined, combinedMask, lhs.Length, nullCount, 0); } diff --git a/test/Apache.Arrow.Operations.Tests/Apache.Arrow.Operations.Tests.csproj b/test/Apache.Arrow.Operations.Tests/Apache.Arrow.Operations.Tests.csproj index d21e4e60..4cc754eb 100644 --- a/test/Apache.Arrow.Operations.Tests/Apache.Arrow.Operations.Tests.csproj +++ b/test/Apache.Arrow.Operations.Tests/Apache.Arrow.Operations.Tests.csproj @@ -1,16 +1,6 @@ - - - true - true - - true - - - net8.0 @@ -20,13 +10,7 @@ - - - all - runtime; build; native; contentfiles; analyzers - - - + all runtime; build; native; contentfiles; analyzers @@ -38,4 +22,4 @@ - \ No newline at end of file +