From f63f8f0213b2a5b4e9a44818f4299f707bc42d5e Mon Sep 17 00:00:00 2001 From: InCerryGit Date: Sun, 26 Apr 2026 09:26:51 +0800 Subject: [PATCH 1/4] perf: improve string array builder append paths BenchmarkDotNet ShortRun, StringBuilderAppendBenchmark, 10,000 ASCII strings of length 32: - AppendSmallStrings: 432.0 us / 1.66 MB -> 341.0 us / 1157.5 KB - AppendRangeSmallStrings: 426.2 us / 1.66 MB -> 311.8 us / 353.68 KB --- src/Apache.Arrow/Arrays/StringArray.cs | 47 ++++++++++- .../StringBuilderAppendBenchmark.cs | 64 +++++++++++++++ test/Apache.Arrow.Tests/StringArrayTests.cs | 80 +++++++++++++++++++ 3 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs index 918f8283..881dfcd4 100644 --- a/src/Apache.Arrow/Arrays/StringArray.cs +++ b/src/Apache.Arrow/Arrays/StringArray.cs @@ -30,6 +30,8 @@ public class StringArray : BinaryArray, IReadOnlyList, ICollection { + private const int StackallocByteThreshold = 256; + public Builder() : base(StringType.Default) { } protected override StringArray Build(ArrayData data) @@ -43,13 +45,54 @@ public Builder Append(string value, Encoding encoding = null) { return AppendNull(); } + encoding = encoding ?? DefaultEncoding; - byte[] span = encoding.GetBytes(value); - return Append(span.AsSpan()); + + int byteCount = encoding.GetByteCount(value); + + if (byteCount == 0) + { + return Append(ReadOnlySpan.Empty); + } + + if (byteCount <= StackallocByteThreshold) + { + Span bytes = stackalloc byte[byteCount]; + + unsafe + { + fixed (char* chars = value) + fixed (byte* data = bytes) + encoding.GetBytes(chars, value.Length, data, byteCount); + } + + return Append(bytes); + } + + byte[] array = encoding.GetBytes(value); + return Append(array.AsSpan()); } public Builder AppendRange(IEnumerable values, Encoding encoding = null) { + encoding = encoding ?? DefaultEncoding; + + if (values is ICollection collection && collection.Count > 0) + { + int totalByteCount = 0; + foreach (string value in collection) + { + if (value != null) + { + totalByteCount = checked(totalByteCount + encoding.GetByteCount(value)); + } + } + + ValueOffsets.Reserve(collection.Count); + ValidityBuffer.Reserve(collection.Count); + ValueBuffer.Reserve(totalByteCount); + } + foreach (string value in values) { Append(value, encoding); diff --git a/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs new file mode 100644 index 00000000..4da75e8e --- /dev/null +++ b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using BenchmarkDotNet.Attributes; + +namespace Apache.Arrow.Benchmarks +{ + [MemoryDiagnoser] + [ShortRunJob] + public class StringBuilderAppendBenchmark + { + private const int Count = 10_000; + private string _payload; + private string[] _values; + + [GlobalSetup] + public void GlobalSetup() + { + _payload = new string('a', 32); + _values = new string[Count]; + + for (int i = 0; i < _values.Length; i++) + { + _values[i] = _payload; + } + } + + [Benchmark] + public int AppendSmallStrings() + { + var builder = new StringArray.Builder(); + + for (int i = 0; i < Count; i++) + { + builder.Append(_payload); + } + + using StringArray array = builder.Build(); + return array.Length; + } + + [Benchmark] + public int AppendRangeSmallStrings() + { + using StringArray array = new StringArray.Builder() + .AppendRange(_values) + .Build(); + + return array.Length; + } + } +} diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs index b1973153..25a2399b 100644 --- a/test/Apache.Arrow.Tests/StringArrayTests.cs +++ b/test/Apache.Arrow.Tests/StringArrayTests.cs @@ -13,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +using System.Text; +using System.Collections.Generic; using Xunit; namespace Apache.Arrow.Tests @@ -81,5 +83,83 @@ public void ReturnsAppendedValueMaterialize(string firstValue, string secondValu Assert.Equal(firstValue, retrievedValue); } } + + public class Builder + { + [Fact] + public void AppendUsesCustomEncoding() + { + const string expected = "héllø"; + + var array = new StringArray.Builder() + .Append(expected, Encoding.Unicode) + .Build(); + + Assert.Equal(expected, array.GetString(0, Encoding.Unicode)); + } + + [Fact] + public void AppendLargeStringUsesFallbackPath() + { + string expected = new string('x', 512); + + var array = new StringArray.Builder() + .Append(expected) + .Build(); + + Assert.Equal(expected, array.GetString(0)); + } + + [Fact] + public void AppendRangePreservesCollectionValues() + { + string[] values = { "first", null, string.Empty, "last" }; + + var array = new StringArray.Builder() + .AppendRange(values) + .Build(); + + Assert.Equal("first", array.GetString(0)); + Assert.Null(array.GetString(1)); + Assert.Equal(string.Empty, array.GetString(2)); + Assert.Equal("last", array.GetString(3)); + } + + [Fact] + public void AppendRangePreservesCollectionValuesWithCustomEncoding() + { + string[] values = { "héllø", null, string.Empty, "wørld" }; + + var array = new StringArray.Builder() + .AppendRange(values, Encoding.Unicode) + .Build(); + + Assert.Equal("héllø", array.GetString(0, Encoding.Unicode)); + Assert.Null(array.GetString(1, Encoding.Unicode)); + Assert.Equal(string.Empty, array.GetString(2, Encoding.Unicode)); + Assert.Equal("wørld", array.GetString(3, Encoding.Unicode)); + } + + [Fact] + public void AppendRangePreservesEnumerableValues() + { + var array = new StringArray.Builder() + .AppendRange(YieldValues()) + .Build(); + + Assert.Equal("first", array.GetString(0)); + Assert.Null(array.GetString(1)); + Assert.Equal(string.Empty, array.GetString(2)); + Assert.Equal("last", array.GetString(3)); + } + + private static IEnumerable YieldValues() + { + yield return "first"; + yield return null; + yield return string.Empty; + yield return "last"; + } + } } } From 648d6ee5ce5638313feb03c88ff5fd69f5ed1cd9 Mon Sep 17 00:00:00 2001 From: InCerryGit Date: Sun, 26 Apr 2026 12:40:28 +0800 Subject: [PATCH 2/4] perf: encode strings directly into builder buffer Write encoded string bytes directly into the builder value buffer after reserving capacity, avoiding both stackalloc staging and an extra copy while keeping offsets and validity updates unchanged. BenchmarkDotNet (StringBuilderAppendBenchmark): AppendSmallStrings 393.1 us / 1157.5 KB; AppendRangeSmallStrings 290.5 us / 353.68 KB. --- src/Apache.Arrow/Arrays/StringArray.cs | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs index 881dfcd4..61774eaf 100644 --- a/src/Apache.Arrow/Arrays/StringArray.cs +++ b/src/Apache.Arrow/Arrays/StringArray.cs @@ -30,8 +30,6 @@ public class StringArray : BinaryArray, IReadOnlyList, ICollection { - private const int StackallocByteThreshold = 256; - public Builder() : base(StringType.Default) { } protected override StringArray Build(ArrayData data) @@ -49,28 +47,28 @@ public Builder Append(string value, Encoding encoding = null) encoding = encoding ?? DefaultEncoding; int byteCount = encoding.GetByteCount(value); + int valueBufferStart = ValueBuffer.Length; - if (byteCount == 0) - { - return Append(ReadOnlySpan.Empty); - } + ValueBuffer.Reserve(byteCount); - if (byteCount <= StackallocByteThreshold) + if (byteCount > 0) { - Span bytes = stackalloc byte[byteCount]; + Span destination = ValueBuffer.Span.Slice(valueBufferStart, byteCount); unsafe { fixed (char* chars = value) - fixed (byte* data = bytes) + fixed (byte* data = destination) encoding.GetBytes(chars, value.Length, data, byteCount); } - return Append(bytes); + ValueBuffer.Resize(checked(valueBufferStart + byteCount)); } - byte[] array = encoding.GetBytes(value); - return Append(array.AsSpan()); + ValidityBuffer.Append(true); + Offset += byteCount; + ValueOffsets.Append(Offset); + return this; } public Builder AppendRange(IEnumerable values, Encoding encoding = null) From c47d59a9230aa834a0af7b1133f1af4d8c355850 Mon Sep 17 00:00:00 2001 From: InCerryGit Date: Sun, 26 Apr 2026 17:53:22 +0800 Subject: [PATCH 3/4] perf: add builder value buffer span helpers Encapsulate the reserve/get-span/advance sequence in BinaryArray.BuilderBase so StringArray.Builder can encode directly into the value buffer without owning the low-level buffer length bookkeeping. BenchmarkDotNet (StringBuilderAppendBenchmark): AppendSmallStrings 383.7 us / 1157.5 KB; AppendRangeSmallStrings 294.4 us / 353.68 KB. --- src/Apache.Arrow/Arrays/BinaryArray.cs | 39 ++++++++++++++++++++++++++ src/Apache.Arrow/Arrays/StringArray.cs | 9 ++---- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/src/Apache.Arrow/Arrays/BinaryArray.cs b/src/Apache.Arrow/Arrays/BinaryArray.cs index 2d11207c..80985d0f 100644 --- a/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -60,6 +60,7 @@ public abstract class BuilderBase : IArrowArrayBuilder this.ValidityBuffer.UnsetBitCount; + private int _availableValueBufferByteCount; protected BuilderBase(IArrowType dataType) { @@ -82,6 +83,44 @@ protected BuilderBase(IArrowType dataType) protected abstract TArray Build(ArrayData data); + /// + /// Returns writable value-buffer space without changing the committed buffer length. + /// + /// The minimum number of writable bytes required. + /// A span starting at the first uncommitted byte in the value buffer. + protected Span GetValueBufferSpan(int sizeHint) + { + if (sizeHint < 0) + { + throw new ArgumentOutOfRangeException(nameof(sizeHint)); + } + + ValueBuffer.Reserve(sizeHint); + Span span = ValueBuffer.Span.Slice(ValueBuffer.Length); + _availableValueBufferByteCount = span.Length; + return span; + } + + /// + /// Commits bytes previously written into the span returned by . + /// + /// The number of bytes written to the span returned by the latest call. + protected void AdvanceValueBuffer(int count) + { + if (count < 0) + { + throw new ArgumentOutOfRangeException(nameof(count)); + } + + if (count > _availableValueBufferByteCount) + { + throw new ArgumentOutOfRangeException(nameof(count)); + } + + ValueBuffer.Resize(checked(ValueBuffer.Length + count)); + _availableValueBufferByteCount = 0; + } + /// /// Gets the length of the array built so far. /// diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs index 61774eaf..4998fae1 100644 --- a/src/Apache.Arrow/Arrays/StringArray.cs +++ b/src/Apache.Arrow/Arrays/StringArray.cs @@ -47,24 +47,19 @@ public Builder Append(string value, Encoding encoding = null) encoding = encoding ?? DefaultEncoding; int byteCount = encoding.GetByteCount(value); - int valueBufferStart = ValueBuffer.Length; - - ValueBuffer.Reserve(byteCount); + Span destination = GetValueBufferSpan(byteCount).Slice(0, byteCount); if (byteCount > 0) { - Span destination = ValueBuffer.Span.Slice(valueBufferStart, byteCount); - unsafe { fixed (char* chars = value) fixed (byte* data = destination) encoding.GetBytes(chars, value.Length, data, byteCount); } - - ValueBuffer.Resize(checked(valueBufferStart + byteCount)); } + AdvanceValueBuffer(byteCount); ValidityBuffer.Append(true); Offset += byteCount; ValueOffsets.Append(Offset); From 842d0f7ec1923bd15200d67401b62ca5f5c19eb1 Mon Sep 17 00:00:00 2001 From: InCerryGit Date: Sun, 26 Apr 2026 23:06:19 +0800 Subject: [PATCH 4/4] style: sort StringArray test usings --- test/Apache.Arrow.Tests/StringArrayTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs index 25a2399b..d79726f1 100644 --- a/test/Apache.Arrow.Tests/StringArrayTests.cs +++ b/test/Apache.Arrow.Tests/StringArrayTests.cs @@ -13,8 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System.Text; using System.Collections.Generic; +using System.Text; using Xunit; namespace Apache.Arrow.Tests