diff --git a/src/Apache.Arrow/Arrays/BinaryArray.cs b/src/Apache.Arrow/Arrays/BinaryArray.cs
index 2d11207c..80985d0f 100644
--- a/src/Apache.Arrow/Arrays/BinaryArray.cs
+++ b/src/Apache.Arrow/Arrays/BinaryArray.cs
@@ -62,4 +62,5 @@ public abstract class BuilderBase<TArray, TBuilder> : IArrowArrayBuilder<byte, T
             protected int NullCount => this.ValidityBuffer.UnsetBitCount;
+            private int _availableValueBufferByteCount;
 
             protected BuilderBase(IArrowType dataType)
             {
@@ -82,6 +83,51 @@ protected BuilderBase(IArrowType dataType)
 
             protected abstract TArray Build(ArrayData data);
 
+            /// <summary>
+            /// Returns writable value-buffer space without changing the committed buffer length.
+            /// </summary>
+            /// <remarks>
+            /// Nothing written to the returned span is part of the array until
+            /// <see cref="AdvanceValueBuffer"/> commits it.
+            /// </remarks>
+            /// <param name="sizeHint">The minimum number of writable bytes required.</param>
+            /// <returns>A span starting at the first uncommitted byte in the value buffer.</returns>
+            /// <exception cref="ArgumentOutOfRangeException"><paramref name="sizeHint"/> is negative.</exception>
+            protected Span<byte> GetValueBufferSpan(int sizeHint)
+            {
+                if (sizeHint < 0)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(sizeHint));
+                }
+
+                ValueBuffer.Reserve(sizeHint);
+                Span<byte> span = ValueBuffer.Span.Slice(ValueBuffer.Length);
+
+                // Remember how much was handed out so AdvanceValueBuffer can reject over-commits.
+                _availableValueBufferByteCount = span.Length;
+                return span;
+            }
+
+            /// <summary>
+            /// Commits bytes previously written into the span returned by <see cref="GetValueBufferSpan"/>.
+            /// </summary>
+            /// <param name="count">The number of bytes written to the span returned by the latest call.</param>
+            /// <exception cref="ArgumentOutOfRangeException">
+            /// <paramref name="count"/> is negative or exceeds the length of the most recently returned span.
+            /// </exception>
+            protected void AdvanceValueBuffer(int count)
+            {
+                if (count < 0 || count > _availableValueBufferByteCount)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(count));
+                }
+
+                ValueBuffer.Resize(checked(ValueBuffer.Length + count));
+
+                // Each span may be committed once; a new GetValueBufferSpan call is needed before advancing again.
+                _availableValueBufferByteCount = 0;
+            }
+
             /// <summary>
             /// Gets the length of the array built so far.
             /// </summary>
diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs
index 918f8283..4998fae1 100644
--- a/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/src/Apache.Arrow/Arrays/StringArray.cs
@@ -43,13 +43,63 @@ public Builder Append(string value, Encoding encoding = null)
                 {
                     return AppendNull();
                 }
+
                 encoding = encoding ?? DefaultEncoding;
-                byte[] span = encoding.GetBytes(value);
-                return Append(span.AsSpan());
+
+                // Encode straight into the value buffer instead of materializing a temporary byte[].
+                int byteCount = encoding.GetByteCount(value);
+                Span<byte> destination = GetValueBufferSpan(byteCount).Slice(0, byteCount);
+                int bytesWritten = 0;
+
+                // Pinning an empty span yields a null pointer, which Encoding.GetBytes rejects.
+                if (byteCount > 0)
+                {
+                    unsafe
+                    {
+                        fixed (char* chars = value)
+                        fixed (byte* data = destination)
+                        {
+                            bytesWritten = encoding.GetBytes(chars, value.Length, data, destination.Length);
+                        }
+                    }
+                }
+
+                // Commit what the encoder reported so the offsets always match the committed bytes.
+                AdvanceValueBuffer(bytesWritten);
+                ValidityBuffer.Append(true);
+                Offset += bytesWritten;
+                ValueOffsets.Append(Offset);
+                return this;
             }
 
+            /// <summary>
+            /// Appends every value in <paramref name="values"/>, treating <c>null</c> entries as null slots.
+            /// </summary>
+            /// <param name="values">The strings to append.</param>
+            /// <param name="encoding">The text encoding to use; defaults to <see cref="DefaultEncoding"/>.</param>
+            /// <returns>The builder, so that calls can be chained.</returns>
             public Builder AppendRange(IEnumerable<string> values, Encoding encoding = null)
             {
+                encoding = encoding ?? DefaultEncoding;
+
+                // A collection can be measured up front, so reserve space in all three buffers before
+                // appending. Lazy sequences skip this to avoid enumerating them twice.
+                if (values is ICollection<string> collection && collection.Count > 0)
+                {
+                    int totalByteCount = 0;
+                    foreach (string value in collection)
+                    {
+                        if (value != null)
+                        {
+                            totalByteCount = checked(totalByteCount + encoding.GetByteCount(value));
+                        }
+                    }
+
+                    ValueOffsets.Reserve(collection.Count);
+                    ValidityBuffer.Reserve(collection.Count);
+                    ValueBuffer.Reserve(totalByteCount);
+                }
+
                 foreach (string value in values)
                 {
                     Append(value, encoding);
diff --git a/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
new file mode 100644
index 00000000..4da75e8e
--- /dev/null
+++ b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using BenchmarkDotNet.Attributes;
+
+namespace Apache.Arrow.Benchmarks
+{
+    /// <summary>
+    /// Measures appending many short strings to a <see cref="StringArray.Builder"/>, one at a time and as a range.
+    /// </summary>
+    [MemoryDiagnoser]
+    [ShortRunJob]
+    public class StringBuilderAppendBenchmark
+    {
+        private const int Count = 10_000;
+        private string _payload;
+        private string[] _values;
+
+        [GlobalSetup]
+        public void GlobalSetup()
+        {
+            _payload = new string('a', 32);
+            _values = new string[Count];
+
+            for (int i = 0; i < _values.Length; i++)
+            {
+                _values[i] = _payload;
+            }
+        }
+
+        [Benchmark]
+        public int AppendSmallStrings()
+        {
+            var builder = new StringArray.Builder();
+
+            for (int i = 0; i < Count; i++)
+            {
+                builder.Append(_payload);
+            }
+
+            using StringArray array = builder.Build();
+            return array.Length;
+        }
+
+        [Benchmark]
+        public int AppendRangeSmallStrings()
+        {
+            using StringArray array = new StringArray.Builder()
+                .AppendRange(_values)
+                .Build();
+
+            return array.Length;
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs
index b1973153..d79726f1 100644
--- a/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -13,6 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+using System.Collections.Generic;
+using System.Text;
 using Xunit;
 
 namespace Apache.Arrow.Tests
@@ -81,5 +83,100 @@ public void ReturnsAppendedValueMaterialize(string firstValue, string secondValu
                 Assert.Equal(firstValue, retrievedValue);
             }
         }
+
+        public class Builder
+        {
+            [Fact]
+            public void AppendUsesCustomEncoding()
+            {
+                const string expected = "héllø";
+
+                var array = new StringArray.Builder()
+                    .Append(expected, Encoding.Unicode)
+                    .Build();
+
+                Assert.Equal(expected, array.GetString(0, Encoding.Unicode));
+            }
+
+            [Fact]
+            public void AppendKeepsOffsetsAlignedAfterMultiByteValue()
+            {
+                const string multiByte = "héllø";
+
+                var array = new StringArray.Builder()
+                    .Append(multiByte)
+                    .Append("tail")
+                    .Build();
+
+                Assert.Equal(2, array.Length);
+                Assert.Equal(multiByte, array.GetString(0));
+                Assert.Equal("tail", array.GetString(1));
+            }
+
+            [Fact]
+            public void AppendLargeStringRoundTrips()
+            {
+                string expected = new string('x', 512);
+
+                var array = new StringArray.Builder()
+                    .Append(expected)
+                    .Build();
+
+                Assert.Equal(expected, array.GetString(0));
+            }
+
+            [Fact]
+            public void AppendRangePreservesCollectionValues()
+            {
+                string[] values = { "first", null, string.Empty, "last" };
+
+                var array = new StringArray.Builder()
+                    .AppendRange(values)
+                    .Build();
+
+                Assert.Equal(4, array.Length);
+                Assert.Equal(1, array.NullCount);
+                Assert.Equal("first", array.GetString(0));
+                Assert.Null(array.GetString(1));
+                Assert.Equal(string.Empty, array.GetString(2));
+                Assert.Equal("last", array.GetString(3));
+            }
+
+            [Fact]
+            public void AppendRangePreservesCollectionValuesWithCustomEncoding()
+            {
+                string[] values = { "héllø", null, string.Empty, "wørld" };
+
+                var array = new StringArray.Builder()
+                    .AppendRange(values, Encoding.Unicode)
+                    .Build();
+
+                Assert.Equal("héllø", array.GetString(0, Encoding.Unicode));
+                Assert.Null(array.GetString(1, Encoding.Unicode));
+                Assert.Equal(string.Empty, array.GetString(2, Encoding.Unicode));
+                Assert.Equal("wørld", array.GetString(3, Encoding.Unicode));
+            }
+
+            [Fact]
+            public void AppendRangePreservesEnumerableValues()
+            {
+                var array = new StringArray.Builder()
+                    .AppendRange(YieldValues())
+                    .Build();
+
+                Assert.Equal("first", array.GetString(0));
+                Assert.Null(array.GetString(1));
+                Assert.Equal(string.Empty, array.GetString(2));
+                Assert.Equal("last", array.GetString(3));
+            }
+
+            private static IEnumerable<string> YieldValues()
+            {
+                yield return "first";
+                yield return null;
+                yield return string.Empty;
+                yield return "last";
+            }
+        }
     }
 }