Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions src/Apache.Arrow/Arrays/BinaryArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public abstract class BuilderBase<TArray, TBuilder> : IArrowArrayBuilder<byte, T
protected ArrowBuffer.BitmapBuilder ValidityBuffer { get; }
protected int Offset { get; set; }
protected int NullCount => this.ValidityBuffer.UnsetBitCount;
private int _availableValueBufferByteCount;

protected BuilderBase(IArrowType dataType)
{
Expand All @@ -82,6 +83,44 @@ protected BuilderBase(IArrowType dataType)

protected abstract TArray Build(ArrayData data);

/// <summary>
/// Exposes the uncommitted tail of the value buffer for direct writes. The committed
/// length of the value buffer is left untouched; call <see cref="AdvanceValueBuffer"/>
/// afterwards to commit the bytes that were written.
/// </summary>
/// <param name="sizeHint">The number of bytes the caller intends to write; passed to the value buffer's reserve call.</param>
/// <returns>The slice of the value buffer's span that begins at its current length.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="sizeHint"/> is negative.</exception>
protected Span<byte> GetValueBufferSpan(int sizeHint)
{
    if (sizeHint < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sizeHint));
    }

    ValueBuffer.Reserve(sizeHint);

    int committedLength = ValueBuffer.Length;
    Span<byte> writable = ValueBuffer.Span.Slice(committedLength);

    // Remember how much room was handed out so AdvanceValueBuffer can reject larger commits.
    _availableValueBufferByteCount = writable.Length;
    return writable;
}

/// <summary>
/// Grows the committed length of the value buffer by <paramref name="count"/> bytes, which the
/// caller has already written into the span obtained from <see cref="GetValueBufferSpan"/>.
/// </summary>
/// <param name="count">How many bytes to commit; must not exceed the length of the span returned by the latest <see cref="GetValueBufferSpan"/> call.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="count"/> is negative or larger than the space recorded by the latest
/// <see cref="GetValueBufferSpan"/> call.
/// </exception>
protected void AdvanceValueBuffer(int count)
{
    bool fitsLatestSpan = count >= 0 && count <= _availableValueBufferByteCount;
    if (!fitsLatestSpan)
    {
        throw new ArgumentOutOfRangeException(nameof(count));
    }

    int committedLength = checked(ValueBuffer.Length + count);
    ValueBuffer.Resize(committedLength);

    // The handed-out span is consumed; a new GetValueBufferSpan call is needed before advancing again.
    _availableValueBufferByteCount = 0;
}

/// <summary>
/// Gets the length of the array built so far.
/// </summary>
Expand Down
40 changes: 38 additions & 2 deletions src/Apache.Arrow/Arrays/StringArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,49 @@ public Builder Append(string value, Encoding encoding = null)
{
return AppendNull();
}

encoding = encoding ?? DefaultEncoding;
byte[] span = encoding.GetBytes(value);
return Append(span.AsSpan());

int byteCount = encoding.GetByteCount(value);
Comment thread
InCerryGit marked this conversation as resolved.
Span<byte> destination = GetValueBufferSpan(byteCount).Slice(0, byteCount);

if (byteCount > 0)
{
unsafe
{
fixed (char* chars = value)
fixed (byte* data = destination)
encoding.GetBytes(chars, value.Length, data, byteCount);
}
}

AdvanceValueBuffer(byteCount);
ValidityBuffer.Append(true);
Offset += byteCount;
ValueOffsets.Append(Offset);
return this;
}

public Builder AppendRange(IEnumerable<string> values, Encoding encoding = null)
{
encoding = encoding ?? DefaultEncoding;

if (values is ICollection<string> collection && collection.Count > 0)
{
int totalByteCount = 0;
foreach (string value in collection)
{
if (value != null)
{
totalByteCount = checked(totalByteCount + encoding.GetByteCount(value));
}
}

ValueOffsets.Reserve(collection.Count);
ValidityBuffer.Reserve(collection.Count);
ValueBuffer.Reserve(totalByteCount);
}

foreach (string value in values)
{
Append(value, encoding);
Expand Down
64 changes: 64 additions & 0 deletions test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

using BenchmarkDotNet.Attributes;

namespace Apache.Arrow.Benchmarks
{
    /// <summary>
    /// Benchmarks building a <see cref="StringArray"/> from many short strings, either by
    /// appending one value at a time or by passing the whole array to AppendRange.
    /// </summary>
    [MemoryDiagnoser]
    [ShortRunJob]
    public class StringBuilderAppendBenchmark
    {
        // Number of strings appended per benchmark invocation.
        private const int Count = 10_000;

        // The single 32-character string that every appended value refers to.
        private string _payload;

        // Count references to _payload, used as the AppendRange input.
        private string[] _values;

        /// <summary>
        /// Creates the shared payload string and fills the input array with it.
        /// </summary>
        [GlobalSetup]
        public void GlobalSetup()
        {
            _payload = new string('a', 32);

            var filled = new string[Count];
            int index = 0;
            while (index < filled.Length)
            {
                filled[index] = _payload;
                index++;
            }

            _values = filled;
        }

        /// <summary>
        /// Appends the payload <c>Count</c> times through the single-value Append overload.
        /// </summary>
        /// <returns>The length of the built array.</returns>
        [Benchmark]
        public int AppendSmallStrings()
        {
            StringArray.Builder target = new StringArray.Builder();

            int appended = 0;
            while (appended < Count)
            {
                target.Append(_payload);
                appended++;
            }

            using (StringArray built = target.Build())
            {
                return built.Length;
            }
        }

        /// <summary>
        /// Appends the whole prepared array with a single AppendRange call.
        /// </summary>
        /// <returns>The length of the built array.</returns>
        [Benchmark]
        public int AppendRangeSmallStrings()
        {
            StringArray.Builder target = new StringArray.Builder().AppendRange(_values);

            using (StringArray built = target.Build())
            {
                return built.Length;
            }
        }
    }
}
80 changes: 80 additions & 0 deletions test/Apache.Arrow.Tests/StringArrayTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

using System.Collections.Generic;
using System.Text;
using Xunit;
Comment thread
InCerryGit marked this conversation as resolved.

namespace Apache.Arrow.Tests
Expand Down Expand Up @@ -81,5 +83,83 @@ public void ReturnsAppendedValueMaterialize(string firstValue, string secondValu
Assert.Equal(firstValue, retrievedValue);
}
}

public class Builder
{
    /// <summary>
    /// A string appended with UTF-16 encoding reads back unchanged when decoded with the same encoding.
    /// </summary>
    [Fact]
    public void AppendUsesCustomEncoding()
    {
        const string expected = "héllø";
        Encoding utf16 = Encoding.Unicode;

        StringArray.Builder builder = new StringArray.Builder().Append(expected, utf16);
        StringArray array = builder.Build();

        Assert.Equal(expected, array.GetString(0, utf16));
    }

    /// <summary>
    /// A 512-character string appended with the default encoding reads back unchanged.
    /// </summary>
    [Fact]
    public void AppendLargeStringUsesFallbackPath()
    {
        string expected = new string('x', 512);

        StringArray.Builder builder = new StringArray.Builder().Append(expected);
        StringArray array = builder.Build();

        Assert.Equal(expected, array.GetString(0));
    }

    /// <summary>
    /// AppendRange over an array keeps regular, null and empty entries in order.
    /// </summary>
    [Fact]
    public void AppendRangePreservesCollectionValues()
    {
        string[] values = { "first", null, string.Empty, "last" };

        StringArray.Builder builder = new StringArray.Builder().AppendRange(values);
        StringArray array = builder.Build();

        AssertFirstNullEmptyLast(array);
    }

    /// <summary>
    /// AppendRange over an array with UTF-16 encoding keeps regular, null and empty entries in order.
    /// </summary>
    [Fact]
    public void AppendRangePreservesCollectionValuesWithCustomEncoding()
    {
        Encoding utf16 = Encoding.Unicode;
        string[] values = { "héllø", null, string.Empty, "wørld" };

        StringArray.Builder builder = new StringArray.Builder().AppendRange(values, utf16);
        StringArray array = builder.Build();

        Assert.Equal("héllø", array.GetString(0, utf16));
        Assert.Null(array.GetString(1, utf16));
        Assert.Equal(string.Empty, array.GetString(2, utf16));
        Assert.Equal("wørld", array.GetString(3, utf16));
    }

    /// <summary>
    /// AppendRange over a lazily produced sequence keeps regular, null and empty entries in order.
    /// </summary>
    [Fact]
    public void AppendRangePreservesEnumerableValues()
    {
        StringArray.Builder builder = new StringArray.Builder().AppendRange(YieldValues());
        StringArray array = builder.Build();

        AssertFirstNullEmptyLast(array);
    }

    // Checks the four slots produced from the inputs "first", null, empty, "last".
    private static void AssertFirstNullEmptyLast(StringArray array)
    {
        Assert.Equal("first", array.GetString(0));
        Assert.Null(array.GetString(1));
        Assert.Equal(string.Empty, array.GetString(2));
        Assert.Equal("last", array.GetString(3));
    }

    // Written as an iterator so the sequence handed to AppendRange is not an ICollection<string>.
    private static IEnumerable<string> YieldValues()
    {
        string[] items = { "first", null, string.Empty, "last" };
        foreach (string item in items)
        {
            yield return item;
        }
    }
}
}
}