diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 5913b756f1c..ffcdbd7a0fa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -23,8 +23,11 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/dict_util_internal.h" #include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" +#include "arrow/util/ree_util.h" +#include "arrow/util/union_util.h" namespace arrow { @@ -36,29 +39,7 @@ namespace compute { namespace internal { namespace { -Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& arr = batch[0].array; - ArraySpan* out_span = out->array_span_mutable(); - if (arr.type->id() == Type::NA) { - // Input is all nulls => output is entirely false. - bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - false); - return Status::OK(); - } - - DCHECK_EQ(out_span->offset, 0); - DCHECK_LE(out_span->length, arr.length); - if (arr.MayHaveNulls()) { - // We could do a zero-copy optimization, but it isn't worth the added complexity - ::arrow::internal::CopyBitmap(arr.buffers[0].data, arr.offset, arr.length, - out_span->buffers[1].data, out_span->offset); - } else { - // Input has no nulls => output is entirely true. 
- bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - true); - } - return Status::OK(); -} +using NanOptionsState = OptionsWrapper; struct IsFiniteOperator { template @@ -82,8 +63,6 @@ struct IsInfOperator { } }; -using NanOptionsState = OptionsWrapper; - template static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_offset) { const T* data = arr.GetValues(1); @@ -101,45 +80,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of } } -Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& arr = batch[0].array; - ArraySpan* out_span = out->array_span_mutable(); - if (arr.type->id() == Type::NA) { - bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - true); - return Status::OK(); - } - - const auto& options = NanOptionsState::Get(ctx); - uint8_t* out_bitmap = out_span->buffers[1].data; - if (arr.GetNullCount() > 0) { - // Input has nulls => output is the inverted null (validity) bitmap. - InvertBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap, - out_span->offset); +static Status SetLogicalNullBits(KernelContext* ctx, const ArraySpan& span, + uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const Type::type t = span.type->id(); + if (t == Type::NA) { + // Input is all nulls, so all output bits are the same. 
+ bit_util::SetBitsTo(out_bitmap, out_offset, span.length, set_on_null); + } else if (t == Type::SPARSE_UNION) { + union_util::SetLogicalNullBitsSparse(span, out_bitmap, out_offset, set_on_null); + } else if (t == Type::DENSE_UNION) { + union_util::SetLogicalNullBitsDense(span, out_bitmap, out_offset, set_on_null); + } else if (t == Type::RUN_END_ENCODED) { + ree_util::SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } else if (t == Type::DICTIONARY) { + dict_util::SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); } else { - // Input has no nulls => output is entirely false. - bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false); - } + // Input is a type for which logical and physical nulls are the same, so we can + // use GetNullCount() and the validity bitmap + if (span.GetNullCount() > 0) { + // Input has nulls. The output is either the validity bitmap or the inverse of the + // validity bitmap. + if (set_on_null) { + InvertBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } else { + CopyBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } + } else { + // Input has no nulls, so all output bits are the same. + bit_util::SetBitsTo(out_bitmap, out_offset, span.length, !set_on_null); + } - if (is_floating(arr.type->id()) && options.nan_is_null) { - switch (arr.type->id()) { - case Type::FLOAT: - SetNanBits(arr, out_bitmap, out_span->offset); - break; - case Type::DOUBLE: - SetNanBits(arr, out_bitmap, out_span->offset); - break; - case Type::HALF_FLOAT: - SetNanBits(arr, out_bitmap, out_span->offset); - break; - default: - return Status::NotImplemented("NaN detection not implemented for type ", - arr.type->ToString()); + // If nan_is_null, we must also check for nans. 
+ if (is_floating(t) && NanOptionsState::Get(ctx).nan_is_null) { + switch (t) { + case Type::FLOAT: + SetNanBits(span, out_bitmap, out_offset); + break; + case Type::DOUBLE: + SetNanBits(span, out_bitmap, out_offset); + break; + case Type::HALF_FLOAT: + SetNanBits(span, out_bitmap, out_offset); + break; + default: + return Status::NotImplemented("NaN detection not implemented for type ", + span.type->ToString()); + } } } return Status::OK(); } +Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + ArraySpan* out_span = out->array_span_mutable(); + return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[1].data, + out_span->offset, false); +} + +Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + ArraySpan* out_span = out->array_span_mutable(); + return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[1].data, + out_span->offset, true); +} + struct IsNanOperator { template static constexpr OutType Call(KernelContext*, const InType& value, Status*) { @@ -243,20 +249,18 @@ std::shared_ptr MakeIsNanFunction(std::string name, FunctionDoc } Status TrueUnlessNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + // Set all bits in the output's value bitmap to true ArraySpan* out_span = out->array_span_mutable(); - if (out_span->buffers[0].data) { - // If there is a validity bitmap computed above the kernel - // invocation, we copy it to the output buffers - ::arrow::internal::CopyBitmap(out_span->buffers[0].data, out_span->offset, - out_span->length, out_span->buffers[1].data, - out_span->offset); - } else { - // But for all-valid inputs, the engine will skip allocating a - // validity bitmap, so we set everything to true - bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - true); - } - return Status::OK(); + bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, + true); + + // Set the output's validity bitmap based 
on the nullity of the input array + // NOTE: alternatively, we could switch this kernel's null handling back to + // NullHandling::INTERSECTION and change the validity checks in exec.cc so that + // they correctly handle logical nulls, but that would involve significant changes + // in exec.cc which might have more side effects + return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[0].data, + out_span->offset, false); } const FunctionDoc is_valid_doc( @@ -303,7 +307,7 @@ void RegisterScalarValidity(FunctionRegistry* registry) { /*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init); MakeFunction("true_unless_null", true_unless_null_doc, {InputType::Any()}, boolean(), - TrueUnlessNullExec, registry, NullHandling::INTERSECTION, + TrueUnlessNullExec, registry, NullHandling::COMPUTED_PREALLOCATE, /*can_write_into_slices=*/false); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", is_finite_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 4613176b48c..a7357744270 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -59,6 +59,55 @@ TEST_F(TestBooleanValidityKernels, TrueUnlessNull) { type_singleton(), "[null, true, true, null]"); } +void CheckValidityKernels(Datum input, Datum is_valid_expected) { + ASSERT_OK_AND_ASSIGN(auto is_null_expected, compute::Invert(is_valid_expected)); + BooleanScalar true_scalar(true); + NullScalar null_scalar; + ASSERT_OK_AND_ASSIGN(auto true_unlesss_null_expected, + compute::IfElse(is_valid_expected, true_scalar, null_scalar)); + + CheckScalarUnary("is_valid", input, is_valid_expected); + CheckScalarUnary("is_null", input, is_null_expected); + CheckScalarUnary("true_unless_null", input, true_unlesss_null_expected); +} + +TEST_F(TestBooleanValidityKernels, LogicalNulls) { + auto null_dict = + DictArrayFromJSON(dictionary(int8(), int8()), 
"[0, 2, 1]", "[0, 1, null]"); + CheckValidityKernels(null_dict, ArrayFromJSON(boolean(), "[true, false, true]")); + auto null_index = + DictArrayFromJSON(dictionary(int8(), int32()), "[null, 1, 0]", "[8, 2]"); + CheckValidityKernels(null_index, ArrayFromJSON(boolean(), "[false, true, true]")); + auto null_dict_and_index = DictArrayFromJSON(dictionary(int8(), boolean()), + "[1, null, 2, 0]", "[true, false, null]"); + CheckValidityKernels(null_dict_and_index, + ArrayFromJSON(boolean(), "[true, false, false, true]")); + + ASSERT_OK_AND_ASSIGN(auto ree, + RunEndEncode(ArrayFromJSON(int64(), "[11, 11, null, null, 12]"))); + CheckValidityKernels(ree, ArrayFromJSON(boolean(), "[true, true, false, false, true]")); + + ArrayVector children{ + ArrayFromJSON(int64(), "[1, 23, 45, null, null, -2, null]"), + ArrayFromJSON(float32(), "[null, 1.1, 2.2, null, -4.0, 1.5, 0.1]"), + ArrayFromJSON(utf8(), R"(["alpha", "", "beta", null, "gamma", "delta", null])"), + }; + auto type_ids = ArrayFromJSON(int8(), "[0, 1, 2, 2, 0, 2, 1]"); + auto fields = {field("a", int64()), field("b", float32()), field("c", utf8())}; + SparseUnionArray sparse(sparse_union(fields), 7, children, + type_ids->data()->buffers[1]); + ASSERT_OK(sparse.ValidateFull()); + CheckValidityKernels( + sparse, ArrayFromJSON(boolean(), "[true, true, true, false, false, true, true]")); + + auto offsets = ArrayFromJSON(int32(), "[0, 0, 0, 2, 3, 6, 3]"); + DenseUnionArray dense(dense_union(fields), 7, children, type_ids->data()->buffers[1], + offsets->data()->buffers[1]); + ASSERT_OK(dense.ValidateFull()); + CheckValidityKernels( + dense, ArrayFromJSON(boolean(), "[true, false, true, true, false, false, false]")); +} + TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) { CheckScalarUnary("is_null", std::make_shared(5), ArrayFromJSON(boolean(), "[true, true, true, true, true]")); diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index c93517140ca..e76a8dcdcc5 100644 --- 
a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -19,6 +19,7 @@ #include "arrow/array/array_dict.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" namespace arrow { @@ -51,6 +52,31 @@ int64_t LogicalNullCount(const ArraySpan& span) { return null_count; } +template +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const auto* indices_null_bit_map = span.buffers[0].data; + const auto& dictionary_span = span.dictionary(); + // TODO: Is this always non-null? + const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data; + + using CType = typename IndexArrowType::c_type; + const CType* indices_data = span.GetValues(1); + for (int64_t i = 0; i < span.length; i++) { + bool is_null = false; + if (indices_null_bit_map != nullptr && + !bit_util::GetBit(indices_null_bit_map, i + span.offset)) { + is_null = true; + } else { + CType current_index = indices_data[i]; + is_null = !bit_util::GetBit(dictionary_null_bit_map, + current_index + dictionary_span.offset); + } + + bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == is_null); + } +} + } // namespace int64_t LogicalNullCount(const ArraySpan& span) { @@ -78,5 +104,48 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } } + +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + if (span.dictionary().GetNullCount() == 0 || span.length == 0) { + if (set_on_null) { + internal::InvertBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } else { + internal::CopyBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } + return; + } + + const auto& dict_array_type = internal::checked_cast(*span.type); + switch (dict_array_type.index_type()->id()) { + case Type::UINT8: + SetLogicalNullBits(span, out_bitmap, out_offset, 
set_on_null); + break; + case Type::INT8: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::UINT16: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::INT16: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::UINT32: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::INT32: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::UINT64: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + default: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + } +} + } // namespace dict_util } // namespace arrow diff --git a/cpp/src/arrow/util/dict_util_internal.h b/cpp/src/arrow/util/dict_util_internal.h index a92733ae0f6..a946758b685 100644 --- a/cpp/src/arrow/util/dict_util_internal.h +++ b/cpp/src/arrow/util/dict_util_internal.h @@ -22,7 +22,15 @@ namespace arrow { namespace dict_util { +/// \brief Compute the logical null count of a dictionary-encoded array int64_t LogicalNullCount(const ArraySpan& span); +/// \brief Populate a bitmap based on the logical nulls in a dictionary-encoded array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null); + } // namespace dict_util } // namespace arrow diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 461d6804b8c..8685f70e9ce 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -47,6 +47,23 @@ int64_t LogicalNullCount(const ArraySpan& span) { return null_count; } +template +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const auto& values = ValuesArray(span); + const auto& values_bitmap = 
values.buffers[0].data; + + RunEndEncodedArraySpan ree_span(span); + auto end = ree_span.end(); + for (auto it = ree_span.begin(); it != end; ++it) { + const bool is_null = + values_bitmap && + !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); + bit_util::SetBitsTo(out_bitmap, out_offset, it.run_length(), set_on_null == is_null); + out_offset += it.run_length(); + } +} + } // namespace int64_t LogicalNullCount(const ArraySpan& span) { @@ -61,6 +78,19 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const auto type_id = RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } else if (type_id == Type::INT32) { + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } else { + DCHECK_EQ(type_id, Type::INT64); + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } +} + namespace internal { /// \pre 0 <= i < array_span.length() diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index 5c759f2e80d..16a2395acc2 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -57,6 +57,13 @@ Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type, /// \brief Compute the logical null count of an REE array int64_t LogicalNullCount(const ArraySpan& span); +/// \brief Populate a bitmap based on the logical nulls in an REE array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null); + namespace internal { /// \brief Uses binary-search to find the physical offset given a logical offset diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc 
index 6b4d752d868..56438f537d7 100644 --- a/cpp/src/arrow/util/union_util.cc +++ b/cpp/src/arrow/util/union_util.cc @@ -55,4 +55,18 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) { return null_count; } +void SetLogicalNullBitsSparse(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null) { + for (int64_t i = 0; i < span.length; i++) { + bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == span.IsNull(i)); + } +} + +void SetLogicalNullBitsDense(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null) { + for (int64_t i = 0; i < span.length; i++) { + bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == span.IsNull(i)); + } +} + } // namespace arrow::union_util diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h index 0f30d5a3278..04608d24536 100644 --- a/cpp/src/arrow/util/union_util.h +++ b/cpp/src/arrow/util/union_util.h @@ -27,5 +27,19 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span); /// \brief Compute the number of of logical nulls in a dense union array int64_t LogicalDenseUnionNullCount(const ArraySpan& span); +/// \brief Populate a bitmap based on the logical nulls in a sparse union array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls +void SetLogicalNullBitsSparse(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null); + +/// \brief Populate a bitmap based on the logical nulls in a dense union array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls +void SetLogicalNullBitsDense(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null); + } // namespace union_util } // namespace arrow