Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,7 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_validity.cc
compute/kernels/vector_array_sort.cc
compute/kernels/vector_cumulative_ops.cc
compute/kernels/vector_demean.cc
compute/kernels/vector_pairwise.cc
compute/kernels/vector_nested.cc
compute/kernels/vector_rank.cc
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ add_arrow_compute_test(vector_selection_test
EXTRA_LINK_LIBS
arrow_compute_kernels_testing)

add_arrow_compute_test(vector_demean_test
SOURCES
vector_demean_test.cc
EXTRA_LINK_LIBS
arrow_compute_kernels_testing)

add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute")
add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute")
add_arrow_benchmark(vector_partition_benchmark PREFIX "arrow-compute")
Expand Down
81 changes: 81 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_demean.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Include necessary Arrow headers
#include <arrow/compute/api_vector.h>
#include "arrow/compute/api_scalar.h"
#include "arrow/compute/function.h"
#include "arrow/compute/registry.h"
#include "arrow/util/logging.h"

namespace arrow {
namespace compute {
namespace internal {

namespace {
// Function documentation
const FunctionDoc demean_doc{
"Perform a demean operation over all the elements of the array",
("Returns an array where every element has been removed from \n"
"the mean of the array."),
Comment on lines +34 to +35
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO, fix desc

{"array"}};

class DemeanMetaFunction : public MetaFunction {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remind me what MetaFunction does?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh - is a "MetaFunction" a function that is composed of other functions?

public:
DemeanMetaFunction() : MetaFunction("demean", Arity::Unary(), demean_doc) {}

Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
ExecContext* ctx) const override {
switch (args[0].kind()) {
case Datum::ARRAY:
case Datum::CHUNKED_ARRAY: {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say support for chunked_array is not needed - more of a "nice to have". The main application of these kernels are to be used with Acero which doesn't use chunk arrays.

return Demean(*args[0].make_array(), ctx);
} break;
default:
break;
}
return Status::NotImplemented(
"Unsupported types for demean operation: "
"values=",
args[0].ToString());
}

private:
template <typename T>
static Result<Datum> Demean(const T& input, ExecContext* ctx) {
// TODO: Expose options?
Datum mean_result;
ARROW_ASSIGN_OR_RAISE(
mean_result, arrow::compute::CumulativeMean(input, CumulativeOptions(), ctx));
Datum demean_result;
ARROW_ASSIGN_OR_RAISE(
demean_result,
arrow::compute::Subtract(input, mean_result, ArithmeticOptions(), ctx));

return demean_result;
}
};
} // namespace

void RegisterVectorDemean(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::make_shared<DemeanMetaFunction>()));
}
} // namespace internal
} // namespace compute
} // namespace arrow
52 changes: 52 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_demean_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include "arrow/compute/api.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/compute/util.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"

namespace arrow {
namespace compute {

TEST(TestDemean, BasicDemean) {
constexpr int data_bufndx{1};
const std::vector<int32_t> test_result{0, 0, 0, 0, 0, 0};
const std::vector<int32_t> test_values{1, 1, 1, 1, 1, 1};
Int32Builder input_builder;
ASSERT_OK(input_builder.Reserve(test_values.size()));
ASSERT_OK(input_builder.AppendValues(test_values));
ASSERT_OK_AND_ASSIGN(auto test_inputs, input_builder.Finish());

ASSERT_OK_AND_ASSIGN(Datum demean_result, CallFunction("demean", {test_inputs}));
Copy link
Copy Markdown

@icexelloss icexelloss Dec 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure if this works with Acero - ultimately we want to use the demean function with Acero similar to [this] (The link is an aggregate node but we have something similar but uses vector kernels instead) (https://github.com/apache/arrow/blob/da3c6dde44575d3b68a20a0493bd3e9a1588aa14/cpp/src/arrow/acero/aggregate_internal.h#L229).

Which uses the kernel abstraction instead of the function abstraction in the node.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if the MetaFunction supports DispatchExact it should work:
https://github.com/apache/arrow/blob/main/cpp/src/arrow/acero/aggregate_internal.cc#L81C57-L81C70

@raulcd Can you test this out?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In any case, I think we should probably add a "test Acero vector node" to make this easier - basically an simple Acero node that pass each batch to the vector kernel and output the result - this would be useful to test other vector kernels too.

auto result_data = *(demean_result.array());

// validate each value
for (int val_ndx = 0; val_ndx < test_inputs->length(); ++val_ndx) {
int32_t expected_value = test_result[val_ndx];
int32_t actual_value = result_data.GetValues<int32_t>(data_bufndx)[val_ndx];
ASSERT_EQ(expected_value, actual_value);
}
}
} // namespace compute
} // namespace arrow
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
// Vector functions
RegisterVectorArraySort(registry.get());
RegisterVectorCumulativeSum(registry.get());
RegisterVectorDemean(registry.get());
RegisterVectorNested(registry.get());
RegisterVectorRank(registry.get());
RegisterVectorReplace(registry.get());
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/registry_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void RegisterScalarOptions(FunctionRegistry* registry);
// Vector functions
void RegisterVectorArraySort(FunctionRegistry* registry);
void RegisterVectorCumulativeSum(FunctionRegistry* registry);
void RegisterVectorDemean(FunctionRegistry* registry);
void RegisterVectorHash(FunctionRegistry* registry);
void RegisterVectorNested(FunctionRegistry* registry);
void RegisterVectorRank(FunctionRegistry* registry);
Expand Down