diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index a7ede6e99d..09e85ce380 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ on: - docs/ - examples/ - rust/perspective-python/README.md - pull_request_target: + pull_request: branches: - master workflow_dispatch: diff --git a/rust/perspective-python/perspective/tests/table/test_view.py b/rust/perspective-python/perspective/tests/table/test_view.py index f80d483ec8..e59d13dca2 100644 --- a/rust/perspective-python/perspective/tests/table/test_view.py +++ b/rust/perspective-python/perspective/tests/table/test_view.py @@ -424,6 +424,151 @@ def test_view_aggregate_mean(self): {"__ROW_PATH__": ["a"], "y": 300 / 2}, ] + def test_view_aggregate_gmv(self): + data = { + "division": [ + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + "D1", + "D2", + ], + "trading area": [ + "A", + "B", + "C", + "D", + "E", + "A", + "B", + "C", + "D", + "E", + "A", + "B", + "C", + "D", + "E", + "A", + "B", + "C", + "D", + "E", + ], + "symbol": [ + "AAPL", + "GOOG", + "MSFT", + "AAPL", + "GOOG", + "MSFT", + "AAPL", + "GOOG", + "MSFT", + "AAPL", + "GOOG", + "MSFT", + "AAPL", + "GOOG", + "MSFT", + "AAPL", + "GOOG", + "MSFT", + "AAPL", + "GOOG", + ], + "MV": [ + 1500, + 1200, + 1300, + 1400, + 1600, + 1100, + 1700, + 1800, + 1900, + 2000, + -2100, + -2200, + -2300, + -2400, + -2500, + -2600, + -2700, + -2800, + -2900, + -3000, + ], + } + + tbl = Table(data) + view = tbl.view( + aggregates={"MV": "gmv"}, group_by=["division", "symbol"], columns=["MV"] + ) + + assert view.to_columns() == { + "__ROW_PATH__": [ + [], + ["D1"], + ["D1", "AAPL"], + ["D1", "GOOG"], + ["D1", "MSFT"], + ["D2"], + ["D2", "AAPL"], + ["D2", "GOOG"], + ["D2", "MSFT"], + ], + "MV": [10000, 5900, -2000, -3200, 700, 7100, 800, -2400, -3900], + } + + def test_view_aggregate_gmv_split_by(self): + data = { + "division": [ + "D1", "D2", "D1", "D2", "D1", "D2", "D1", "D2", "D1", "D2", + "D1", "D2", "D1", "D2", "D1", "D2", "D1", "D2", "D1", "D2", + ], + "symbol": [ + "AAPL", "GOOG", "MSFT", "AAPL", "GOOG", "MSFT", "AAPL", + "GOOG", "MSFT", "AAPL", "GOOG", "MSFT", "AAPL", "GOOG", + "MSFT", "AAPL", "GOOG", "MSFT", "AAPL", "GOOG", + ], + "MV": [ + 1500, 1200, 1300, 1400, 1600, 1100, 1700, 1800, 1900, 2000, + -2100, -2200, -2300, -2400, -2500, -2600, -2700, -2800, + -2900, -3000, + ], + } + + tbl = Table(data) + view = tbl.view( + aggregates={"MV": "gmv"}, + group_by=["division"], + split_by=["symbol"], + columns=["MV"], + ) + + assert view.to_columns() == { + "__ROW_PATH__": [[], ["D1"], ["D2"]], + "AAPL|MV": [2800, -2000, 800], + "GOOG|MV": [5600, -3200, -2400], + "MSFT|MV": [4600, 700, -3900], + } + def test_view_aggregate_mean_from_schema(self): data = [ {"a": "a", "x": 1, "y": 200}, diff --git a/rust/perspective-server/cpp/perspective/src/cpp/aggspec.cpp b/rust/perspective-server/cpp/perspective/src/cpp/aggspec.cpp index aa262faea0..3668855095 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/aggspec.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/aggspec.cpp @@ -247,6 +247,9 @@ t_aggspec::agg_str() const { case AGGTYPE_STANDARD_DEVIATION: { return "stddev"; } + case AGGTYPE_GMV: { + return "gmv"; + } default: { PSP_COMPLAIN_AND_ABORT("Unknown agg type"); return "unknown"; @@ -343,6 +346,7 @@ t_aggspec::get_output_specs(const t_schema& schema) const { case AGGTYPE_SUM: case AGGTYPE_SUM_ABS: case AGGTYPE_ABS_SUM: + case AGGTYPE_GMV: case AGGTYPE_PCT_SUM_PARENT: case AGGTYPE_PCT_SUM_GRAND_TOTAL: case AGGTYPE_MUL: diff --git a/rust/perspective-server/cpp/perspective/src/cpp/base.cpp b/rust/perspective-server/cpp/perspective/src/cpp/base.cpp index 6ce0298749..1cc8314f17 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/base.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/base.cpp @@ -589,6 +589,9 @@ str_to_aggtype(const std::string& str) { if (str == "stddev" || str == "standard deviation") { return t_aggtype::AGGTYPE_STANDARD_DEVIATION; } + if (str == "gmv") { + return t_aggtype::AGGTYPE_GMV; + } std::stringstream ss; ss << "Encountered unknown aggregate operation: '" << str << "'" diff --git a/rust/perspective-server/cpp/perspective/src/cpp/config.cpp b/rust/perspective-server/cpp/perspective/src/cpp/config.cpp index 75f92a2914..53c5636e07 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/config.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/config.cpp @@ -226,6 +226,7 @@ t_config::setup( case AGGTYPE_SUM_NOT_NULL: case AGGTYPE_SUM_ABS: case AGGTYPE_ABS_SUM: + case AGGTYPE_GMV: case AGGTYPE_MUL: case AGGTYPE_DISTINCT_COUNT: case AGGTYPE_DISTINCT_LEAF: diff --git a/rust/perspective-server/cpp/perspective/src/cpp/context_two.cpp b/rust/perspective-server/cpp/perspective/src/cpp/context_two.cpp index ea3fef00af..3606e1e0bc 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/context_two.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/context_two.cpp @@ -77,6 +77,19 @@ t_ctx2::init() { ); m_trees[treeidx]->init(); + + // Tell the tree how many of its `m_pivots` are row pivots and + // what the next missing row pivot is (if any). AGGTYPE_GMV uses + // this to roll up across row dimensions that aren't embedded in + // this particular tree — e.g. m_trees[0] has no row pivots, so + // a tree-leaf at e.g. [AAPL] needs to be partitioned by + // `row_pivots[0]` from the gstate to compute the gmv value. + const auto& row_pivots = m_config.get_row_pivots(); + std::string next_row_pivot; + if (treeidx < row_pivots.size()) { + next_row_pivot = row_pivots[treeidx].colname(); + } + m_trees[treeidx]->set_gmv_row_pivot_meta(treeidx, next_row_pivot); } m_rtraversal = std::make_shared(rtree()); @@ -1171,6 +1184,15 @@ t_ctx2::reset(bool reset_expressions) { ); m_trees[treeidx]->init(); m_trees[treeidx]->set_deltas_enabled(get_feature_state(CTX_FEAT_DELTA)); + + // See [t_ctx2::init] — gmv needs to know how many row pivots + // this particular tree carries vs. what's still in the gstate. + const auto& row_pivots = m_config.get_row_pivots(); + std::string next_row_pivot; + if (treeidx < row_pivots.size()) { + next_row_pivot = row_pivots[treeidx].colname(); + } + m_trees[treeidx]->set_gmv_row_pivot_meta(treeidx, next_row_pivot); } m_rtraversal = std::make_shared(rtree()); diff --git a/rust/perspective-server/cpp/perspective/src/cpp/extract_aggregate.cpp b/rust/perspective-server/cpp/perspective/src/cpp/extract_aggregate.cpp index 9469aed403..ecc607e9c3 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/extract_aggregate.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/extract_aggregate.cpp @@ -54,6 +54,7 @@ extract_aggregate( case AGGTYPE_SUM: case AGGTYPE_SUM_ABS: case AGGTYPE_ABS_SUM: + case AGGTYPE_GMV: case AGGTYPE_SUM_NOT_NULL: case AGGTYPE_MUL: case AGGTYPE_COUNT: diff --git a/rust/perspective-server/cpp/perspective/src/cpp/server.cpp b/rust/perspective-server/cpp/perspective/src/cpp/server.cpp index bfa159a90b..54c67960b8 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/server.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/server.cpp @@ -1453,6 +1453,7 @@ ProtoServer::_handle_request(std::uint32_t client_id, Request&& req) { number_opts.add_aggregates()->set_name("distinct count"); number_opts.add_aggregates()->set_name("dominant"); number_opts.add_aggregates()->set_name("first"); + number_opts.add_aggregates()->set_name("gmv"); number_opts.add_aggregates()->set_name("high"); number_opts.add_aggregates()->set_name("low"); number_opts.add_aggregates()->set_name("max"); diff --git a/rust/perspective-server/cpp/perspective/src/cpp/sparse_tree.cpp b/rust/perspective-server/cpp/perspective/src/cpp/sparse_tree.cpp index f0202b446c..d2198b9371 100644 --- a/rust/perspective-server/cpp/perspective/src/cpp/sparse_tree.cpp +++ b/rust/perspective-server/cpp/perspective/src/cpp/sparse_tree.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -86,11 +87,31 @@ t_stree::t_stree( m_aggspecs(aggspecs), m_schema(std::move(schema)), m_cur_aggidx(1), - m_has_delta(false) { + m_has_delta(false), + m_num_row_pivots_in_tree(pivots.size()) { const auto& g_agg_str = cfg.get_grand_agg_str(); m_grand_agg_str = g_agg_str.empty() ? "Grand Aggregate" : g_agg_str; } +void +t_stree::set_gmv_row_pivot_meta( + t_uindex num_row_pivots_in_tree, + const std::string& next_row_pivot_name +) { + m_num_row_pivots_in_tree = num_row_pivots_in_tree; + m_next_row_pivot_name = next_row_pivot_name; +} + +t_uindex +t_stree::get_num_row_pivots_in_tree() const { + return m_num_row_pivots_in_tree; +} + +const std::string& +t_stree::get_next_row_pivot_name() const { + return m_next_row_pivot_name; +} + t_stree::~t_stree() { for (auto& iter : m_smap) { free(const_cast(iter.first)); @@ -1815,6 +1836,131 @@ t_stree::update_agg_table( ); dst->set_scalar(dst_ridx, new_value); } break; + case AGGTYPE_GMV: { + // The user-facing rule is "leaf = sum, parent = sum over + // immediate row children of |raw sum of child subtree|". + // Under split_by this is complicated by t_ctx2 sharding + // the data across several t_stree instances — see + // [t_stree::set_gmv_row_pivot_meta] for the metadata that + // distinguishes "the next pivot in this tree is a row + // pivot" from "the next missing row pivot lives in the + // gstate". + old_value.set(dst->get_scalar(dst_ridx)); + const auto& col_name = spec.get_dependencies()[0].name(); + const auto dst_dtype = dst->get_dtype(); + const auto k_tree = m_num_row_pivots_in_tree; + const auto& next_row_pivot = m_next_row_pivot_name; + const bool has_missing_row_pivot = !next_row_pivot.empty(); + const auto depth = get_depth(nidx); + + auto sum_reducer = + [dst_dtype](std::vector& values) -> t_tscalar { + if (values.empty()) { + return mknone(); + } + t_tscalar v; + v.set(std::uint64_t(0)); + v.m_type = dst_dtype; + for (const auto& x : values) { + if (x.is_nan()) { + continue; + } + v = v.add(x.coerce_numeric_dtype(dst_dtype)); + } + return v; + }; + + // row_path_len = min(depth, k_tree). A node is a "row-leaf" + // iff there are no row pivots missing from this tree AND + // all row pivots are consumed at this node's depth — i.e. + // depth >= k_tree. + const bool is_row_leaf = + !has_missing_row_pivot && depth >= k_tree; + + if (is_row_leaf) { + auto pkeys = get_pkeys(nidx); + new_value.set( + reduce_from_gstate&)>>( + gstate, + expression_master_table, + col_name, + pkeys, + sum_reducer + ) + ); + } else if (depth < k_tree) { + // Row-parent and this tree's next pivot is the next + // row pivot. Tree children at depth+1 are the + // immediate row children — use them directly. + t_tscalar rval; + rval.set(std::uint64_t(0)); + rval.m_type = dst_dtype; + for (auto cidx : get_child_idx(nidx)) { + auto cpkeys = get_pkeys(cidx); + t_tscalar csum = reduce_from_gstate&)>>( + gstate, + expression_master_table, + col_name, + cpkeys, + sum_reducer + ); + if (csum.is_valid() && !csum.is_nan()) { + rval = rval.add(csum.abs()); + } + } + new_value.set(rval); + } else { + // Row-parent but this tree's pivots beyond `depth` + // (if any) are column pivots, so the immediate row + // children aren't tree children. Read the missing + // row pivot column for this node's pkeys, partition + // by it, sum each partition, sum |partition_sum|. + auto pkeys = get_pkeys(nidx); + std::vector pivot_vals; + std::vector data_vals; + read_column_from_gstate( + gstate, + expression_master_table, + next_row_pivot, + pkeys, + pivot_vals + ); + read_column_from_gstate( + gstate, + expression_master_table, + col_name, + pkeys, + data_vals + ); + + std::unordered_map partials; + for (std::size_t i = 0; + i < pivot_vals.size() && i < data_vals.size(); + ++i) { + const auto& x = data_vals[i]; + if (!x.is_valid() || x.is_nan()) { + continue; + } + auto& acc = partials[pivot_vals[i]]; + if (!acc.is_valid()) { + acc.set(std::uint64_t(0)); + acc.m_type = dst_dtype; + } + acc = acc.add(x.coerce_numeric_dtype(dst_dtype)); + } + + t_tscalar rval; + rval.set(std::uint64_t(0)); + rval.m_type = dst_dtype; + for (const auto& kv : partials) { + rval = rval.add(kv.second.abs()); + } + new_value.set(rval); + } + dst->set_scalar(dst_ridx, new_value); + } break; case AGGTYPE_MUL: { old_value.set(dst->get_scalar(dst_ridx)); auto pkeys = get_pkeys(nidx); diff --git a/rust/perspective-server/cpp/perspective/src/include/perspective/base.h b/rust/perspective-server/cpp/perspective/src/include/perspective/base.h index b3a2737dd7..0df6c435f3 100644 --- a/rust/perspective-server/cpp/perspective/src/include/perspective/base.h +++ b/rust/perspective-server/cpp/perspective/src/include/perspective/base.h @@ -293,7 +293,8 @@ enum t_aggtype { AGGTYPE_PCT_SUM_PARENT, AGGTYPE_PCT_SUM_GRAND_TOTAL, AGGTYPE_VARIANCE, - AGGTYPE_STANDARD_DEVIATION + AGGTYPE_STANDARD_DEVIATION, + AGGTYPE_GMV }; PERSPECTIVE_EXPORT t_aggtype str_to_aggtype(const std::string& str); diff --git a/rust/perspective-server/cpp/perspective/src/include/perspective/sparse_tree.h b/rust/perspective-server/cpp/perspective/src/include/perspective/sparse_tree.h index c561571316..15a46799bc 100644 --- a/rust/perspective-server/cpp/perspective/src/include/perspective/sparse_tree.h +++ b/rust/perspective-server/cpp/perspective/src/include/perspective/sparse_tree.h @@ -468,6 +468,25 @@ class PERSPECTIVE_EXPORT t_stree { t_symtable m_symtable; bool m_has_delta; std::string m_grand_agg_str; + + // Used by AGGTYPE_GMV under split_by. For t_ctx1 (group_by only) the + // tree's pivots are exactly the row pivots, so the default + // `m_num_row_pivots_in_tree == m_pivots.size()` makes every tree-leaf + // a "real" leaf and the next-row-pivot lookup unused. For t_ctx2 each + // m_trees[k] is initialized with the row+col split via + // `set_gmv_row_pivot_meta(k, next_row_pivot_name)`, so the gmv kernel + // can detect a "tree-leaf but row-parent" node and roll up across the + // missing row pivot dimension via the gstate. + t_uindex m_num_row_pivots_in_tree; + std::string m_next_row_pivot_name; + +public: + void set_gmv_row_pivot_meta( + t_uindex num_row_pivots_in_tree, + const std::string& next_row_pivot_name + ); + t_uindex get_num_row_pivots_in_tree() const; + const std::string& get_next_row_pivot_name() const; }; } // end namespace perspective