Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
title: "Support 'countDist' (count distinct) metric in rollup for streaming expressions"
type: added
authors:
- name: khushjain
links:
- name: SOLR-18220
url: https://issues.apache.org/jira/browse/SOLR-18220
Original file line number Diff line number Diff line change
Expand Up @@ -1448,7 +1448,7 @@ For faster aggregation over low to moderate cardinality fields, the `facet` func
* `StreamExpression` (Mandatory)
* `over`: (Mandatory) A list of fields to group by.
* `metrics`: (Mandatory) The list of metrics to compute.
Currently supported metrics are `sum(col)`, `avg(col)`, `min(col)`, `max(col)`, `count(*)`, `missing(col)`.
Currently supported metrics are `sum(col)`, `avg(col)`, `min(col)`, `max(col)`, `count(*)`, `missing(col)`, `countDist(col)`.

=== rollup Syntax

Expand All @@ -1466,7 +1466,8 @@ rollup(
avg(a_i),
avg(a_f),
count(*),
missing(a_i)
missing(a_i),
countDist(a_i)
)
----

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.solr.client.solrj.io.stream.metrics;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import org.apache.solr.client.solrj.io.Tuple;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
Expand All @@ -29,6 +30,7 @@ public class CountDistinctMetric extends Metric {
public static final String APPROX_COUNT_DISTINCT = "hll";

private String columnName;
private HashSet<Object> distinctValues = new HashSet<>();

public CountDistinctMetric(String columnName) {
this(columnName, false);
Expand All @@ -53,6 +55,10 @@ public CountDistinctMetric(StreamExpression expression, StreamFactory factory)
expression,
functionName));
}
if (1 != expression.getParameters().size()) {
throw new IOException(
String.format(Locale.ROOT, "Invalid expression %s - unknown operands found", expression));
Comment thread
KhushJain marked this conversation as resolved.
}

init(functionName, columnName);
}
Expand All @@ -66,7 +72,10 @@ private void init(String functionName, String columnName) {

@Override
public void update(Tuple tuple) {
// Nop for now
Object value = tuple.get(columnName);
if (value != null) {
distinctValues.add(value);
}
Comment thread
KhushJain marked this conversation as resolved.
}

@Override
Expand All @@ -81,14 +90,11 @@ public String[] getColumns() {

@Override
public Number getValue() {
// No op for now
return null;
return distinctValues.size();
}

@Override
public StreamExpressionParameter toExpression(StreamFactory factory) throws IOException {
return new StreamExpression(getFunctionName())
.withParameter(columnName)
.withParameter(Boolean.toString(outputLong));
return new StreamExpression(getFunctionName()).withParameter(columnName);
Comment thread
KhushJain marked this conversation as resolved.
Comment thread
KhushJain marked this conversation as resolved.
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionParser;
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
import org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric;
import org.apache.solr.client.solrj.io.stream.metrics.CountMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MaxMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MeanMetric;
Expand Down Expand Up @@ -1343,7 +1344,7 @@ public void testRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand All @@ -1363,7 +1364,8 @@ public void testRollupStream() throws Exception {
.withFunctionName("min", MinMetric.class)
.withFunctionName("max", MaxMetric.class)
.withFunctionName("avg", MeanMetric.class)
.withFunctionName("count", CountMetric.class);
.withFunctionName("count", CountMetric.class)
.withFunctionName("countDist", CountDistinctMetric.class);

StreamExpression expression;
TupleStream stream;
Expand All @@ -1388,6 +1390,8 @@ public void testRollupStream() throws Exception {
+ "avg(a_i),"
+ "avg(a_f),"
+ "count(*),"
+ "countDist(a_i),"
+ "countDist(a_s)"
+ ")");
stream = factory.constructStream(expression);
stream.setStreamContext(streamContext);
Expand All @@ -1408,17 +1412,21 @@ public void testRollupStream() throws Exception {
Double avgi = tuple.getDouble("avg(a_i)");
Double avgf = tuple.getDouble("avg(a_f)");
Double count = tuple.getDouble("count(*)");
Double countDistI = tuple.getDouble("countDist(a_i)");
Double countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(3, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(1);
bucket = tuple.getString("a_s");
Expand All @@ -1431,6 +1439,8 @@ public void testRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello3", bucket);
assertEquals(38.0D, sumi, 0.0);
Expand All @@ -1442,6 +1452,8 @@ public void testRollupStream() throws Exception {
assertEquals(9.5D, avgi, 0.0);
assertEquals(6.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(4, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(2);
bucket = tuple.getString("a_s");
Expand All @@ -1454,6 +1466,8 @@ public void testRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello4", bucket);
assertEquals(15, sumi.longValue());
Expand All @@ -1465,6 +1479,8 @@ public void testRollupStream() throws Exception {
assertEquals(7.5D, avgi, 0.0);
assertEquals(5.5D, avgf, 0.0);
assertEquals(2, count, 0.0);
assertEquals(2, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

} finally {
solrClientCache.close();
Expand All @@ -1476,7 +1492,7 @@ public void testHashRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand All @@ -1497,6 +1513,7 @@ public void testHashRollupStream() throws Exception {
.withFunctionName("max", MaxMetric.class)
.withFunctionName("avg", MeanMetric.class)
.withFunctionName("count", CountMetric.class)
.withFunctionName("countDist", CountDistinctMetric.class)
.withFunctionName("sort", SortStream.class);

StreamExpression expression;
Expand All @@ -1522,6 +1539,8 @@ public void testHashRollupStream() throws Exception {
+ "avg(a_i),"
+ "avg(a_f),"
+ "count(*),"
+ "countDist(a_i),"
+ "countDist(a_s)"
+ "), by=\"avg(a_f) asc\")");
stream = factory.constructStream(expression);
stream.setStreamContext(streamContext);
Expand All @@ -1542,17 +1561,21 @@ public void testHashRollupStream() throws Exception {
Double avgi = tuple.getDouble("avg(a_i)");
Double avgf = tuple.getDouble("avg(a_f)");
Double count = tuple.getDouble("count(*)");
Double countDistI = tuple.getDouble("countDist(a_i)");
Double countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(3, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(1);
bucket = tuple.getString("a_s");
Expand All @@ -1565,6 +1588,8 @@ public void testHashRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

System.out.println("################:bucket" + bucket);

Expand All @@ -1578,6 +1603,8 @@ public void testHashRollupStream() throws Exception {
assertEquals(7.5D, avgi, 0.0);
assertEquals(5.5D, avgf, 0.0);
assertEquals(2, count, 0.0);
assertEquals(2, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

tuple = tuples.get(2);
bucket = tuple.getString("a_s");
Expand All @@ -1590,6 +1617,8 @@ public void testHashRollupStream() throws Exception {
avgi = tuple.getDouble("avg(a_i)");
avgf = tuple.getDouble("avg(a_f)");
count = tuple.getDouble("count(*)");
countDistI = tuple.getDouble("countDist(a_i)");
countDistS = tuple.getDouble("countDist(a_s)");

assertEquals("hello3", bucket);
assertEquals(38.0D, sumi, 0.0);
Expand All @@ -1601,6 +1630,8 @@ public void testHashRollupStream() throws Exception {
assertEquals(9.5D, avgi, 0.0);
assertEquals(6.5D, avgf, 0.0);
assertEquals(4, count, 0.0);
assertEquals(4, countDistI, 0.0);
assertEquals(1, countDistS, 0.0);

} finally {
solrClientCache.close();
Expand Down Expand Up @@ -1993,7 +2024,7 @@ public void testParallelRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand Down Expand Up @@ -2070,13 +2101,13 @@ public void testParallelRollupStream() throws Exception {
Double count = tuple.getDouble("count(*)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);

Expand Down Expand Up @@ -2135,7 +2166,7 @@ public void testParallelHashRollupStream() throws Exception {

new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2")
.add(id, "2", "a_s", "hello0", "a_i", "0", "a_f", "2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5")
Expand Down Expand Up @@ -2213,13 +2244,13 @@ public void testParallelHashRollupStream() throws Exception {
Double count = tuple.getDouble("count(*)");

assertEquals("hello0", bucket);
assertEquals(17.0D, sumi, 0.0);
assertEquals(15.0D, sumi, 0.0);
assertEquals(18.0D, sumf, 0.0);
assertEquals(0.0D, mini, 0.0);
assertEquals(1.0D, minf, 0.0);
assertEquals(14.0D, maxi, 0.0);
assertEquals(10.0D, maxf, 0.0);
assertEquals(4.25D, avgi, 0.0);
assertEquals(3.75D, avgi, 0.0);
assertEquals(4.5D, avgf, 0.0);
assertEquals(4, count, 0.0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionParser;
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
import org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric;
import org.apache.solr.client.solrj.io.stream.metrics.CountMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MaxMetric;
import org.apache.solr.client.solrj.io.stream.metrics.MeanMetric;
Expand Down Expand Up @@ -55,6 +56,7 @@ public StreamExpressionToExpressionTest() {
.withFunctionName("intersect", IntersectStream.class)
.withFunctionName("complement", ComplementStream.class)
.withFunctionName("count", CountMetric.class)
.withFunctionName("countDist", CountDistinctMetric.class)
.withFunctionName("sum", SumMetric.class)
.withFunctionName("min", MinMetric.class)
.withFunctionName("max", MaxMetric.class)
Expand Down Expand Up @@ -626,6 +628,19 @@ public void testCountMetric() throws Exception {
assertEquals("count(*)", expressionString);
}

@Test
public void testCountDistinctMetric() throws Exception {
  // Round-trip check: an expression parsed into a CountDistinctMetric must serialize back,
  // via toExpression, to exactly the same expression text.
  final String source = "countDist(foo)";
  final Metric roundTripped =
      new CountDistinctMetric(StreamExpressionParser.parse(source), factory);

  assertEquals(source, roundTripped.toExpression(factory).toString());
}

@Test
public void testMaxMetric() throws Exception {

Expand Down
Loading
Loading