@@ -33,43 +33,43 @@ col_type_2 = col('"Type 2"')
3333col_speed = col(' "Speed"' )
3434col_attack = col(' "Attack"' )
3535
36- print ( df.aggregate(
36+ df.aggregate(
3737 [col_type_1],
3838 [
3939 f.approx_distinct(col_speed).alias(" Count" ),
4040 f.approx_median(col_speed).alias(" Median Speed" ),
4141 f.approx_percentile_cont(col_speed, 0.9 ).alias(" 90% Speed" ),
4242 ],
43- ))
43+ ).show( )
4444```
4545
4646
4747When ` group_by ` is ` None ` or an empty list, the aggregation is done over the whole
4848[`DataFrame`][datafusion.dataframe.DataFrame]. For grouping, the `group_by` list must contain at least one column.
4949
5050``` python exec="1" source="material-block" result="text" session="aggregations"
51- print ( df.aggregate(
51+ df.aggregate(
5252 [col_type_1],
5353 [
5454 f.max(col_speed).alias(" Max Speed" ),
5555 f.avg(col_speed).alias(" Avg Speed" ),
5656 f.min(col_speed).alias(" Min Speed" ),
5757 ],
58- ))
58+ ).show( )
5959```
6060
6161
6262More than one column can be used for grouping:
6363
6464``` python exec="1" source="material-block" result="text" session="aggregations"
65- print ( df.aggregate(
65+ df.aggregate(
6666 [col_type_1, col_type_2],
6767 [
6868 f.max(col_speed).alias(" Max Speed" ),
6969 f.avg(col_speed).alias(" Avg Speed" ),
7070 f.min(col_speed).alias(" Min Speed" ),
7171 ],
72- ))
72+ ).show( )
7373```
7474
7575
@@ -80,7 +80,7 @@ operation. These can also be overridden using the builder approach to setting an
8080parameters. When you use the builder, you must call ` build() ` to finish. For example, these two
8181expressions are equivalent.
8282
83- ``` python exec="1" source="material-block" result="text" session="aggregations"
83+ ``` python exec="1" source="material-block" session="aggregations"
8484first_1 = f.first_value(col(" a" ), order_by = [col(" a" )])
8585first_2 = f.first_value(col(" a" )).order_by(col(" a" )).build()
8686```
@@ -94,14 +94,14 @@ sort the Pokemon by their attack in increasing order and take the first value, w
9494Pokemon with the smallest attack value in each ` Type 1 ` .
9595
9696``` python exec="1" source="material-block" result="text" session="aggregations"
97- print ( df.aggregate(
97+ df.aggregate(
9898 [col(' "Type 1"' )],
9999 [
100100 f.first_value(
101101 col(' "Name"' ), order_by = [col(' "Attack"' ).sort(ascending = True )]
102102 ).alias(" Smallest Attack" )
103103 ],
104- ))
104+ ).show( )
105105```
106106
107107
@@ -112,9 +112,9 @@ time each. Suppose we want to create an array of all of the `Type 2` for each `T
113113Pokemon set. Since there will be many entries of ` Type 2 ` we only want each distinct value once.
113113
114114``` python exec="1" source="material-block" result="text" session="aggregations"
115- print ( df.aggregate(
115+ df.aggregate(
116116 [col_type_1], [f.array_agg(col_type_2, distinct = True ).alias(" Type 2 List" )]
117- ))
117+ ).show( )
118118```
119119
120120
@@ -128,14 +128,14 @@ df.filter(col_type_2.is_not_null()).aggregate(
128128 [col_type_1], [f.array_agg(col_type_2, distinct = True ).alias(" Type 2 List" )]
129129)
130130
131- print ( df.aggregate(
131+ df.aggregate(
132132 [col_type_1],
133133 [
134134 f.array_agg(col_type_2, distinct = True , filter = col_type_2.is_not_null()).alias(
135135 " Type 2 List"
136136 )
137137 ],
138- ))
138+ ).show( )
139139```
140140
141141
@@ -163,14 +163,14 @@ df.aggregate(
163163 ],
164164)
165165
166- print ( df.aggregate(
166+ df.aggregate(
167167 [col_type_1],
168168 [
169169 f.first_value(
170170 col_type_2, order_by = [col_attack], null_treatment = NullTreatment.IGNORE_NULLS
171171 ).alias(" Lowest Attack Type 2" )
172172 ],
173- ))
173+ ).show( )
174174```
175175
176176
@@ -185,13 +185,13 @@ Filter takes a single expression.
185185Suppose we want to find the speed values for only Pokemon that have low Attack values.
186186
187187``` python exec="1" source="material-block" result="text" session="aggregations"
188- print ( df.aggregate(
188+ df.aggregate(
189189 [col_type_1],
190190 [
191191 f.avg(col_speed).alias(" Avg Speed All" ),
192192 f.avg(col_speed, filter = col_attack < lit(50 )).alias(" Avg Speed Low Attack" ),
193193 ],
194- ))
194+ ).show( )
195195```
196196
197197
@@ -278,14 +278,14 @@ once:
278278``` python exec="1" source="material-block" result="text" session="aggregations"
279279from datafusion.expr import GroupingSet
280280
281- print ( df.aggregate(
281+ df.aggregate(
282282 [GroupingSet.rollup(col_type_1)],
283283 [
284284 f.count(col_speed).alias(" Count" ),
285285 f.avg(col_speed).alias(" Avg Speed" ),
286286 f.max(col_speed).alias(" Max Speed" ),
287287 ],
288- ).sort(col_type_1.sort(ascending = True , nulls_first = True )))
288+ ).sort(col_type_1.sort(ascending = True , nulls_first = True )).show( )
289289```
290290
291291
@@ -297,14 +297,14 @@ for that row and `1` when it is aggregated across.
297297Use ` .alias() ` to give the column a readable name:
298298
299299``` python exec="1" source="material-block" result="text" session="aggregations"
300- print ( df.aggregate(
300+ df.aggregate(
301301 [GroupingSet.rollup(col_type_1)],
302302 [
303303 f.count(col_speed).alias(" Count" ),
304304 f.avg(col_speed).alias(" Avg Speed" ),
305305 f.grouping(col_type_1).alias(" Is Total" ),
306306 ],
307- ).sort(col_type_1.sort(ascending = True , nulls_first = True )))
307+ ).sort(col_type_1.sort(ascending = True , nulls_first = True )).show( )
308308```
309309
310310
@@ -315,13 +315,13 @@ With two columns the hierarchy becomes more apparent. `rollup(Type 1, Type 2)` p
315315- one grand total row
316316
317317``` python exec="1" source="material-block" result="text" session="aggregations"
318- print ( df.aggregate(
318+ df.aggregate(
319319 [GroupingSet.rollup(col_type_1, col_type_2)],
320320 [f.count(col_speed).alias(" Count" ), f.avg(col_speed).alias(" Avg Speed" )],
321321).sort(
322322 col_type_1.sort(ascending = True , nulls_first = True ),
323323 col_type_2.sort(ascending = True , nulls_first = True ),
324- ))
324+ ).show( )
325325```
326326
327327
@@ -336,13 +336,13 @@ For our Pokemon data, `cube(Type 1, Type 2)` gives us stats broken down by the t
336336by ` Type 1 ` alone, by ` Type 2 ` alone, and a grand total — all in one query:
337337
338338``` python exec="1" source="material-block" result="text" session="aggregations"
339- print ( df.aggregate(
339+ df.aggregate(
340340 [GroupingSet.cube(col_type_1, col_type_2)],
341341 [f.count(col_speed).alias(" Count" ), f.avg(col_speed).alias(" Avg Speed" )],
342342).sort(
343343 col_type_1.sort(ascending = True , nulls_first = True ),
344344 col_type_2.sort(ascending = True , nulls_first = True ),
345- ))
345+ ).show( )
346346```
347347
348348
@@ -359,21 +359,21 @@ For example, if we want only the per-`Type 1` totals and per-`Type 2` totals —
359359full ` (Type 1, Type 2) ` detail rows or the grand total — we can ask for exactly that:
360360
361361``` python exec="1" source="material-block" result="text" session="aggregations"
362- print ( df.aggregate(
362+ df.aggregate(
363363 [GroupingSet.grouping_sets([col_type_1], [col_type_2])],
364364 [f.count(col_speed).alias(" Count" ), f.avg(col_speed).alias(" Avg Speed" )],
365365).sort(
366366 col_type_1.sort(ascending = True , nulls_first = True ),
367367 col_type_2.sort(ascending = True , nulls_first = True ),
368- ))
368+ ).show( )
369369```
370370
371371
372372Each row belongs to exactly one grouping level. The [`grouping`][datafusion.functions.grouping]
373373function tells you which level each row comes from:
374374
375375``` python exec="1" source="material-block" result="text" session="aggregations"
376- print ( df.aggregate(
376+ df.aggregate(
377377 [GroupingSet.grouping_sets([col_type_1], [col_type_2])],
378378 [
379379 f.count(col_speed).alias(" Count" ),
@@ -384,7 +384,7 @@ print(df.aggregate(
384384).sort(
385385 col_type_1.sort(ascending = True , nulls_first = True ),
386386 col_type_2.sort(ascending = True , nulls_first = True ),
387- ))
387+ ).show( )
388388```
389389
390390
0 commit comments