diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_diff_len.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_diff_len.py index 2488b2316bb85..25a2dbb37db01 100644 --- a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_diff_len.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_diff_len.py @@ -18,6 +18,7 @@ import pandas as pd from pyspark import pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.pandas.config import set_option, reset_option from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils @@ -46,37 +47,41 @@ def test_groupby_different_lengths(self): pd.DataFrame({"a": [1, 2, 6, 4, 4, 6, 4, 3, 7], "b": [4, 2, 7, 3, 3, 1, 1, 1, 2]}), ] - for pdf1, pdf2 in zip(pdfs1, pdfs2): - psdf1 = ps.from_pandas(pdf1) - psdf2 = ps.from_pandas(pdf2) - - for as_index in [True, False]: - if as_index: - - def sort(df): - return df.sort_index() - - else: - - def sort(df): - return df.sort_values("c").reset_index(drop=True) - - self.assert_eq( - sort(psdf1.groupby(psdf2.a, as_index=as_index).sum()), - sort(pdf1.groupby(pdf2.a, as_index=as_index).sum()), - almost=as_index, - ) - - self.assert_eq( - sort(psdf1.groupby(psdf2.a, as_index=as_index).c.sum()), - sort(pdf1.groupby(pdf2.a, as_index=as_index).c.sum()), - almost=as_index, - ) - self.assert_eq( - sort(psdf1.groupby(psdf2.a, as_index=as_index)["c"].sum()), - sort(pdf1.groupby(pdf2.a, as_index=as_index)["c"].sum()), - almost=as_index, - ) + for as_index in [True, False]: + # pandas 3 includes external group keys for as_index=False and can widen + # their dtype after aligning mismatched lengths. + almost = as_index or LooseVersion(pd.__version__) >= "3.0.0" + + if as_index: + + def sort(df): + return df.sort_index() + + else: + + def sort(df): + return df.sort_values("c").reset_index(drop=True) + + for i, (pdf1, pdf2) in enumerate(zip(pdfs1, pdfs2)): + psdf1 = ps.from_pandas(pdf1) + psdf2 = ps.from_pandas(pdf2) + + with self.subTest(i=i, as_index=as_index): + self.assert_eq( + sort(psdf1.groupby(psdf2.a, as_index=as_index).sum()), + sort(pdf1.groupby(pdf2.a, as_index=as_index).sum()), + almost=almost, + ) + self.assert_eq( + sort(psdf1.groupby(psdf2.a, as_index=as_index).c.sum()), + sort(pdf1.groupby(pdf2.a, as_index=as_index).c.sum()), + almost=almost, + ) + self.assert_eq( + sort(psdf1.groupby(psdf2.a, as_index=as_index)["c"].sum()), + sort(pdf1.groupby(pdf2.a, as_index=as_index)["c"].sum()), + almost=almost, + ) class GroupByDiffLenTests(