From b1274113dcdd2a22b116afe2e19cdcdc92f4c0a0 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 26 Feb 2026 17:33:20 -0800 Subject: [PATCH 1/6] Use random-shuffle list shuffling instead of random >= 1.3 --- dataframe.cabal | 4 ++-- examples/examples.cabal | 8 ++++---- flake.nix | 2 -- src/DataFrame/Operations/Permutation.hs | 3 ++- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dataframe.cabal b/dataframe.cabal index c10646bc..3c83b286 100644 --- a/dataframe.cabal +++ b/dataframe.cabal @@ -105,7 +105,8 @@ library hashable >= 1.2 && < 2, process ^>= 1.6, snappy-hs ^>= 0.1, - random >= 1.3 && < 2, + random >= 1.2 && < 1.3, + random-shuffle >= 0.0.4 && < 1, regex-tdfa >= 1.3.0 && < 2, scientific >=0.3.1 && <0.4, template-haskell >= 2.0 && < 3, @@ -209,7 +210,6 @@ test-suite tests directory >= 1.3.0.0 && < 2, HUnit ^>= 1.6, QuickCheck >= 2 && < 3, - random >= 1 && < 2, random-shuffle >= 0.0.4 && < 1, random >= 1 && < 2, text >= 2.0 && < 3, diff --git a/examples/examples.cabal b/examples/examples.cabal index c53d8786..20269194 100644 --- a/examples/examples.cabal +++ b/examples/examples.cabal @@ -85,7 +85,7 @@ executable chipotle hashable >= 1.2 && < 2, process ^>= 1.6, snappy-hs ^>= 0.1, - random >= 1.3 && < 2, + random >= 1.2 && < 1.3, regex-tdfa >= 1.3.0 && < 2, scientific >=0.3.1 && <0.4, template-haskell >= 2.0 && < 3, @@ -172,7 +172,7 @@ executable california_housing hasktorch >= 0.2.1.6 && < 0.3, process ^>= 1.6, snappy-hs ^>= 0.1, - random >= 1.3 && < 2, + random >= 1.2 && < 1.3, regex-tdfa >= 1.3.0 && < 2, scientific >=0.3.1 && <0.4, template-haskell >= 2.0 && < 3, @@ -259,7 +259,7 @@ executable one_billion_row_challenge hasktorch >= 0.2.1.6 && < 0.3, process ^>= 1.6, snappy-hs ^>= 0.1, - random >= 1.3 && < 2, + random >= 1.2 && < 1.3, regex-tdfa >= 1.3.0 && < 2, scientific >=0.3.1 && <0.4, template-haskell >= 2.0 && < 3, @@ -346,7 +346,7 @@ executable iris hasktorch >= 0.2.1.6 && < 0.3, process ^>= 1.6, snappy-hs ^>= 0.1, - random >= 1.3 && < 2, + random >= 1.2 && < 1.3, regex-tdfa >= 1.3.0 && < 2, scientific >=0.3.1 && <0.4, template-haskell >= 2.0 && < 3, diff --git a/flake.nix b/flake.nix index 0eae9846..7c10fab7 100644 --- a/flake.nix +++ b/flake.nix @@ -13,8 +13,6 @@ hsPkgs = pkgs.haskellPackages.extend (self: super: { dataframe = self.callCabal2nix "dataframe" ./. { }; - random = pkgs.haskellPackages.callHackage "random" "1.3.1" { }; - time-compat = pkgs.haskell.lib.dontCheck super.time-compat; }); in { diff --git a/src/DataFrame/Operations/Permutation.hs b/src/DataFrame/Operations/Permutation.hs index c2b31da7..a381f98b 100644 --- a/src/DataFrame/Operations/Permutation.hs +++ b/src/DataFrame/Operations/Permutation.hs @@ -18,6 +18,7 @@ import DataFrame.Internal.Expression import DataFrame.Internal.Row import DataFrame.Operations.Core import System.Random +import System.Random.Shuffle (shuffle') -- | Sort order taken as a parameter by the 'sortBy' function. data SortOrder where @@ -75,4 +76,4 @@ shuffle pureGen df = df{columns = V.map (atIndicesStable indexes) (columns df)} shuffledIndices :: (RandomGen g) => g -> Int -> VU.Vector Int -shuffledIndices pureGen k = VU.fromList (fst (uniformShuffleList [0 .. (k - 1)] pureGen)) +shuffledIndices pureGen k = VU.fromList (shuffle' [0 .. (k - 1)] k pureGen) From 7f57b1fd6af5a5c6d443c1ab26151e51346df58b Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 26 Feb 2026 18:59:25 -0800 Subject: [PATCH 2/6] Now with tests --- dataframe.cabal | 1 + tests/Main.hs | 2 ++ tests/Operations/Shuffle.hs | 72 +++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 tests/Operations/Shuffle.hs diff --git a/dataframe.cabal b/dataframe.cabal index 3c83b286..11e37944 100644 --- a/dataframe.cabal +++ b/dataframe.cabal @@ -199,6 +199,7 @@ test-suite tests Operations.Join, Operations.Merge, Operations.ReadCsv, + Operations.Shuffle, Operations.Sort, Operations.Subset, Operations.Statistics, diff --git a/tests/Main.hs b/tests/Main.hs index 37820ed2..426602a2 100644 --- a/tests/Main.hs +++ b/tests/Main.hs @@ -29,6 +29,7 @@ import qualified Operations.InsertColumn import qualified Operations.Join import qualified Operations.Merge import qualified Operations.ReadCsv +import qualified Operations.Shuffle import qualified Operations.Sort import qualified Operations.Statistics import qualified Operations.Subset @@ -5120,6 +5121,7 @@ tests = ++ Operations.Join.tests ++ Operations.Merge.tests ++ Operations.ReadCsv.tests + ++ Operations.Shuffle.tests ++ Operations.Sort.tests ++ Operations.Statistics.tests ++ Operations.Take.tests diff --git a/tests/Operations/Shuffle.hs b/tests/Operations/Shuffle.hs new file mode 100644 index 00000000..35e8a0bf --- /dev/null +++ b/tests/Operations/Shuffle.hs @@ -0,0 +1,72 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeApplications #-} + +module Operations.Permutation where + +import qualified DataFrame as D + +import DataFrame.Operations.Permutation (shuffle) +import System.Random (mkStdGen) +import Test.HUnit (Test (..), assertEqual) + +testDataFrame :: D.DataFrame +testDataFrame = + D.fromNamedColumns + [ ("numbers", D.fromList @Int [1 .. 26]) + ] + +-- Test that shuffling does anything at all +shuffleShuffles :: Test +shuffleShuffles = + let gen = mkStdGen 1234 + shuffled = shuffle gen testDataFrame + initialNumbers = D.extractNumericColumn "numbers" testDataFrame + shuffledNumbers = D.extractNumericColumn "numbers" shuffled + in TestCase + ( assertEqual + "Shuffled column unequal to initial column" + False + (initialNumbers == shuffledNumbers) + ) + +-- Test that un-shuffling restores the original dataframe +-- which is known to be sorted in this case +shuffleOnlyShuffles :: Test +shuffleOnlyShuffles = + let gen = mkStdGen 1234 + shuffled = shuffle gen testDataFrame + sortedShuffled = D.sortBy [D.Desc (D.col @Int "numbers")] shuffled + in TestCase + (assertEqual "sort recovers initial numbers" testDataFrame sortedShuffled) + +-- Test that shuffling isn't doing anything sneaky with summoning +-- random numbers somehow +shuffleSameSeedIsSameShuffle :: Test +shuffleSameSeedIsSameShuffle = + let gen = mkStdGen 1234 + shuffled1 = shuffle gen testDataFrame + shuffled2 = shuffle gen testDataFrame + in TestCase + (assertEqual "shuffle with same seed gives same result" shuffled1 shuffled2) + +-- Test that different seeds give different results +shuffleDifferentSeedIsDifferent :: Test +shuffleDifferentSeedIsDifferent = + let gen1 = mkStdGen 1234 + gen2 = mkStdGen 4321 + shuffled1 = shuffle gen1 testDataFrame + shuffled2 = shuffle gen2 testDataFrame + in TestCase + ( assertEqual + "shuffle with different seeds gives different results" + False + (shuffled1 == shuffled2) + ) + +tests :: [Test] +tests = + [ TestLabel "shuffleShuffles" shuffleShuffles + , TestLabel "shuffleOnlyShuffles" shuffleOnlyShuffles + , TestLabel "shuffleSameSeedIsSameShuffle" shuffleSameSeedIsSameShuffle + , TestLabel "shuffleDifferentSeedIsDifferent" shuffleDifferentSeedIsDifferent + ] From 184ae00cc3ceb60a70246d941059fed742a33567 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 26 Feb 2026 19:00:56 -0800 Subject: [PATCH 3/6] module rename, woops --- tests/Operations/Shuffle.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Operations/Shuffle.hs b/tests/Operations/Shuffle.hs index 35e8a0bf..1cac88b0 100644 --- a/tests/Operations/Shuffle.hs +++ b/tests/Operations/Shuffle.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeApplications #-} -module Operations.Permutation where +module Operations.Shuffle where import qualified DataFrame as D From 7f53743ffde5aedef66362cd1bc268cde3ca97a2 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 26 Feb 2026 20:46:09 -0800 Subject: [PATCH 4/6] Add new test preserving column names --- tests/Operations/Shuffle.hs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/Operations/Shuffle.hs b/tests/Operations/Shuffle.hs index 1cac88b0..aef35dec 100644 --- a/tests/Operations/Shuffle.hs +++ b/tests/Operations/Shuffle.hs @@ -29,6 +29,17 @@ shuffleShuffles = (initialNumbers == shuffledNumbers) ) +shufflePreservesColumnNames :: Test +shufflePreservesColumnNames = + let gen = mkStdGen 837 + shuffled = shuffle gen testDataFrame + in TestCase + ( assertEqual + "Column names are unchanged" + (D.columnNames shuffled) + (D.columnNames testDataFrame) + ) + -- Test that un-shuffling restores the original dataframe -- which is known to be sorted in this case shuffleOnlyShuffles :: Test From f8fdae4f7b866448ecbf59122c9ba3438d573b64 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 26 Feb 2026 20:46:20 -0800 Subject: [PATCH 5/6] Fix sort order test --- tests/Operations/Shuffle.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Operations/Shuffle.hs b/tests/Operations/Shuffle.hs index aef35dec..6e106e3f 100644 --- a/tests/Operations/Shuffle.hs +++ b/tests/Operations/Shuffle.hs @@ -46,7 +46,7 @@ shuffleOnlyShuffles :: Test shuffleOnlyShuffles = let gen = mkStdGen 1234 shuffled = shuffle gen testDataFrame - sortedShuffled = D.sortBy [D.Desc (D.col @Int "numbers")] shuffled + sortedShuffled = D.sortBy [D.Asc (D.col @Int "numbers")] shuffled in TestCase (assertEqual "sort recovers initial numbers" testDataFrame sortedShuffled) From b1def3dc7875f1968f713e2b5cd8988f55aadfa6 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 26 Feb 2026 20:46:38 -0800 Subject: [PATCH 6/6] Rename shuffleOnlyShuffles --- tests/Operations/Shuffle.hs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/Operations/Shuffle.hs b/tests/Operations/Shuffle.hs index 6e106e3f..f1c52f44 100644 --- a/tests/Operations/Shuffle.hs +++ b/tests/Operations/Shuffle.hs @@ -42,8 +42,8 @@ shufflePreservesColumnNames = -- Test that un-shuffling restores the original dataframe -- which is known to be sorted in this case -shuffleOnlyShuffles :: Test -shuffleOnlyShuffles = +shufflePreservesData :: Test +shufflePreservesData = let gen = mkStdGen 1234 shuffled = shuffle gen testDataFrame sortedShuffled = D.sortBy [D.Asc (D.col @Int "numbers")] shuffled @@ -77,7 +77,8 @@ shuffleDifferentSeedIsDifferent = tests :: [Test] tests = [ TestLabel "shuffleShuffles" shuffleShuffles - , TestLabel "shuffleOnlyShuffles" shuffleOnlyShuffles + , TestLabel "shufflePreservesData" shufflePreservesData + , TestLabel "shufflePreservesColumnNames" shufflePreservesColumnNames , TestLabel "shuffleSameSeedIsSameShuffle" shuffleSameSeedIsSameShuffle , TestLabel "shuffleDifferentSeedIsDifferent" shuffleDifferentSeedIsDifferent ]