Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dataframe.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ library
hashable >= 1.2 && < 2,
process ^>= 1.6,
snappy-hs ^>= 0.1,
random >= 1.3 && < 2,
random >= 1.2 && < 1.3,
random-shuffle >= 0.0.4 && < 1,
regex-tdfa >= 1.3.0 && < 2,
scientific >=0.3.1 && <0.4,
template-haskell >= 2.0 && < 3,
Expand Down Expand Up @@ -198,6 +199,7 @@ test-suite tests
Operations.Join,
Operations.Merge,
Operations.ReadCsv,
Operations.Shuffle,
Operations.Sort,
Operations.Subset,
Operations.Statistics,
Expand All @@ -209,7 +211,6 @@ test-suite tests
directory >= 1.3.0.0 && < 2,
HUnit ^>= 1.6,
QuickCheck >= 2 && < 3,
random >= 1 && < 2,
random-shuffle >= 0.0.4 && < 1,
random >= 1 && < 2,
text >= 2.0 && < 3,
Expand Down
8 changes: 4 additions & 4 deletions examples/examples.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ executable chipotle
hashable >= 1.2 && < 2,
process ^>= 1.6,
snappy-hs ^>= 0.1,
random >= 1.3 && < 2,
random >= 1.2 && < 1.3,
regex-tdfa >= 1.3.0 && < 2,
scientific >=0.3.1 && <0.4,
template-haskell >= 2.0 && < 3,
Expand Down Expand Up @@ -172,7 +172,7 @@ executable california_housing
hasktorch >= 0.2.1.6 && < 0.3,
process ^>= 1.6,
snappy-hs ^>= 0.1,
random >= 1.3 && < 2,
random >= 1.2 && < 1.3,
regex-tdfa >= 1.3.0 && < 2,
scientific >=0.3.1 && <0.4,
template-haskell >= 2.0 && < 3,
Expand Down Expand Up @@ -259,7 +259,7 @@ executable one_billion_row_challenge
hasktorch >= 0.2.1.6 && < 0.3,
process ^>= 1.6,
snappy-hs ^>= 0.1,
random >= 1.3 && < 2,
random >= 1.2 && < 1.3,
regex-tdfa >= 1.3.0 && < 2,
scientific >=0.3.1 && <0.4,
template-haskell >= 2.0 && < 3,
Expand Down Expand Up @@ -346,7 +346,7 @@ executable iris
hasktorch >= 0.2.1.6 && < 0.3,
process ^>= 1.6,
snappy-hs ^>= 0.1,
random >= 1.3 && < 2,
random >= 1.2 && < 1.3,
regex-tdfa >= 1.3.0 && < 2,
scientific >=0.3.1 && <0.4,
template-haskell >= 2.0 && < 3,
Expand Down
2 changes: 0 additions & 2 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@

hsPkgs = pkgs.haskellPackages.extend (self: super: {
dataframe = self.callCabal2nix "dataframe" ./. { };
random = pkgs.haskellPackages.callHackage "random" "1.3.1" { };
time-compat = pkgs.haskell.lib.dontCheck super.time-compat;
});
in
{
Expand Down
3 changes: 2 additions & 1 deletion src/DataFrame/Operations/Permutation.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import DataFrame.Internal.Expression
import DataFrame.Internal.Row
import DataFrame.Operations.Core
import System.Random
import System.Random.Shuffle (shuffle')

-- | Sort order taken as a parameter by the 'sortBy' function.
data SortOrder where
Expand Down Expand Up @@ -75,4 +76,4 @@ shuffle pureGen df =
df{columns = V.map (atIndicesStable indexes) (columns df)}

shuffledIndices :: (RandomGen g) => g -> Int -> VU.Vector Int
shuffledIndices pureGen k = VU.fromList (fst (uniformShuffleList [0 .. (k - 1)] pureGen))
shuffledIndices pureGen k = VU.fromList (shuffle' [0 .. (k - 1)] k pureGen)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty unclear to me how to test this. I thought about a shuffle/un-shuffle identity test, but unshuffle didn't get me very far in search results 😅

Any thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a couple of things I would test then.

  1. Test that shuffling does nothing other than shuffle: sort both the shuffled and the unshuffled data and check that the results are equal. This ensures that shuffling only permutes the indices.

  2. Check that shuffling with equivalent seeds results in the same shuffle.

That's about all I can think of

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. No need to round trip. Checking that shuffling preserves length (even when there are duplicates) is probably important. Plus that different seeds are different shuffle orders.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, on second thought, the intermediate list allocation is wasteful. I'll add it as a GSoC task to implement Fisher–Yates here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That task shouldn't be GSoC-only; it should be open to anyone! Also, mwc-random does that, I think.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not? It seems simple and self contained enough since it's reading the algorithm and implementing it through.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there could be some task pipeline for people who are not interested only in GSOC, but more interested generally in contributing!

2 changes: 2 additions & 0 deletions tests/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import qualified Operations.InsertColumn
import qualified Operations.Join
import qualified Operations.Merge
import qualified Operations.ReadCsv
import qualified Operations.Shuffle
import qualified Operations.Sort
import qualified Operations.Statistics
import qualified Operations.Subset
Expand Down Expand Up @@ -5120,6 +5121,7 @@ tests =
++ Operations.Join.tests
++ Operations.Merge.tests
++ Operations.ReadCsv.tests
++ Operations.Shuffle.tests
++ Operations.Sort.tests
++ Operations.Statistics.tests
++ Operations.Take.tests
Expand Down
84 changes: 84 additions & 0 deletions tests/Operations/Shuffle.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeApplications #-}

module Operations.Shuffle where

import qualified DataFrame as D

import DataFrame.Operations.Permutation (shuffle)
import System.Random (mkStdGen)
import Test.HUnit (Test (..), assertEqual)

-- Fixture shared by every shuffle test: one column called "numbers" that
-- holds the Ints 1 through 26 in ascending order.
testDataFrame :: D.DataFrame
testDataFrame = D.fromNamedColumns [("numbers", numbers)]
  where
    numbers = D.fromList @Int [1 .. 26]

-- Sanity check: after shuffling with a fixed seed, the "numbers" column must
-- no longer compare equal to the "numbers" column of the fixture.
shuffleShuffles :: Test
shuffleShuffles =
    TestCase $
        assertEqual
            "Shuffled column unequal to initial column"
            False
            (numbersOf testDataFrame == numbersOf permuted)
  where
    permuted = shuffle (mkStdGen 1234) testDataFrame
    numbersOf df = D.extractNumericColumn "numbers" df

-- Test that shuffling leaves the column names untouched.
--
-- 'assertEqual' takes the expected value before the actual one, so the
-- unshuffled fixture supplies the expectation and the shuffled frame is the
-- value under test. This keeps the "expected" / "but got" lines of a failure
-- report pointing at the right frames.
shufflePreservesColumnNames :: Test
shufflePreservesColumnNames =
    let gen = mkStdGen 837
        shuffled = shuffle gen testDataFrame
     in TestCase
            ( assertEqual
                "Column names are unchanged"
                (D.columnNames testDataFrame)
                (D.columnNames shuffled)
            )

-- Sorting the shuffled frame by "numbers" in ascending order must give back
-- the fixture, whose rows are already in ascending order. This checks that
-- shuffling only reorders rows and neither drops nor alters them.
shufflePreservesData :: Test
shufflePreservesData =
    TestCase $
        assertEqual "sort recovers initial numbers" testDataFrame restored
  where
    permuted = shuffle (mkStdGen 1234) testDataFrame
    restored = D.sortBy [D.Asc (D.col @Int "numbers")] permuted

-- Shuffling must depend only on the generator it is handed: running it twice
-- with the very same generator value has to produce equal frames.
shuffleSameSeedIsSameShuffle :: Test
shuffleSameSeedIsSameShuffle =
    TestCase $
        assertEqual "shuffle with same seed gives same result" firstRun secondRun
  where
    seed = mkStdGen 1234
    firstRun = shuffle seed testDataFrame
    secondRun = shuffle seed testDataFrame

-- Generators built from two different seeds (1234 and 4321) are expected to
-- yield frames that do not compare equal.
shuffleDifferentSeedIsDifferent :: Test
shuffleDifferentSeedIsDifferent =
    TestCase $
        assertEqual
            "shuffle with different seeds gives different results"
            False
            (shuffledWith 1234 == shuffledWith 4321)
  where
    shuffledWith seed = shuffle (mkStdGen seed) testDataFrame

-- All shuffle tests, each labelled with the name of its definition.
-- Name and test are kept side by side so the two cannot drift apart.
tests :: [Test]
tests =
    map
        (uncurry TestLabel)
        [ ("shuffleShuffles", shuffleShuffles)
        , ("shufflePreservesData", shufflePreservesData)
        , ("shufflePreservesColumnNames", shufflePreservesColumnNames)
        , ("shuffleSameSeedIsSameShuffle", shuffleSameSeedIsSameShuffle)
        , ("shuffleDifferentSeedIsDifferent", shuffleDifferentSeedIsDifferent)
        ]