44{-# LANGUAGE FlexibleInstances #-}
55{-# LANGUAGE GADTs #-}
66{-# LANGUAGE InstanceSigs #-}
7+ {-# LANGUAGE LambdaCase #-}
78{-# LANGUAGE MultiParamTypeClasses #-}
89{-# LANGUAGE OverloadedStrings #-}
910{-# LANGUAGE RankNTypes #-}
1213{-# LANGUAGE TypeApplications #-}
1314{-# LANGUAGE UndecidableInstances #-}
1415
15- module DataFrame.Functions where
16+ module DataFrame.Functions ( module DataFrame.Functions , module DataFrame.Operators ) where
1617
1718import DataFrame.Internal.Column
1819import DataFrame.Internal.DataFrame (
@@ -21,22 +22,26 @@ import DataFrame.Internal.DataFrame (
2122 )
2223import DataFrame.Internal.Expression hiding (normalize )
2324import DataFrame.Internal.Statistics
25+ import DataFrame.Operations.Core
2426
2527import Control.Applicative
2628import Control.Monad
2729import Control.Monad.IO.Class
2830import qualified Data.Char as Char
2931import Data.Function
3032import Data.Functor
33+ import Data.Int
3134import qualified Data.List as L
3235import qualified Data.Map as M
3336import qualified Data.Maybe as Maybe
3437import qualified Data.Text as T
3538import Data.Time
3639import qualified Data.Vector as V
3740import qualified Data.Vector.Unboxed as VU
41+ import Data.Word
3842import qualified DataFrame.IO.CSV as CSV
3943import qualified DataFrame.IO.Parquet as Parquet
44+ import DataFrame.IO.Parquet.Thrift
4045import DataFrame.Operators
4146import Debug.Trace (trace )
4247import Language.Haskell.TH
@@ -45,21 +50,6 @@ import Text.Regex.TDFA
4550import Prelude hiding (maximum , minimum )
4651import Prelude as P
4752
48- name :: (Show a ) => Expr a -> T. Text
49- name (Col n) = n
50- name other =
51- error $
52- " You must call `name` on a column reference. Not the expression: " ++ show other
53-
54- col :: (Columnable a ) => T. Text -> Expr a
55- col = Col
56-
57- ifThenElse :: (Columnable a ) => Expr Bool -> Expr a -> Expr a -> Expr a
58- ifThenElse = If
59-
60- lit :: (Columnable a ) => a -> Expr a
61- lit = Lit
62-
-- | Turn a plain one-argument function into a transformation between
-- expressions by wrapping it in a 'Unary' node.
--
-- The wrapped operation is tagged with a fixed name and carries no symbol.
lift :: (Columnable a, Columnable b) => (a -> b) -> Expr a -> Expr b
lift fn = Unary udf
  where
    udf =
        MkUnaryOp
            { unaryFn = fn
            , unaryName = " unaryUdf"
            , unarySymbol = Nothing
            }
@@ -450,12 +440,35 @@ declareColumnsFromCsvFile path = do
450440 (CSV. readSeparated (CSV. defaultReadOptions{CSV. numColumns = Just 100 }) path)
451441 declareColumns df
452442
-- | Generate column declarations for the Parquet file at the given path.
--
-- The file's metadata is fetched with 'Parquet.readMetadataFromPath' and an
-- empty 'DataFrame' is built from its schema via 'schemaToEmptyDataFrame';
-- that frame is then handed to 'declareColumns'. 'Parquet.readParquet' is
-- not called here.
declareColumnsFromParquetFile :: String -> DecsQ
declareColumnsFromParquetFile path = do
    metadata <- liftIO (Parquet.readMetadataFromPath path)
    declareColumns (schemaToEmptyDataFrame (schema metadata))
458448
-- | Build a 'DataFrame' with one empty column per leaf schema element.
--
-- Elements whose 'numChildren' is non-zero are dropped; every remaining
-- element is converted with 'schemaElemToColumn'.
schemaToEmptyDataFrame :: [SchemaElement] -> DataFrame
schemaToEmptyDataFrame schemaElems =
    fromNamedColumns
        [ schemaElemToColumn leaf
        | leaf <- schemaElems
        , numChildren leaf == 0
        ]
453+
-- | Pair a schema element's name with an empty column of the element's type.
--
-- The column comes from 'emptyColumnForType', so an element type without a
-- mapping there makes the second component fail when it is evaluated.
schemaElemToColumn :: SchemaElement -> (T.Text, Column)
schemaElemToColumn schemaElem =
    -- The argument is deliberately not called @elem@, which would shadow
    -- 'Prelude.elem'.
    (elementName schemaElem, emptyColumnForType (elementType schemaElem))
458+
-- | Build a zero-row 'Column' whose element type corresponds to the given
-- Parquet schema type.
--
-- Every column is created from an empty list, so only the element type
-- differs between cases. A type with no explicit mapping below aborts via
-- 'error'.
emptyColumnForType :: TType -> Column
emptyColumnForType = \case
    BOOL -> fromList @Bool []
    BYTE -> fromList @Word8 []
    I16 -> fromList @Int16 []
    I32 -> fromList @Int32 []
    I64 -> fromList @Int64 []
    -- NOTE(review): I96 is given the same Haskell type as I64; confirm this
    -- matches the element type the Parquet reader yields for such columns.
    I96 -> fromList @Int64 []
    FLOAT -> fromList @Float []
    DOUBLE -> fromList @Double []
    STRING -> fromList @T.Text []
    other -> error $ " Unsupported parquet type for column: " <> show other
471+
459472declareColumnsFromCsvWithOpts :: CSV. ReadOptions -> String -> DecsQ
460473declareColumnsFromCsvWithOpts opts path = do
461474 df <- liftIO (CSV. readSeparated opts path)
0 commit comments