Skip to content

Commit 169e0d2

Browse files
committed
feat: Add ability to read parquet with just the schema.
1 parent e55486b commit 169e0d2

3 files changed

Lines changed: 48 additions & 20 deletions

File tree

app/Synthesis.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import qualified DataFrame.Functions as F
99

1010
import Data.Char
1111
import DataFrame.DecisionTree
12-
import DataFrame.Operators
12+
import DataFrame.Operators hiding (name)
1313

1414
import System.Random
1515

src/DataFrame/Functions.hs

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
{-# LANGUAGE FlexibleInstances #-}
55
{-# LANGUAGE GADTs #-}
66
{-# LANGUAGE InstanceSigs #-}
7+
{-# LANGUAGE LambdaCase #-}
78
{-# LANGUAGE MultiParamTypeClasses #-}
89
{-# LANGUAGE OverloadedStrings #-}
910
{-# LANGUAGE RankNTypes #-}
@@ -12,7 +13,7 @@
1213
{-# LANGUAGE TypeApplications #-}
1314
{-# LANGUAGE UndecidableInstances #-}
1415

15-
module DataFrame.Functions where
16+
module DataFrame.Functions (module DataFrame.Functions, module DataFrame.Operators) where
1617

1718
import DataFrame.Internal.Column
1819
import DataFrame.Internal.DataFrame (
@@ -21,22 +22,26 @@ import DataFrame.Internal.DataFrame (
2122
)
2223
import DataFrame.Internal.Expression hiding (normalize)
2324
import DataFrame.Internal.Statistics
25+
import DataFrame.Operations.Core
2426

2527
import Control.Applicative
2628
import Control.Monad
2729
import Control.Monad.IO.Class
2830
import qualified Data.Char as Char
2931
import Data.Function
3032
import Data.Functor
33+
import Data.Int
3134
import qualified Data.List as L
3235
import qualified Data.Map as M
3336
import qualified Data.Maybe as Maybe
3437
import qualified Data.Text as T
3538
import Data.Time
3639
import qualified Data.Vector as V
3740
import qualified Data.Vector.Unboxed as VU
41+
import Data.Word
3842
import qualified DataFrame.IO.CSV as CSV
3943
import qualified DataFrame.IO.Parquet as Parquet
44+
import DataFrame.IO.Parquet.Thrift
4045
import DataFrame.Operators
4146
import Debug.Trace (trace)
4247
import Language.Haskell.TH
@@ -45,21 +50,6 @@ import Text.Regex.TDFA
4550
import Prelude hiding (maximum, minimum)
4651
import Prelude as P
4752

48-
name :: (Show a) => Expr a -> T.Text
49-
name (Col n) = n
50-
name other =
51-
error $
52-
"You must call `name` on a column reference. Not the expression: " ++ show other
53-
54-
col :: (Columnable a) => T.Text -> Expr a
55-
col = Col
56-
57-
ifThenElse :: (Columnable a) => Expr Bool -> Expr a -> Expr a -> Expr a
58-
ifThenElse = If
59-
60-
lit :: (Columnable a) => a -> Expr a
61-
lit = Lit
62-
6353
-- | Turn a plain function into an expression-level function by wrapping it
-- in a 'Unary' node. The operator is labelled @"unaryUdf"@ and carries no
-- symbol.
lift :: (Columnable a, Columnable b) => (a -> b) -> Expr a -> Expr b
lift f = Unary udfOp
  where
    udfOp =
        MkUnaryOp
            { unaryFn = f
            , unaryName = "unaryUdf"
            , unarySymbol = Nothing
            }
@@ -450,12 +440,35 @@ declareColumnsFromCsvFile path = do
450440
(CSV.readSeparated (CSV.defaultReadOptions{CSV.numColumns = Just 100}) path)
451441
declareColumns df
452442

453-
-- TODO: We don't have to read the whole file, we can just read the schema.
454443
-- | Declare columns for the Parquet file at @path@ using only its metadata.
--
-- The file's metadata is read with 'Parquet.readMetadataFromPath', its
-- schema is turned into an empty 'DataFrame' by 'schemaToEmptyDataFrame',
-- and that frame is handed to 'declareColumns'.
declareColumnsFromParquetFile :: String -> DecsQ
declareColumnsFromParquetFile path =
    liftIO (Parquet.readMetadataFromPath path)
        >>= declareColumns . schemaToEmptyDataFrame . schema
458448

449+
-- | Build a 'DataFrame' from a list of Parquet schema elements.
--
-- Only elements with no children (@numChildren == 0@) are kept; each one is
-- converted with 'schemaElemToColumn' and the resulting named columns are
-- passed to 'fromNamedColumns'.
schemaToEmptyDataFrame :: [SchemaElement] -> DataFrame
schemaToEmptyDataFrame =
    fromNamedColumns . map schemaElemToColumn . filter isLeaf
  where
    -- An element with zero children is a leaf of the schema tree.
    isLeaf se = numChildren se == 0
453+
454+
-- | Pair a schema element's name ('elementName') with the column produced
-- by 'emptyColumnForType' for its 'elementType'.
schemaElemToColumn :: SchemaElement -> (T.Text, Column)
schemaElemToColumn se =
    (elementName se, emptyColumnForType (elementType se))
458+
459+
-- | Produce a zero-length 'Column' whose element type corresponds to the
-- given 'TType'.
--
-- Any constructor not listed below raises an 'error' naming the type.
emptyColumnForType :: TType -> Column
emptyColumnForType BOOL = fromList @Bool []
emptyColumnForType BYTE = fromList @Word8 []
emptyColumnForType I16 = fromList @Int16 []
emptyColumnForType I32 = fromList @Int32 []
emptyColumnForType I64 = fromList @Int64 []
-- NOTE(review): I96 is represented as Int64 here; confirm this matches the
-- element type the full Parquet reader produces for such columns.
emptyColumnForType I96 = fromList @Int64 []
emptyColumnForType FLOAT = fromList @Float []
emptyColumnForType DOUBLE = fromList @Double []
emptyColumnForType STRING = fromList @T.Text []
emptyColumnForType other =
    error $ "Unsupported parquet type for column: " <> show other
471+
459472
declareColumnsFromCsvWithOpts :: CSV.ReadOptions -> String -> DecsQ
460473
declareColumnsFromCsvWithOpts opts path = do
461474
df <- liftIO (CSV.readSeparated opts path)

src/DataFrame/Operators.hs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import DataFrame.Internal.Expression (
1515
binaryPrecedence,
1616
binarySymbol
1717
),
18-
Expr (Binary, Lit),
18+
Expr (Binary, Col, If, Lit),
1919
NamedExpr,
2020
UExpr (UExpr),
2121
)
@@ -32,6 +32,21 @@ infixr 0 .=
3232
-- | Attach a name to an expression, yielding a 'NamedExpr': the given text
-- paired with the expression wrapped in 'UExpr'.
as :: (Columnable a) => Expr a -> T.Text -> NamedExpr
as expr alias = (alias, UExpr expr)
3434

35+
-- | Extract the column name from a column-reference expression.
--
-- Returns the text held by a 'Col' node. Any other expression raises an
-- 'error' that includes the shown expression.
name :: (Show a) => Expr a -> T.Text
name expr = case expr of
    Col n -> n
    other ->
        error $
            "You must call `name` on a column reference. Not the expression: "
                ++ show other
40+
41+
-- | Build a column-reference expression ('Col') from a column name.
col :: (Columnable a) => T.Text -> Expr a
col columnName = Col columnName
43+
44+
-- | Build a conditional expression ('If') from a boolean condition
-- expression and the two branch expressions.
ifThenElse :: (Columnable a) => Expr Bool -> Expr a -> Expr a -> Expr a
ifThenElse cond whenTrue whenFalse = If cond whenTrue whenFalse
46+
47+
-- | Embed a plain value as a literal expression ('Lit').
lit :: (Columnable a) => a -> Expr a
lit value = Lit value
49+
3550
-- | Infix form of 'as' with the arguments swapped: the name goes on the
-- left and the expression on the right.
(.=) :: (Columnable a) => T.Text -> Expr a -> NamedExpr
alias .= expr = as expr alias
3752

0 commit comments

Comments
 (0)