diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 78b44c97a4976..f3098aa5bb2b2 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -429,6 +429,7 @@ Below is a list of all the keywords in Spark SQL. |ATOMIC|non-reserved|non-reserved|non-reserved| |AUTHORIZATION|reserved|non-reserved|reserved| |BEGIN|non-reserved|non-reserved|non-reserved| +|BERNOULLI|non-reserved|non-reserved|non-reserved| |BETWEEN|non-reserved|non-reserved|reserved| |BIGINT|non-reserved|non-reserved|reserved| |BINARY|non-reserved|non-reserved|reserved| @@ -753,6 +754,7 @@ Below is a list of all the keywords in Spark SQL. |SUBSTR|non-reserved|non-reserved|non-reserved| |SUBSTRING|non-reserved|non-reserved|non-reserved| |SYNC|non-reserved|non-reserved|non-reserved| +|SYSTEM|non-reserved|non-reserved|reserved| |SYSTEM_TIME|non-reserved|non-reserved|non-reserved| |SYSTEM_VERSION|non-reserved|non-reserved|non-reserved| |TABLE|reserved|non-reserved|reserved| diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index 34de788c6d464..08027f2ca885b 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -148,6 +148,7 @@ AT: 'AT'; ATOMIC: 'ATOMIC'; AUTHORIZATION: 'AUTHORIZATION'; BEGIN: 'BEGIN'; +BERNOULLI: 'BERNOULLI'; BETWEEN: 'BETWEEN'; BIGINT: 'BIGINT'; BINARY: 'BINARY'; @@ -471,6 +472,7 @@ STRUCT: 'STRUCT' {incComplexTypeLevelCounter();}; SUBSTR: 'SUBSTR'; SUBSTRING: 'SUBSTRING'; SYNC: 'SYNC'; +SYSTEM: 'SYSTEM'; SYSTEM_TIME: 'SYSTEM_TIME'; SYSTEM_VERSION: 'SYSTEM_VERSION'; TABLE: 'TABLE'; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index a1de1234ef317..92276279082ef 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -1049,7 +1049,9 @@ joinCriteria ; sample - : TABLESAMPLE LEFT_PAREN sampleMethod? RIGHT_PAREN (REPEATABLE LEFT_PAREN seed=integerValue RIGHT_PAREN)? + : TABLESAMPLE (sampleType=(SYSTEM | BERNOULLI))? + LEFT_PAREN sampleMethod? RIGHT_PAREN + (REPEATABLE LEFT_PAREN seed=integerValue RIGHT_PAREN)? ; sampleMethod @@ -1917,6 +1919,7 @@ ansiNonReserved | AT | ATOMIC | BEGIN + | BERNOULLI | BETWEEN | BIGINT | BINARY @@ -2183,6 +2186,7 @@ ansiNonReserved | SUBSTR | SUBSTRING | SYNC + | SYSTEM | SYSTEM_TIME | SYSTEM_VERSION | TABLES @@ -2287,6 +2291,7 @@ nonReserved | ATOMIC | AUTHORIZATION | BEGIN + | BERNOULLI | BETWEEN | BIGINT | BINARY @@ -2599,6 +2604,7 @@ nonReserved | SUBSTR | SUBSTRING | SYNC + | SYSTEM | SYSTEM_TIME | SYSTEM_VERSION | TABLE diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownTableSample.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownTableSample.java index 3630feb4680ea..159ba9565cdd5 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownTableSample.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownTableSample.java @@ -36,4 +36,24 @@ boolean pushTableSample( double upperBound, boolean withReplacement, long seed); + + /** + * Pushes down SAMPLE to the data source with sample method awareness. 
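+   * <p>
+   * For illustration only, a sketch of how a file-based source might honor SYSTEM sampling by
+   * planning a matching fraction of its splits ({@code plannedSplitFraction} is a hypothetical
+   * field, not part of this API):
+   * <pre>{@code
+   * public boolean pushTableSample(
+   *     double lowerBound, double upperBound, boolean withReplacement, long seed,
+   *     boolean isSystemSampling) {
+   *   if (isSystemSampling) {
+   *     // Keep roughly this fraction of file splits at planning time.
+   *     this.plannedSplitFraction = upperBound - lowerBound;
+   *     return true;
+   *   }
+   *   // Fall back to the row-level (BERNOULLI) path.
+   *   return pushTableSample(lowerBound, upperBound, withReplacement, seed);
+   * }
+   * }</pre>
+   * <p>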
+   * Data sources can override this to distinguish SYSTEM (block) from BERNOULLI (row) sampling.
+   * The default implementation rejects SYSTEM sampling for backward compatibility and
+   * delegates BERNOULLI sampling to the 4-parameter overload.
+   */
+  default boolean pushTableSample(
+      double lowerBound,
+      double upperBound,
+      boolean withReplacement,
+      long seed,
+      boolean isSystemSampling) {
+    if (isSystemSampling) {
+      // If the data source hasn't overridden this method, it has not added support
+      // for SYSTEM sampling. Don't apply sample pushdown.
+      return false;
+    }
+    return pushTableSample(lowerBound, upperBound, withReplacement, seed);
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
index 60b952b285e13..7002f774b4494 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
@@ -530,7 +530,7 @@ object UnsupportedOperationChecker extends Logging {
         throwError("Sorting is not supported on streaming DataFrames/Datasets, unless it is on " +
           "aggregated DataFrame/Dataset in Complete output mode")
 
-      case Sample(_, _, _, _, child) if child.isStreaming =>
+      case Sample(_, _, _, _, child, _) if child.isStreaming =>
         throwError("Sampling is not supported on streaming DataFrames/Datasets")
 
       case Window(windowExpression, _, _, child, _) if child.isStreaming =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index ad4769ff8e31a..7ba90cb2e986d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1288,7 +1288,7 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper {
       limit.copy(child = p2.copy(projectList = newProjectList))
     case Project(l1, r @ Repartition(_, _, p @ Project(l2, _))) if isRenaming(l1, l2) =>
       r.copy(child = p.copy(projectList = buildCleanedProjectList(l1, p.projectList)))
-    case Project(l1, s @ Sample(_, _, _, _, p2 @ Project(l2, _))) if isRenaming(l1, l2) =>
+    case Project(l1, s @ Sample(_, _, _, _, p2 @ Project(l2, _), _)) if isRenaming(l1, l2) =>
       s.copy(child = p2.copy(projectList = buildCleanedProjectList(l1, p2.projectList)))
     case o => o
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index f82429bae7148..cabdc54373c9c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2250,10 +2250,14 @@ class AstBuilder extends DataTypeAstBuilder
    * - TABLESAMPLE(x ROWS): Sample the table down to the given number of rows.
    * - TABLESAMPLE(x PERCENT) [REPEATABLE (y)]: Sample the table down to the given percentage with
    *   seed 'y'. Note that percentages are defined as a number between 0 and 100.
+   * - TABLESAMPLE SYSTEM(x PERCENT): Sample by data-source-dependent blocks or file splits.
    * - TABLESAMPLE(BUCKET x OUT OF y) [REPEATABLE (z)]: Sample the table down to a 'x' divided by
    *   'y' fraction with seed 'z'.
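+   *
+   * For example (illustrative; SYSTEM sampling additionally requires a DSv2 source that
+   * supports sample pushdown):
+   * {{{
+   *   -- Row-level sampling: every row is read, then filtered with probability 0.1.
+   *   SELECT * FROM t TABLESAMPLE BERNOULLI (10 PERCENT);
+   *   -- Block-level sampling: the source may skip entire blocks or file splits.
+   *   SELECT * FROM t TABLESAMPLE SYSTEM (10 PERCENT);
+   * }}}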
*/ private def withSample(ctx: SampleContext, query: LogicalPlan): LogicalPlan = withOrigin(ctx) { + val isSystem = ctx.sampleType != null && + ctx.sampleType.getType == SqlBaseParser.SYSTEM + // Create a sampled plan if we need one. def sample(fraction: Double, seed: Long): Sample = { // The range of fraction accepted by Sample is [0, 1]. Because Hive's block sampling @@ -2263,13 +2267,18 @@ class AstBuilder extends DataTypeAstBuilder validate(fraction >= 0.0 - eps && fraction <= 1.0 + eps, s"Sampling fraction ($fraction) must be on interval [0, 1]", ctx) - Sample(0.0, fraction, withReplacement = false, seed, query) + val method = if (isSystem) SampleMethod.System else SampleMethod.Bernoulli + Sample(0.0, fraction, withReplacement = false, seed, query, method) } if (ctx.sampleMethod() == null) { throw QueryParsingErrors.emptyInputForTableSampleError(ctx) } + if (isSystem && ctx.seed != null) { + operationNotAllowed("TABLESAMPLE SYSTEM does not support REPEATABLE", ctx) + } + val seed = if (ctx.seed != null) { ctx.seed.getText.toLong } else { @@ -2278,6 +2287,9 @@ class AstBuilder extends DataTypeAstBuilder ctx.sampleMethod() match { case ctx: SampleByRowsContext => + if (isSystem) { + operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx) + } Limit(expression(ctx.expression), query) case ctx: SampleByPercentileContext => @@ -2289,6 +2301,9 @@ class AstBuilder extends DataTypeAstBuilder sample(sign * fraction / 100.0d, seed) case ctx: SampleByBytesContext => + if (isSystem) { + operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx) + } val bytesStr = ctx.bytes.getText if (bytesStr.matches("[0-9]+[bBkKmMgG]")) { throw QueryParsingErrors.tableSampleByBytesUnsupportedError("byteLengthLiteral", ctx) @@ -2297,6 +2312,9 @@ class AstBuilder extends DataTypeAstBuilder } case ctx: SampleByBucketContext if ctx.ON() != null => + if (isSystem) { + operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx) + } if (ctx.identifier != null) { throw QueryParsingErrors.tableSampleByBytesUnsupportedError( "BUCKET x OUT OF y ON colname", ctx) @@ -2306,6 +2324,9 @@ class AstBuilder extends DataTypeAstBuilder } case ctx: SampleByBucketContext => + if (isSystem) { + operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx) + } sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble, seed) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c18b7fcecc484..1455afd5f05e3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -1912,6 +1912,14 @@ object SubqueryAlias { } } +sealed trait SampleMethod extends Serializable +object SampleMethod { + /** Row-level sampling (BERNOULLI). Each row independently selected. No I/O savings. */ + case object Bernoulli extends SampleMethod + /** System-level sampling (SYSTEM). Entire partitions/splits included or skipped. */ + case object System extends SampleMethod +} + /** * Sample the dataset. * @@ -1921,13 +1929,15 @@ object SubqueryAlias { * @param withReplacement Whether to sample with replacement. 
* @param seed the random seed * @param child the LogicalPlan + * @param sampleMethod the sampling method (Bernoulli or System) */ case class Sample( lowerBound: Double, upperBound: Double, withReplacement: Boolean, seed: Long, - child: LogicalPlan) extends UnaryNode { + child: LogicalPlan, + sampleMethod: SampleMethod = SampleMethod.Bernoulli) extends UnaryNode { val eps = RandomSampler.roundingEpsilon val fraction = upperBound - lowerBound diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 4a38110b80f1c..49dbac38475d8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -870,6 +870,160 @@ class PlanParserSuite extends AnalysisTest { stop = 65)) } + test("SPARK-55978: TABLESAMPLE SYSTEM and BERNOULLI - basic parsing") { + val sql = "select * from t" + // SYSTEM produces SampleMethod.System + assertEqual( + s"$sql tablesample system (43 percent) as x", + Sample(0, .43d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.System).select(star())) + // BERNOULLI produces SampleMethod.Bernoulli + assertEqual( + s"$sql tablesample bernoulli (43 percent) as x", + Sample(0, .43d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.Bernoulli).select(star())) + // No qualifier defaults to Bernoulli (backward compat) + assertEqual( + s"$sql tablesample(43 percent) as x", + Sample(0, .43d, withReplacement = false, 10L, + table("t").as("x")).select(star())) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM - case insensitivity") { + val sql = "select * from t" + // Keywords are case-insensitive + assertEqual( + s"$sql TABLESAMPLE SYSTEM (43 PERCENT) as x", + Sample(0, .43d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.System).select(star())) + assertEqual( + s"$sql TabLeSaMpLe SyStEm (43 PeRcEnT) as x", + Sample(0, .43d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.System).select(star())) + assertEqual( + s"$sql TABLESAMPLE BERNOULLI (43 PERCENT) as x", + Sample(0, .43d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.Bernoulli).select(star())) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM - boundary fractions") { + val sql = "select * from t" + // 0 PERCENT + assertEqual( + s"$sql tablesample system (0 percent) as x", + Sample(0, 0d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.System).select(star())) + // 100 PERCENT + assertEqual( + s"$sql tablesample system (100 percent) as x", + Sample(0, 1d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.System).select(star())) + // Fractional percent + assertEqual( + s"$sql tablesample system (0.1 percent) as x", + Sample(0, 0.001d, withReplacement = false, 10L, + table("t").as("x"), SampleMethod.System).select(star())) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM - unsupported sample methods") { + val sql = "select * from t" + // SYSTEM + ROWS -> error + checkError( + exception = parseException(s"$sql tablesample system (100 rows)"), + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map( + "message" -> "TABLESAMPLE SYSTEM only supports PERCENT sampling"), + context = ExpectedContext( + fragment = "tablesample system (100 rows)", + start = 16, + stop = 44)) + // SYSTEM + BYTES -> error + checkError( + exception = parseException(s"$sql tablesample system 
(300M)"), + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map( + "message" -> "TABLESAMPLE SYSTEM only supports PERCENT sampling"), + context = ExpectedContext( + fragment = "tablesample system (300M)", + start = 16, + stop = 40)) + // SYSTEM + BUCKET -> error + checkError( + exception = parseException(s"$sql tablesample system (bucket 4 out of 10)"), + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map( + "message" -> "TABLESAMPLE SYSTEM only supports PERCENT sampling"), + context = ExpectedContext( + fragment = "tablesample system (bucket 4 out of 10)", + start = 16, + stop = 54)) + } + + test("SPARK-55978: TABLESAMPLE BERNOULLI - REPEATABLE is supported") { + assertEqual( + "select * from t tablesample bernoulli (43 percent) repeatable (123)", + Sample(0, .43d, withReplacement = false, 123L, + table("t"), SampleMethod.Bernoulli).select(star())) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM - REPEATABLE not supported") { + val sql = "select * from t" + checkError( + exception = parseException(s"$sql tablesample system (43 percent) repeatable (123)"), + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map( + "message" -> "TABLESAMPLE SYSTEM does not support REPEATABLE"), + context = ExpectedContext( + fragment = "tablesample system (43 percent) repeatable (123)", + start = 16, + stop = 63)) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM - fraction out of range") { + val sql = "select * from t" + // > 100 PERCENT + checkError( + exception = parseException(s"$sql tablesample system (150 percent) as x"), + condition = "_LEGACY_ERROR_TEMP_0064", + parameters = Map("msg" -> "Sampling fraction (1.5) must be on interval [0, 1]"), + context = ExpectedContext( + fragment = "tablesample system (150 percent)", + start = 16, + stop = 47)) + // Negative PERCENT + checkError( + exception = parseException(s"$sql tablesample system (-10 percent) as x"), + condition = "_LEGACY_ERROR_TEMP_0064", + parameters = Map("msg" -> "Sampling fraction (-0.1) must be on interval [0, 1]"), + context = ExpectedContext( + fragment = "tablesample system (-10 percent)", + start = 16, + stop = 47)) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM and BERNOULLI as identifiers") { + // SYSTEM usable as column name (nonReserved) + assertEqual("SELECT system FROM t", + table("t").select($"system")) + // BERNOULLI usable as column name + assertEqual("SELECT bernoulli FROM t", + table("t").select($"bernoulli")) + // Usable as table alias + assertEqual("SELECT * FROM t system", + table("t").as("system").select(star())) + assertEqual("SELECT * FROM t bernoulli", + table("t").as("bernoulli").select(star())) + } + + test("SPARK-55978: TABLESAMPLE SYSTEM - subquery and join contexts") { + // SYSTEM sample in subquery + assertEqual( + "SELECT * FROM (SELECT * FROM t TABLESAMPLE SYSTEM (50 PERCENT)) sub", + Sample(0, .5d, withReplacement = false, 10L, + table("t"), SampleMethod.System) + .select(star()).as("sub").select(star())) + } + test("sub-query") { val plan = table("t0").select($"id") assertEqual("select id from (t0)", plan) diff --git a/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala b/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala index 903e7edac2230..e98cd843b2fc5 100644 --- a/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala +++ 
b/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala @@ -209,7 +209,7 @@ class SparkConnectDatabaseMetaDataSuite extends ConnectFunSuite with RemoteSpark withConnection { conn => val metadata = conn.getMetaData // scalastyle:off line.size.limit - assert(metadata.getSQLKeywords === "ADD,AFTER,AGGREGATE,ALWAYS,ANALYZE,ANTI,ANY_VALUE,ARCHIVE,ASC,BINDING,BUCKET,BUCKETS,BYTE,CACHE,CASCADE,CATALOG,CATALOGS,CHANGE,CHANGES,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATION,COLLECTION,COLUMNS,COMMENT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONTAINS,CONTINUE,COST,DATA,DATABASE,DATABASES,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAYOFYEAR,DAYS,DBPROPERTIES,DEFINED,DEFINER,DELAY,DELIMITED,DESC,DFS,DIRECTORIES,DIRECTORY,DISTRIBUTE,DIV,DO,ELSEIF,ENFORCED,ESCAPED,EVOLUTION,EXCHANGE,EXCLUDE,EXCLUSIVE,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,FIELDS,FILEFORMAT,FIRST,FLOW,FOLLOWING,FORMAT,FORMATTED,FOUND,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,HANDLER,HOURS,IDENTIFIED,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INPATH,INPUT,INPUTFORMAT,INVOKER,ITEMS,ITERATE,JSON,KEY,KEYS,LAST,LAZY,LEAVE,LEVEL,LIMIT,LINES,LIST,LOAD,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MEASURE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTES,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NORELY,NULLS,OFFSET,OPTION,OPTIONS,OUTPUTFORMAT,OVERWRITE,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,PRECEDING,PRINCIPALS,PROCEDURES,PROPERTIES,PURGE,QUARTER,QUERY,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,REDUCE,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,ROLE,ROLES,SCHEMA,SCHEMAS,SECONDS,SECURITY,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SETS,SHORT,SHOW,SINGLE,SKEWED,SORT,SORTED,SOURCE,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLES,TARGET,TBLPROPERTIES,TERMINATED,TIMEDIFF,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TOUCH,TRANSACTION,TRANSACTIONS,TRANSFORM,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNLOCK,UNPIVOT,UNSET,UNTIL,USE,VAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHILE,X,YEARS,ZONE") + assert(metadata.getSQLKeywords === 
"ADD,AFTER,AGGREGATE,ALWAYS,ANALYZE,ANTI,ANY_VALUE,ARCHIVE,ASC,BERNOULLI,BINDING,BUCKET,BUCKETS,BYTE,CACHE,CASCADE,CATALOG,CATALOGS,CHANGE,CHANGES,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATION,COLLECTION,COLUMNS,COMMENT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONTAINS,CONTINUE,COST,DATA,DATABASE,DATABASES,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAYOFYEAR,DAYS,DBPROPERTIES,DEFINED,DEFINER,DELAY,DELIMITED,DESC,DFS,DIRECTORIES,DIRECTORY,DISTRIBUTE,DIV,DO,ELSEIF,ENFORCED,ESCAPED,EVOLUTION,EXCHANGE,EXCLUDE,EXCLUSIVE,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,FIELDS,FILEFORMAT,FIRST,FLOW,FOLLOWING,FORMAT,FORMATTED,FOUND,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,HANDLER,HOURS,IDENTIFIED,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INPATH,INPUT,INPUTFORMAT,INVOKER,ITEMS,ITERATE,JSON,KEY,KEYS,LAST,LAZY,LEAVE,LEVEL,LIMIT,LINES,LIST,LOAD,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MEASURE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTES,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NORELY,NULLS,OFFSET,OPTION,OPTIONS,OUTPUTFORMAT,OVERWRITE,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,PRECEDING,PRINCIPALS,PROCEDURES,PROPERTIES,PURGE,QUARTER,QUERY,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,REDUCE,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,ROLE,ROLES,SCHEMA,SCHEMAS,SECONDS,SECURITY,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SETS,SHORT,SHOW,SINGLE,SKEWED,SORT,SORTED,SOURCE,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLES,TARGET,TBLPROPERTIES,TERMINATED,TIMEDIFF,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TOUCH,TRANSACTION,TRANSACTIONS,TRANSFORM,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNLOCK,UNPIVOT,UNSET,UNTIL,USE,VAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHILE,X,YEARS,ZONE") // scalastyle:on line.size.limit } } diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain index f94e0a850e403..9bcbf88135399 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain @@ -1,2 +1,2 @@ -Sample 0.0, 0.43, false, 9890823 +Sample 0.0, 0.43, false, 9890823, Bernoulli +- LocalRelation , [id#0L, a#0, b#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain index 340c25ab6d017..5af5314e48f90 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain @@ -1,2 +1,2 @@ -Sample 0.0, 0.23, true, 898 +Sample 0.0, 0.23, true, 898, Bernoulli +- LocalRelation , [id#0L, a#0, b#0] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 5c393b1db227e..f5ded4bec6503 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ 
-1037,7 +1037,18 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         execution.FilterExec(f.typedCondition(f.deserializer), planLater(f.child)) :: Nil
       case e @ logical.Expand(_, _, child) =>
         execution.ExpandExec(e.projections, e.output, planLater(child)) :: Nil
-      case logical.Sample(lb, ub, withReplacement, seed, child) =>
+      case logical.Sample(lb, ub, withReplacement, seed, child, sampleMethod) =>
+        if (sampleMethod == logical.SampleMethod.System) {
+          // Defensive check: this TABLESAMPLE SYSTEM node was not pushed down by
+          // V2ScanRelationPushDown earlier, meaning the data source is either not DSv2
+          // or cannot support block/split sampling. Rather than silently falling through
+          // to row-based sampling, error out.
+          throw new AnalysisException(
+            errorClass = "_LEGACY_ERROR_TEMP_0035",
+            messageParameters = Map("message" ->
+              ("TABLESAMPLE SYSTEM (block sampling) must be pushed down to a DSv2 data source. " +
+              "It cannot be executed as row-level sampling.")))
+        }
         execution.SampleExec(lb, ub, withReplacement, seed, planLater(child)) :: Nil
       case logical.LocalRelation(output, data, _, stream) =>
         LocalTableScanExec(output, data, stream) :: Nil
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala
index 4a87a50c6576e..019b3e21853c4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2
 import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, ExpressionSet, NamedExpression, PythonUDF, SchemaPruning, SubqueryExpression}
+import org.apache.spark.sql.catalyst.plans.logical.SampleMethod
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
 import org.apache.spark.sql.catalyst.util.CharVarcharUtils
 import org.apache.spark.sql.connector.expressions.{IdentityTransform, SortOrder, Transform}
@@ -205,7 +206,8 @@ object PushDownUtils {
     scanBuilder match {
       case s: SupportsPushDownTableSample =>
         s.pushTableSample(
-          sample.lowerBound, sample.upperBound, sample.withReplacement, sample.seed)
+          sample.lowerBound, sample.upperBound, sample.withReplacement, sample.seed,
+          sample.sampleMethod == SampleMethod.System)
       case _ => false
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableSampleInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableSampleInfo.scala
index cb4fb9eb0809a..441ed28c813c0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableSampleInfo.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableSampleInfo.scala
@@ -17,8 +17,11 @@
 
 package org.apache.spark.sql.execution.datasources.v2
 
+import org.apache.spark.sql.catalyst.plans.logical.SampleMethod
+
 case class TableSampleInfo(
     lowerBound: Double,
     upperBound: Double,
     withReplacement: Boolean,
-    seed: Long)
+    seed: Long,
+    sampleMethod: SampleMethod = SampleMethod.Bernoulli)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
index 9a25752ccadac..de23aaaa5e290 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
@@ -23,11 +23,12 @@ import scala.collection.mutable
 
 import org.apache.spark.SparkException
 import org.apache.spark.internal.LogKeys.{AGGREGATE_FUNCTIONS, COLUMN_NAMES, GROUP_BY_EXPRS, JOIN_CONDITION, JOIN_TYPE, POST_SCAN_FILTERS, PUSHED_FILTERS, RELATION_NAME, RELATION_OUTPUT}
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.{aggregate, Alias, And, Attribute, AttributeMap, AttributeReference, AttributeSet, Cast, Expression, ExprId, IntegerLiteral, Literal, NamedExpression, PredicateHelper, ProjectionOverSchema, SortOrder, SubqueryExpression}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.optimizer.CollapseProject
 import org.apache.spark.sql.catalyst.planning.{PhysicalOperation, ScanOperation}
-import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Join, LeafNode, Limit, LimitAndOffset, LocalLimit, LogicalPlan, Offset, OffsetAndLimit, Project, Sample, Sort}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Join, LeafNode, Limit, LimitAndOffset, LocalLimit, LogicalPlan, Offset, OffsetAndLimit, Project, Sample, SampleMethod, Sort, SubqueryAlias}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
 import org.apache.spark.sql.connector.expressions.{SortOrder => V2SortOrder}
@@ -811,7 +812,44 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
     }
   }
 
+  private def findScanBuilderHolder(plan: LogicalPlan): Option[ScanBuilderHolder] = {
+    plan match {
+      case s: ScanBuilderHolder => Some(s)
+      case s: SubqueryAlias => findScanBuilderHolder(s.child)
+      case p: Project => findScanBuilderHolder(p.child)
+      case f: Filter => findScanBuilderHolder(f.child)
+      case _ => None
+    }
+  }
+
   def pushDownSample(plan: LogicalPlan): LogicalPlan = plan.transform {
+    case sample: Sample if sample.sampleMethod == SampleMethod.System =>
+      findScanBuilderHolder(sample.child) match {
+        case Some(sHolder) =>
+          val tableSample = TableSampleInfo(
+            sample.lowerBound,
+            sample.upperBound,
+            sample.withReplacement,
+            sample.seed,
+            sampleMethod = sample.sampleMethod)
+          val pushed = PushDownUtils.pushTableSample(sHolder.builder, tableSample)
+          if (pushed) {
+            sHolder.pushedSample = Some(tableSample)
+            sample.child
+          } else {
+            throw new AnalysisException(
+              errorClass = "_LEGACY_ERROR_TEMP_0035",
+              messageParameters = Map("message" ->
+                ("TABLESAMPLE SYSTEM requires a data source that supports " +
+                "table sample pushdown (SupportsPushDownTableSample).")))
+          }
+        case None =>
+          throw new AnalysisException(
+            errorClass = "_LEGACY_ERROR_TEMP_0035",
+            messageParameters = Map("message" ->
+              "TABLESAMPLE SYSTEM is only supported for DSv2 data source scan relations."))
+      }
+
     case sample: Sample => sample.child match {
       case PhysicalOperation(_, Nil, sHolder: ScanBuilderHolder) =>
         val tableSample = TableSampleInfo(
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out
index 776e6291b9190..dfefebf9843ad 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out @@ -1979,7 +1979,7 @@ org.apache.spark.sql.catalyst.parser.ParseException table t |> tablesample (100 percent) repeatable (0) -- !query analysis -Sample 0.0, 1.0, false, 0 +Sample 0.0, 1.0, false, 0, Bernoulli +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -1998,7 +1998,7 @@ GlobalLimit 2 table t |> tablesample (bucket 1 out of 1) repeatable (0) -- !query analysis -Sample 0.0, 1.0, false, 0 +Sample 0.0, 1.0, false, 0, Bernoulli +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -2009,10 +2009,10 @@ table t |> tablesample (5 rows) repeatable (0) |> tablesample (bucket 1 out of 1) repeatable (0) -- !query analysis -Sample 0.0, 1.0, false, 0 +Sample 0.0, 1.0, false, 0, Bernoulli +- GlobalLimit 5 +- LocalLimit 5 - +- Sample 0.0, 1.0, false, 0 + +- Sample 0.0, 1.0, false, 0, Bernoulli +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[x#x,y#x] csv @@ -2435,7 +2435,7 @@ Project [a#x] : +- Project [a#x] : +- SubqueryAlias grouping : +- LocalRelation [a#x] - +- Sample 0.0, 1.0, false, 0 + +- Sample 0.0, 1.0, false, 0, Bernoulli +- SubqueryAlias jt2 +- SubqueryAlias join_test_t2 +- View (`join_test_t2`, [a#x]) @@ -2458,7 +2458,7 @@ Project [a#x] : +- SubqueryAlias grouping : +- LocalRelation [a#x] +- SubqueryAlias jt2 - +- Sample 0.0, 1.0, false, 0 + +- Sample 0.0, 1.0, false, 0, Bernoulli +- Project [1 AS a#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out index b7491bfc93dba..576f9402bf76e 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out @@ -24,6 +24,7 @@ AT false ATOMIC false AUTHORIZATION true BEGIN false +BERNOULLI false BETWEEN false BIGINT false BINARY false @@ -346,6 +347,7 @@ STRUCT false SUBSTR false SUBSTRING false SYNC false +SYSTEM false SYSTEM_TIME false SYSTEM_VERSION false TABLE true diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index ee43431e5efff..0bfa1dcb13e0a 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -24,6 +24,7 @@ AT false ATOMIC false AUTHORIZATION false BEGIN false +BERNOULLI false BETWEEN false BIGINT false BINARY false @@ -346,6 +347,7 @@ STRUCT false SUBSTR false SUBSTRING false SYNC false +SYSTEM false SYSTEM_TIME false SYSTEM_VERSION false TABLE false diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out index ee43431e5efff..0bfa1dcb13e0a 100644 --- a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out @@ -24,6 +24,7 @@ AT false ATOMIC false AUTHORIZATION false BEGIN false +BERNOULLI false BETWEEN false BIGINT false BINARY false @@ -346,6 +347,7 @@ STRUCT false SUBSTR false SUBSTRING false SYNC false +SYSTEM false SYSTEM_TIME false SYSTEM_VERSION false TABLE false diff --git 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index e864b026b7cd6..bb9fc1d91fd76 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { val sessionHandle = client.openSession(user, "") val infoValue = client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS) // scalastyle:off line.size.limit - assert(infoValue.getStringValue == "ADD,AFTER,AGGREGATE,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,ASENSITIVE,AT,ATOMIC,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHANGES,CHAR,CHARACTER,CHECK,CLEAR,CLOSE,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONDITION,CONSTRAINT,CONTAINS,CONTINUE,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,CURSOR,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELAY,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,ELSEIF,END,ENFORCED,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXCLUSIVE,EXECUTE,EXISTS,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FLOW,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FOUND,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,GLOBAL,GRANT,GROUP,GROUPING,HANDLER,HAVING,HOUR,HOURS,IDENTIFIED,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSENSITIVE,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,JSON,KEY,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LEVEL,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MAX,MEASURE,MERGE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NEXT,NO,NONE,NORELY,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPEN,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROCEDURE,PROCEDURES,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READ,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,RECURSIVE,REDUCE,REFERENCES,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,SQLEXCEPTION,SQLSTATE,START,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT
,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUE,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHEN,WHERE,WHILE,WINDOW,WITH,WITHIN,WITHOUT,X,YEAR,YEARS,ZONE") + assert(infoValue.getStringValue == "ADD,AFTER,AGGREGATE,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,ASENSITIVE,AT,ATOMIC,AUTHORIZATION,BEGIN,BERNOULLI,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHANGES,CHAR,CHARACTER,CHECK,CLEAR,CLOSE,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONDITION,CONSTRAINT,CONTAINS,CONTINUE,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,CURSOR,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELAY,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,ELSEIF,END,ENFORCED,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXCLUSIVE,EXECUTE,EXISTS,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FLOW,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FOUND,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,GLOBAL,GRANT,GROUP,GROUPING,HANDLER,HAVING,HOUR,HOURS,IDENTIFIED,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSENSITIVE,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,JSON,KEY,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LEVEL,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MAX,MEASURE,MERGE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NEXT,NO,NONE,NORELY,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPEN,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROCEDURE,PROCEDURES,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READ,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,RECURSIVE,REDUCE,REFERENCES,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,SQLEXCEPTION,SQLSTATE,START,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUE,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHEN,WHERE,WHILE,WINDOW,WITH,WITHIN,WITHOUT,X,YEAR,YEARS,ZONE") // scalastyle:on line.size.limit } }