Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions itests/src/test/resources/testconfiguration.properties
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ minitez.query.files=\
orc_merge12.q,\
orc_vectorization_ppd.q,\
partition_default_name_change_numeric.q,\
str_to_map_utf8.q,\
tez_complextype_with_null.q,\
tez_tag.q,\
tez_union_udtf.q,\
Expand Down Expand Up @@ -180,6 +181,7 @@ minillap.query.files=\
skip_header_footer_aggr.q,\
skip_header_footer_proj.q,\
str_to_map.q,\
str_to_map_utf8.q,\
table_nonprintable.q,\
temp_table_add_part_with_loc.q,\
temp_table_add_partition_with_location.q,\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
Expand Down Expand Up @@ -465,7 +466,7 @@ private void assignRowColumn(
{
if (object instanceof String) {
String string = (String) object;
byte[] bytes = string.getBytes();
byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
((BytesColumnVector) columnVector).setVal(
batchIndex, bytes, 0, bytes.length);
} else {
Expand Down
45 changes: 45 additions & 0 deletions ql/src/test/queries/clientpositive/str_to_map_utf8.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
-- HIVE-28728: STR_TO_MAP() must preserve UTF-8 in vectorized execution when the JVM default charset is not UTF-8.
-- The Tez launch options below force file.encoding=US-ASCII in the Tez AM and Tez task JVMs.
-- To reproduce the same condition when testing with LLAP, set it at the driver level:
--   -Dmaven.test.jvm.args="-Dfile.encoding=US-ASCII"

SET tez.am.launch.cmd-opts=-Dfile.encoding=US-ASCII;
SET hive.tez.java.opts=-Dfile.encoding=US-ASCII;
-- Run the queries below through the vectorized code path, which is the path under test.
SET hive.vectorized.execution.enabled=true;
-- NOTE(review): presumably disables fetch-task conversion so that simple SELECTs run as
-- real tasks under the options above instead of being answered directly - confirm.
SET hive.fetch.task.conversion=none;

-- Source fixture. The name column and the multi column together cover: plain ASCII (100, 300),
-- a NULL multi value (200), a non-ASCII value under an ASCII key (400), non-ASCII keys and
-- values (500), a non-ASCII name (600) and a repeated key inside one multi string (600).
CREATE TABLE hive28728_src (id string, name string, multi string) STORED AS ORC;
INSERT INTO hive28728_src VALUES
('100','hive', 'en:1'),
('200','spark', null),
('300','oozie', 'a:1,b:2'),
('400','airflow', 'ascii:值'),
('500','优惠活动', '上海:北京,优惠活动:折扣'),
('600','日本語', 'val:1,val:2');

-- Single-pair maps built from columns. Rows 500 and 600 carry non-ASCII values.
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id;
-- Multi-pair maps read from the multi column, including non-ASCII keys and values.
-- The repeated key in row 600 is expected to keep only its last value.
SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id;
-- Lookup by a non-ASCII key. This only succeeds if the key bytes were preserved.
SELECT STR_TO_MAP(multi, ',', ':')['优惠活动'] FROM hive28728_src WHERE id = '500';
-- Constant (literal) non-ASCII input instead of a column.
SELECT STR_TO_MAP('优惠活动:折扣,北京:海淀', ',', ':');

-- Edge case: NULL input (row 200). The expected output is an empty map.
SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE id = '200';
-- Edge case: input without a key-value delimiter. The expected output is the key mapped to null.
SELECT STR_TO_MAP('700', ',', ':');

-- Vectorized INSERT OVERWRITE: write the STR_TO_MAP() results into ORC tables with a
-- MAP<STRING, STRING> column and read them back, so the stored bytes are checked as well
-- as the directly returned ones.
-- Single-pair maps built from id and name.
CREATE TABLE hive28728_result (cd MAP<STRING, STRING>) STORED AS ORC;
INSERT OVERWRITE TABLE hive28728_result
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src;
-- ORDER BY on the map column itself keeps the read-back output deterministic.
SELECT * FROM hive28728_result ORDER BY cd;

-- Multi-pair maps taken from the multi column (NULL rows excluded).
CREATE TABLE hive28728_multi (cd MAP<STRING, STRING>) STORED AS ORC;
INSERT OVERWRITE TABLE hive28728_multi
SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id;
SELECT * FROM hive28728_multi ORDER BY cd;

-- Non-vectorized baseline: repeat the single-pair INSERT OVERWRITE and SELECT with
-- vectorization turned off. The expected output is identical to the vectorized runs.
SET hive.vectorized.execution.enabled=false;
CREATE TABLE hive28728_result_novec (cd MAP<STRING, STRING>) STORED AS ORC;
INSERT OVERWRITE TABLE hive28728_result_novec
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src;
SELECT * FROM hive28728_result_novec ORDER BY cd;

-- Same query as the first vectorized SELECT, now on the non-vectorized path.
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id;
206 changes: 206 additions & 0 deletions ql/src/test/results/clientpositive/llap/str_to_map_utf8.q.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
PREHOOK: query: CREATE TABLE hive28728_src (id string, name string, multi string) STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@hive28728_src
POSTHOOK: query: CREATE TABLE hive28728_src (id string, name string, multi string) STORED AS ORC
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@hive28728_src
PREHOOK: query: INSERT INTO hive28728_src VALUES
('100','hive', 'en:1'),
('200','spark', null),
('300','oozie', 'a:1,b:2'),
('400','airflow', 'ascii:值'),
('500','优惠活动', '上海:北京,优惠活动:折扣'),
('600','日本語', 'val:1,val:2')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@hive28728_src
POSTHOOK: query: INSERT INTO hive28728_src VALUES
('100','hive', 'en:1'),
('200','spark', null),
('300','oozie', 'a:1,b:2'),
('400','airflow', 'ascii:值'),
('500','优惠活动', '上海:北京,优惠活动:折扣'),
('600','日本語', 'val:1,val:2')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@hive28728_src
POSTHOOK: Lineage: hive28728_src.id SCRIPT []
POSTHOOK: Lineage: hive28728_src.multi SCRIPT []
POSTHOOK: Lineage: hive28728_src.name SCRIPT []
PREHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"100":"hive"}
{"200":"spark"}
{"300":"oozie"}
{"400":"airflow"}
{"500":"优惠活动"}
{"600":"日本語"}
PREHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"en":"1"}
{"a":"1","b":"2"}
{"ascii":"值"}
{"上海":"北京","优惠活动":"折扣"}
{"val":"2"}
PREHOOK: query: SELECT STR_TO_MAP(multi, ',', ':')['优惠活动'] FROM hive28728_src WHERE id = '500'
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP(multi, ',', ':')['优惠活动'] FROM hive28728_src WHERE id = '500'
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: hdfs://### HDFS PATH ###
折扣
PREHOOK: query: SELECT STR_TO_MAP('优惠活动:折扣,北京:海淀', ',', ':')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP('优惠活动:折扣,北京:海淀', ',', ':')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"优惠活动":"折扣","北京":"海淀"}
PREHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE id = '200'
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE id = '200'
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: hdfs://### HDFS PATH ###
{}
PREHOOK: query: SELECT STR_TO_MAP('700', ',', ':')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP('700', ',', ':')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"700":null}
PREHOOK: query: CREATE TABLE hive28728_result (cd MAP<STRING, STRING>) STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@hive28728_result
POSTHOOK: query: CREATE TABLE hive28728_result (cd MAP<STRING, STRING>) STORED AS ORC
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@hive28728_result
PREHOOK: query: INSERT OVERWRITE TABLE hive28728_result
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: default@hive28728_result
POSTHOOK: query: INSERT OVERWRITE TABLE hive28728_result
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: default@hive28728_result
POSTHOOK: Lineage: hive28728_result.cd EXPRESSION [(hive28728_src)hive28728_src.FieldSchema(name:id, type:string, comment:null), (hive28728_src)hive28728_src.FieldSchema(name:name, type:string, comment:null), ]
PREHOOK: query: SELECT * FROM hive28728_result ORDER BY cd
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_result
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT * FROM hive28728_result ORDER BY cd
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_result
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"100":"hive"}
{"200":"spark"}
{"300":"oozie"}
{"400":"airflow"}
{"500":"优惠活动"}
{"600":"日本語"}
PREHOOK: query: CREATE TABLE hive28728_multi (cd MAP<STRING, STRING>) STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@hive28728_multi
POSTHOOK: query: CREATE TABLE hive28728_multi (cd MAP<STRING, STRING>) STORED AS ORC
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@hive28728_multi
PREHOOK: query: INSERT OVERWRITE TABLE hive28728_multi
SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: default@hive28728_multi
POSTHOOK: query: INSERT OVERWRITE TABLE hive28728_multi
SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: default@hive28728_multi
POSTHOOK: Lineage: hive28728_multi.cd EXPRESSION [(hive28728_src)hive28728_src.FieldSchema(name:multi, type:string, comment:null), ]
PREHOOK: query: SELECT * FROM hive28728_multi ORDER BY cd
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_multi
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT * FROM hive28728_multi ORDER BY cd
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_multi
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"a":"1","b":"2"}
{"ascii":"值"}
{"en":"1"}
{"val":"2"}
{"上海":"北京","优惠活动":"折扣"}
PREHOOK: query: CREATE TABLE hive28728_result_novec (cd MAP<STRING, STRING>) STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@hive28728_result_novec
POSTHOOK: query: CREATE TABLE hive28728_result_novec (cd MAP<STRING, STRING>) STORED AS ORC
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@hive28728_result_novec
PREHOOK: query: INSERT OVERWRITE TABLE hive28728_result_novec
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: default@hive28728_result_novec
POSTHOOK: query: INSERT OVERWRITE TABLE hive28728_result_novec
SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: default@hive28728_result_novec
POSTHOOK: Lineage: hive28728_result_novec.cd EXPRESSION [(hive28728_src)hive28728_src.FieldSchema(name:id, type:string, comment:null), (hive28728_src)hive28728_src.FieldSchema(name:name, type:string, comment:null), ]
PREHOOK: query: SELECT * FROM hive28728_result_novec ORDER BY cd
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_result_novec
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT * FROM hive28728_result_novec ORDER BY cd
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_result_novec
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"100":"hive"}
{"200":"spark"}
{"300":"oozie"}
{"400":"airflow"}
{"500":"优惠活动"}
{"600":"日本語"}
PREHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
PREHOOK: type: QUERY
PREHOOK: Input: default@hive28728_src
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive28728_src
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"100":"hive"}
{"200":"spark"}
{"300":"oozie"}
{"400":"airflow"}
{"500":"优惠活动"}
{"600":"日本語"}
Loading