diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 51b7c44a8540ea..01c27c6e5f2018 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1496,6 +1496,41 @@ DEFINE_mInt64(hive_sink_max_file_size, "1073741824"); // 1GB
 
 /** Iceberg sink configurations **/
 DEFINE_mInt64(iceberg_sink_max_file_size, "1073741824"); // 1GB
 
+// URI scheme to Doris file type mappings used by paimon-cpp DorisFileSystem.
+// Each entry uses the format "<scheme>=<file_type>", and file_type must be one of:
+// local, hdfs, s3, http, broker.
+DEFINE_Strings(paimon_file_system_scheme_mappings,
+               "file=local,hdfs=hdfs,viewfs=hdfs,local=hdfs,jfs=hdfs,"
+               "s3=s3,s3a=s3,s3n=s3,oss=s3,obs=s3,cos=s3,cosn=s3,gs=s3,"
+               "abfs=s3,abfss=s3,wasb=s3,wasbs=s3,http=http,https=http,"
+               "ofs=broker,gfs=broker");
+DEFINE_Validator(paimon_file_system_scheme_mappings,
+                 ([](const std::vector<std::string>& mappings) -> bool {
+                     doris::StringCaseUnorderedSet seen_schemes;
+                     static const doris::StringCaseUnorderedSet supported_types = {
+                             "local", "hdfs", "s3", "http", "broker"};
+                     for (const auto& raw_entry : mappings) {
+                         std::string_view entry = doris::trim(raw_entry);
+                         size_t separator = entry.find('=');
+                         if (separator == std::string_view::npos) {
+                             return false;
+                         }
+                         std::string scheme = std::string(doris::trim(entry.substr(0, separator)));
+                         std::string file_type =
+                                 std::string(doris::trim(entry.substr(separator + 1)));
+                         if (scheme.empty() || file_type.empty()) {
+                             return false;
+                         }
+                         if (supported_types.find(file_type) == supported_types.end()) {
+                             return false;
+                         }
+                         if (!seen_schemes.insert(scheme).second) {
+                             return false;
+                         }
+                     }
+                     return true;
+                 }));
+
 DEFINE_mInt32(thrift_client_open_num_tries, "1");
 
 DEFINE_Bool(enable_index_compaction, "false");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 42ddcdd3ff99c1..7ebf0091ca85c5 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1592,6 +1592,9 @@ DECLARE_mInt64(hive_sink_max_file_size);
 
 /** Iceberg sink configurations **/
 DECLARE_mInt64(iceberg_sink_max_file_size);
 
+/** Paimon file system configurations **/
+DECLARE_Strings(paimon_file_system_scheme_mappings);
+
 // Number of open tries, default 1 means only try to open once.
 // Retry the Open num_retries time waiting 100 milliseconds between retries.
 DECLARE_mInt32(thrift_client_open_num_tries);
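Note on the new option: the value is a single comma-separated list, and each entry must match "<scheme>=<file_type>" with optional whitespace around either side of the '='; duplicate schemes and unknown file types are rejected by the validator above. A minimal standalone sketch of that entry grammar follows (the trim() here is a simplified stand-in for the doris::trim helper, and the real validator additionally treats schemes case-insensitively via StringCaseUnorderedSet):

#include <iostream>
#include <optional>
#include <string>
#include <unordered_set>

// Simplified stand-in for the doris::trim helper used by the validator above.
static std::string trim(const std::string& s) {
    size_t b = s.find_first_not_of(" \t");
    if (b == std::string::npos) {
        return "";
    }
    size_t e = s.find_last_not_of(" \t");
    return s.substr(b, e - b + 1);
}

struct Mapping {
    std::string scheme;
    std::string file_type;
};

// Parses one "<scheme>=<file_type>" entry with the same checks as the validator:
// a '=' split, non-empty sides, and a file_type from the supported set.
static std::optional<Mapping> parse_entry(const std::string& raw) {
    static const std::unordered_set<std::string> supported = {"local", "hdfs", "s3", "http",
                                                              "broker"};
    size_t sep = raw.find('=');
    if (sep == std::string::npos) {
        return std::nullopt;
    }
    Mapping m{trim(raw.substr(0, sep)), trim(raw.substr(sep + 1))};
    if (m.scheme.empty() || m.file_type.empty() || supported.count(m.file_type) == 0) {
        return std::nullopt;
    }
    return m;
}

int main() {
    for (const std::string entry : {"s3a=s3", " jfs = hdfs ", "bad-entry", "x=ftp"}) {
        auto parsed = parse_entry(entry);
        std::cout << '"' << entry << "\" -> "
                  << (parsed ? parsed->scheme + " maps to " + parsed->file_type : "rejected")
                  << '\n';
    }
}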
diff --git a/be/src/format/table/paimon_doris_file_system.cpp b/be/src/format/table/paimon_doris_file_system.cpp
index 999b6c684a57ea..bd303b5e26afb8 100644
--- a/be/src/format/table/paimon_doris_file_system.cpp
+++ b/be/src/format/table/paimon_doris_file_system.cpp
@@ -19,8 +19,6 @@
 
 #include
 
-#include <algorithm>
-#include <cctype>
 #include
 #include
 #include
@@ -29,6 +27,7 @@
 #include
 #include
+#include "common/config.h"
 #include "common/status.h"
 #include "io/file_factory.h"
 #include "io/fs/file_reader.h"
@@ -40,6 +39,7 @@
 #include "paimon/fs/file_system_factory.h"
 #include "paimon/result.h"
 #include "paimon/status.h"
+#include "util/string_util.h"
 
 namespace paimon {
 
@@ -48,12 +48,6 @@ struct ParsedUri {
     std::string authority;
 };
 
-std::string to_lower(std::string value) {
-    std::ranges::transform(value, value.begin(),
-                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-    return value;
-}
-
 ParsedUri parse_uri(const std::string& path) {
     ParsedUri parsed;
     size_t scheme_pos = path.find("://");
@@ -65,7 +59,7 @@ ParsedUri parse_uri(const std::string& path) {
     if (scheme_pos == std::string::npos || scheme_pos == 0) {
         return parsed;
     }
-    parsed.scheme = to_lower(path.substr(0, scheme_pos));
+    parsed.scheme = doris::to_lower(path.substr(0, scheme_pos));
     size_t authority_start = scheme_pos + delim_len;
     if (authority_start >= path.size() || path[authority_start] == '/') {
         return parsed;
     }
@@ -79,38 +73,58 @@ ParsedUri parse_uri(const std::string& path) {
     return parsed;
 }
 
-bool is_s3_scheme(const std::string& scheme) {
-    return scheme == "s3" || scheme == "s3a" || scheme == "s3n" || scheme == "oss" ||
-           scheme == "obs" || scheme == "cos" || scheme == "cosn" || scheme == "gs" ||
-           scheme == "abfs" || scheme == "abfss" || scheme == "wasb" || scheme == "wasbs";
-}
-
-bool is_hdfs_scheme(const std::string& scheme) {
-    return scheme == "hdfs" || scheme == "viewfs" || scheme == "local";
+bool parse_scheme_mapping_target(std::string_view raw_target, doris::TFileType::type* type) {
+    std::string target = doris::to_lower(std::string(doris::trim(raw_target)));
+    if (target == "local") {
+        *type = doris::TFileType::FILE_LOCAL;
+        return true;
+    }
+    if (target == "hdfs") {
+        *type = doris::TFileType::FILE_HDFS;
+        return true;
+    }
+    if (target == "s3") {
+        *type = doris::TFileType::FILE_S3;
+        return true;
+    }
+    if (target == "http") {
+        *type = doris::TFileType::FILE_HTTP;
+        return true;
+    }
+    if (target == "broker") {
+        *type = doris::TFileType::FILE_BROKER;
+        return true;
+    }
+    return false;
 }
 
-bool is_http_scheme(const std::string& scheme) {
-    return scheme == "http" || scheme == "https";
+bool parse_scheme_mapping_entry(std::string_view raw_entry, std::string* scheme,
+                                doris::TFileType::type* type) {
+    size_t separator = raw_entry.find('=');
+    if (separator == std::string_view::npos) {
+        return false;
+    }
+    *scheme = doris::to_lower(std::string(doris::trim(raw_entry.substr(0, separator))));
+    if (scheme->empty()) {
+        return false;
+    }
+    return parse_scheme_mapping_target(raw_entry.substr(separator + 1), type);
 }
 
 doris::TFileType::type map_scheme_to_file_type(const std::string& scheme) {
     if (scheme.empty()) {
         return doris::TFileType::FILE_HDFS;
     }
-    if (scheme == "file") {
-        return doris::TFileType::FILE_LOCAL;
-    }
-    if (is_hdfs_scheme(scheme)) {
-        return doris::TFileType::FILE_HDFS;
-    }
-    if (is_s3_scheme(scheme)) {
-        return doris::TFileType::FILE_S3;
-    }
-    if (is_http_scheme(scheme)) {
-        return doris::TFileType::FILE_HTTP;
-    }
-    if (scheme == "ofs" || scheme == "gfs" || scheme == "jfs") {
scheme == "jfs") { - return doris::TFileType::FILE_BROKER; + std::string normalized_scheme = doris::to_lower(scheme); + for (const auto& mapping_entry : doris::config::paimon_file_system_scheme_mappings) { + std::string configured_scheme; + doris::TFileType::type configured_type; + if (!parse_scheme_mapping_entry(mapping_entry, &configured_scheme, &configured_type)) { + continue; + } + if (configured_scheme == normalized_scheme) { + return configured_type; + } } return doris::TFileType::FILE_HDFS; } @@ -149,7 +163,7 @@ std::string normalize_path_for_type(const std::string& path, const std::string& if (type == doris::TFileType::FILE_LOCAL) { return normalize_local_path(path); } - if (type == doris::TFileType::FILE_S3 && scheme != "s3" && !is_http_scheme(scheme)) { + if (type == doris::TFileType::FILE_S3 && scheme != "s3") { return replace_scheme(path, "s3"); } return path; diff --git a/be/src/format/table/paimon_doris_file_system.h b/be/src/format/table/paimon_doris_file_system.h index 1ed2bd8822d42c..561e7aeac30fe4 100644 --- a/be/src/format/table/paimon_doris_file_system.h +++ b/be/src/format/table/paimon_doris_file_system.h @@ -17,9 +17,20 @@ #pragma once +#include + +#include + +namespace paimon { + +// Visible for tests: maps a URI scheme to the Doris file type used by paimon-cpp. +doris::TFileType::type map_scheme_to_file_type(const std::string& scheme); + +} // namespace paimon + namespace doris { // Force-link helper so the paimon-cpp file system factory registration is kept. void register_paimon_doris_file_system(); -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/gen_cpp/CMakeLists.txt b/be/src/gen_cpp/CMakeLists.txt index e91b5e29c141b8..c4463f75520cb0 100644 --- a/be/src/gen_cpp/CMakeLists.txt +++ b/be/src/gen_cpp/CMakeLists.txt @@ -23,6 +23,7 @@ file(GLOB SRC_FILES CONFIGURE_DEPENDS ${GEN_CPP_DIR}/*.cpp ${GEN_CPP_DIR}/*.cc ) +list(FILTER SRC_FILES EXCLUDE REGEX "_server\\.skeleton\\.cpp$") add_compile_options(-Wno-return-type) @@ -43,4 +44,3 @@ endif() #add_dependencies(DorisGen thrift-cpp) #add_dependencies(Opcode function) - diff --git a/be/src/io/file_factory.h b/be/src/io/file_factory.h index 4966137508d62f..7d662e4fdde469 100644 --- a/be/src/io/file_factory.h +++ b/be/src/io/file_factory.h @@ -119,6 +119,7 @@ class FileFactory { case TStorageBackendType::BROKER: return TFileType::FILE_BROKER; case TStorageBackendType::HDFS: + case TStorageBackendType::JFS: return TFileType::FILE_HDFS; default: return ResultError(Status::FatalError("not match type to convert, from type:{}", type)); diff --git a/be/test/format/table/paimon_doris_file_system_test.cpp b/be/test/format/table/paimon_doris_file_system_test.cpp new file mode 100644 index 00000000000000..a2032dff8188c9 --- /dev/null +++ b/be/test/format/table/paimon_doris_file_system_test.cpp @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
diff --git a/be/test/format/table/paimon_doris_file_system_test.cpp b/be/test/format/table/paimon_doris_file_system_test.cpp
new file mode 100644
index 00000000000000..a2032dff8188c9
--- /dev/null
+++ b/be/test/format/table/paimon_doris_file_system_test.cpp
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/table/paimon_doris_file_system.h"
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "common/config.h"
+
+namespace doris {
+
+class PaimonDorisFileSystemTest : public testing::Test {
+protected:
+    void SetUp() override { saved_mappings_ = config::paimon_file_system_scheme_mappings; }
+
+    void TearDown() override { config::paimon_file_system_scheme_mappings = saved_mappings_; }
+
+    std::vector<std::string> saved_mappings_;
+};
+
+TEST_F(PaimonDorisFileSystemTest, UsesDefaultSchemeMappings) {
+    EXPECT_EQ(TFileType::FILE_LOCAL, paimon::map_scheme_to_file_type("file"));
+    EXPECT_EQ(TFileType::FILE_HDFS, paimon::map_scheme_to_file_type("jfs"));
+    EXPECT_EQ(TFileType::FILE_S3, paimon::map_scheme_to_file_type("s3a"));
+    EXPECT_EQ(TFileType::FILE_S3, paimon::map_scheme_to_file_type("gs"));
+    EXPECT_EQ(TFileType::FILE_HTTP, paimon::map_scheme_to_file_type("https"));
+    EXPECT_EQ(TFileType::FILE_BROKER, paimon::map_scheme_to_file_type("ofs"));
+    EXPECT_EQ(TFileType::FILE_HDFS, paimon::map_scheme_to_file_type("unknown"));
+}
+
+TEST_F(PaimonDorisFileSystemTest, AllowsOverridingSchemeMappingsFromConfig) {
+    config::paimon_file_system_scheme_mappings = {"file=local", "jfs = s3", "gs = hdfs",
+                                                  "custom-http = http", "custom-broker = broker"};
+
+    EXPECT_EQ(TFileType::FILE_LOCAL, paimon::map_scheme_to_file_type("file"));
+    EXPECT_EQ(TFileType::FILE_S3, paimon::map_scheme_to_file_type("JFS"));
+    EXPECT_EQ(TFileType::FILE_HDFS, paimon::map_scheme_to_file_type("gs"));
+    EXPECT_EQ(TFileType::FILE_HTTP, paimon::map_scheme_to_file_type("custom-http"));
+    EXPECT_EQ(TFileType::FILE_BROKER, paimon::map_scheme_to_file_type("custom-broker"));
+    EXPECT_EQ(TFileType::FILE_HDFS, paimon::map_scheme_to_file_type("still-unknown"));
+}
+
+} // namespace doris
diff --git a/bin/start_be.sh b/bin/start_be.sh
index 6bc3045b16b740..ac91b5ee0b7eee 100755
--- a/bin/start_be.sh
+++ b/bin/start_be.sh
@@ -254,6 +254,14 @@ if [[ -d "${DORIS_HOME}/lib/java_extensions/jindofs" ]]; then
     done
 fi
 
+# add juicefs
+# must come after the jars in lib/hadoop_hdfs/, or it would override the hadoop jars in lib/hadoop_hdfs
+if [[ -d "${DORIS_HOME}/lib/java_extensions/juicefs" ]]; then
+    for f in "${DORIS_HOME}/lib/java_extensions/juicefs"/*.jar; do
+        DORIS_CLASSPATH="${DORIS_CLASSPATH}:${f}"
+    done
+fi
+
 # add custom_libs to CLASSPATH
 # ATTN, custom_libs is deprecated, use plugins/java_extensions
 if [[ -d "${DORIS_HOME}/custom_lib" ]]; then
diff --git a/bin/start_fe.sh b/bin/start_fe.sh
index 85f97696f9bc29..9657946e9696db 100755
--- a/bin/start_fe.sh
+++ b/bin/start_fe.sh
@@ -368,6 +368,14 @@ if [[ -d "${DORIS_HOME}/lib/jindofs" ]]; then
     done
 fi
 
+# add juicefs
+# must come after the jars in lib/, or it would override the hadoop jars in lib/
+if [[ -d "${DORIS_HOME}/lib/juicefs" ]]; then
+    for f in "${DORIS_HOME}/lib/juicefs"/*.jar; do
+        CLASSPATH="${CLASSPATH}:${f}"
+    done
+fi
+
 # add plugins/java_extensions to CLASSPATH
 if [[ -d "${DORIS_HOME}/plugins/java_extensions" ]]; then
     for f in "${DORIS_HOME}/plugins/java_extensions"/*.jar; do
diff --git a/build.sh b/build.sh
index ae2a930b24e805..bf02668d7e4360 100755
--- a/build.sh
+++ b/build.sh
@@ -77,6 +77,7 @@ Usage: $0
     DISABLE_BE_JAVA_EXTENSIONS If set DISABLE_BE_JAVA_EXTENSIONS=ON, we will do not build binary with java-udf,hadoop-hudi-scanner,jdbc-scanner and so on Default is OFF.
     DISABLE_JAVA_CHECK_STYLE   If set DISABLE_JAVA_CHECK_STYLE=ON, it will skip style check of java code in FE.
     DISABLE_BUILD_AZURE        If set DISABLE_BUILD_AZURE=ON, it will not build azure into BE.
+    DISABLE_BUILD_JUICEFS      If set DISABLE_BUILD_JUICEFS=ON, it will skip packaging juicefs-hadoop jar into FE/BE output.
 
   Eg.
     $0 build all
@@ -137,6 +138,45 @@ function copy_common_files() {
     cp -r -p "${DORIS_HOME}/dist/licenses" "$1/"
 }
 
+. "${DORIS_HOME}/docker/thirdparties/juicefs-helpers.sh"
+
+find_juicefs_hadoop_jar() {
+    juicefs_find_hadoop_jar_by_globs \
+        "${DORIS_THIRDPARTY}/installed/juicefs_libs/juicefs-hadoop-[0-9]*.jar" \
+        "${DORIS_THIRDPARTY}/src/juicefs-hadoop-[0-9]*.jar" \
+        "${DORIS_HOME}/thirdparty/installed/juicefs_libs/juicefs-hadoop-[0-9]*.jar" \
+        "${DORIS_HOME}/thirdparty/src/juicefs-hadoop-[0-9]*.jar"
+}
+
+detect_juicefs_version() {
+    local juicefs_jar
+    juicefs_jar=$(find_juicefs_hadoop_jar || true)
+    juicefs_detect_hadoop_version "${juicefs_jar}" "${JUICEFS_DEFAULT_VERSION}"
+}
+
+download_juicefs_hadoop_jar() {
+    local juicefs_version="$1"
+    local cache_dir="${DORIS_HOME}/thirdparty/installed/juicefs_libs"
+    juicefs_download_hadoop_jar_to_cache "${juicefs_version}" "${cache_dir}"
+}
+
+copy_juicefs_hadoop_jar() {
+    local target_dir="$1"
+    local source_jar=""
+    source_jar=$(find_juicefs_hadoop_jar || true)
+    if [[ -z "${source_jar}" ]]; then
+        local juicefs_version
+        juicefs_version=$(detect_juicefs_version)
+        source_jar=$(download_juicefs_hadoop_jar "${juicefs_version}" || true)
+    fi
+    if [[ -z "${source_jar}" ]]; then
+        echo "WARN: skip copying juicefs-hadoop jar, not found in thirdparty and download failed"
+        return 0
+    fi
+    cp -r -p "${source_jar}" "${target_dir}/"
+    echo "Copy JuiceFS Hadoop jar to ${target_dir}: $(basename "${source_jar}")"
+}
+
OPTS="$(getopt \ -n "$0" \ -o '' \ @@ -489,6 +529,12 @@ else BUILD_JINDOFS='ON' fi +if [[ "$(echo "${DISABLE_BUILD_JUICEFS}" | tr '[:lower:]' '[:upper:]')" == "ON" ]]; then + BUILD_JUICEFS='OFF' +else + BUILD_JUICEFS='ON' +fi + if [[ -z "${ENABLE_INJECTION_POINT}" ]]; then ENABLE_INJECTION_POINT='OFF' fi @@ -537,6 +583,7 @@ echo "Get params: BUILD_BE_JAVA_EXTENSIONS -- ${BUILD_BE_JAVA_EXTENSIONS} BUILD_BE_CDC_CLIENT -- ${BUILD_BE_CDC_CLIENT} BUILD_HIVE_UDF -- ${BUILD_HIVE_UDF} + BUILD_JUICEFS -- ${BUILD_JUICEFS} PARALLEL -- ${PARALLEL} CLEAN -- ${CLEAN} GLIBC_COMPATIBILITY -- ${GLIBC_COMPATIBILITY} @@ -756,6 +803,57 @@ function build_ui() { cp -r "${ui_dist}"/* "${DORIS_HOME}/fe/fe-core/src/main/resources/static"/ } +function build_fe_modules() { + local thread_count="${FE_MAVEN_THREADS:-1C}" + local retry_thread_count="${FE_MAVEN_RETRY_THREADS:-1}" + local log_file + local -a dependency_mvn_opts=() + local -a extra_mvn_opts=() + local -a user_settings_opts=() + local -a mvn_cmd=( + "${MVN_CMD}" + package + -pl + "${FE_MODULES}" + -am + -Dskip.doc=true + -DskipTests + ) + + if [[ "${DISABLE_JAVA_CHECK_STYLE}" = "ON" ]]; then + mvn_cmd+=("-Dcheckstyle.skip=true") + fi + if [[ -n "${MVN_OPT}" ]]; then + # shellcheck disable=SC2206 + extra_mvn_opts=(${MVN_OPT}) + fi + if [[ "${BUILD_OBS_DEPENDENCIES}" -eq 0 ]]; then + dependency_mvn_opts+=("-Dobs.dependency.scope=provided") + fi + if [[ "${BUILD_COS_DEPENDENCIES}" -eq 0 ]]; then + dependency_mvn_opts+=("-Dcos.dependency.scope=provided") + fi + if [[ -n "${USER_SETTINGS_MVN_REPO}" && -f "${USER_SETTINGS_MVN_REPO}" ]]; then + user_settings_opts=(-gs "${USER_SETTINGS_MVN_REPO}") + fi + + mvn_cmd+=("${extra_mvn_opts[@]}" "${dependency_mvn_opts[@]}" "${user_settings_opts[@]}" -T "${thread_count}") + log_file="$(mktemp)" + if "${mvn_cmd[@]}" 2>&1 | tee "${log_file}"; then + rm -f "${log_file}" + return 0 + fi + if [[ "${thread_count}" != "${retry_thread_count}" ]] && grep -Fq "Could not acquire lock(s)" "${log_file}"; then + echo "FE Maven build hit Maven resolver lock contention. Retrying with -T ${retry_thread_count}." 
+ mvn_cmd=("${mvn_cmd[@]:0:${#mvn_cmd[@]}-2}" -T "${retry_thread_count}") + "${mvn_cmd[@]}" + rm -f "${log_file}" + return 0 + fi + rm -f "${log_file}" + return 1 +} + # FE UI must be built before building FE if [[ "${BUILD_FE}" -eq 1 ]]; then if [[ "${BUILD_UI}" -eq 1 ]]; then @@ -770,28 +868,7 @@ if [[ "${FE_MODULES}" != '' ]]; then if [[ "${CLEAN}" -eq 1 ]]; then clean_fe fi - DEPENDENCIES_MVN_OPTS=" " - if [[ "${BUILD_OBS_DEPENDENCIES}" -eq 0 ]]; then - DEPENDENCIES_MVN_OPTS+=" -Dobs.dependency.scope=provided " - fi - if [[ "${BUILD_COS_DEPENDENCIES}" -eq 0 ]]; then - DEPENDENCIES_MVN_OPTS+=" -Dcos.dependency.scope=provided " - fi - - if [[ "${DISABLE_JAVA_CHECK_STYLE}" = "ON" ]]; then - # Allowed user customer set env param USER_SETTINGS_MVN_REPO means settings.xml file path - if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} ${DEPENDENCIES_MVN_OPTS} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C - else - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} ${DEPENDENCIES_MVN_OPTS} -T 1C - fi - else - if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} ${DEPENDENCIES_MVN_OPTS} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C - else - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} ${DEPENDENCIES_MVN_OPTS} -T 1C - fi - fi + build_fe_modules cd "${DORIS_HOME}" fi @@ -813,6 +890,9 @@ if [[ "${BUILD_FE}" -eq 1 ]]; then if [[ "${BUILD_JINDOFS}" == "ON" ]]; then install -d "${DORIS_OUTPUT}/fe/lib/jindofs" fi + if [[ "${BUILD_JUICEFS}" == "ON" ]]; then + install -d "${DORIS_OUTPUT}/fe/lib/juicefs" + fi cp -r -p "${DORIS_HOME}/fe/fe-core/target/lib"/* "${DORIS_OUTPUT}/fe/lib"/ cp -r -p "${DORIS_HOME}/fe/fe-core/target/doris-fe.jar" "${DORIS_OUTPUT}/fe/lib"/ if [[ "${WITH_TDE_DIR}" != "" ]]; then @@ -833,6 +913,11 @@ if [[ "${BUILD_FE}" -eq 1 ]]; then fi fi + # copy juicefs hadoop client jar + if [[ "${BUILD_JUICEFS}" == "ON" ]]; then + copy_juicefs_hadoop_jar "${DORIS_OUTPUT}/fe/lib/juicefs" + fi + cp -r -p "${DORIS_HOME}/minidump" "${DORIS_OUTPUT}/fe"/ cp -r -p "${DORIS_HOME}/webroot/static" "${DORIS_OUTPUT}/fe/webroot"/ @@ -1011,6 +1096,12 @@ EOF cp -r -p "${DORIS_THIRDPARTY}"/installed/jindofs_libs/jindo-sdk-[0-9]*.jar "${DORIS_OUTPUT}/be/lib/java_extensions/jindofs"/ fi fi + if [[ "${BUILD_JUICEFS}" == "ON" ]]; then + install -d "${DORIS_OUTPUT}/be/lib/java_extensions/juicefs"/ + + # copy juicefs hadoop client jar + copy_juicefs_hadoop_jar "${DORIS_OUTPUT}/be/lib/java_extensions/juicefs" + fi cp -r -p "${DORIS_THIRDPARTY}/installed/webroot"/* "${DORIS_OUTPUT}/be/www"/ copy_common_files "${DORIS_OUTPUT}/be/" diff --git a/docker/thirdparties/docker-compose/hive/hadoop-hive-3x.env.tpl b/docker/thirdparties/docker-compose/hive/hadoop-hive-3x.env.tpl index 5698c84ff2b838..a5676e8aed9128 100644 --- a/docker/thirdparties/docker-compose/hive/hadoop-hive-3x.env.tpl +++ b/docker/thirdparties/docker-compose/hive/hadoop-hive-3x.env.tpl @@ -22,6 +22,8 @@ HIVE_SITE_CONF_hive_metastore_transactional_event_listeners=org.apache.hive.hcat HIVE_SITE_CONF_hive_stats_column_autogather=false HIVE_SITE_CONF_fs_s3_impl=org.apache.hadoop.fs.s3a.S3AFileSystem 
 HIVE_SITE_CONF_fs_s3a_impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+HIVE_SITE_CONF_fs_jfs_impl=io.juicefs.JuiceFileSystem
+HIVE_SITE_CONF_juicefs_cluster_meta=${JFS_CLUSTER_META}
 HIVE_SITE_CONF_fs_s3a_access_key=${AWSAk}
 HIVE_SITE_CONF_fs_s3a_secret_key=${AWSSk}
 HIVE_SITE_CONF_fs_s3a_endpoint=${AWSEndpoint}
@@ -48,4 +50,3 @@ HIVE_SITE_CONF_fs_gs_auth_service_account_private_key_id=${GCSAccountPrivateKeyId}
 HIVE_SITE_CONF_fs_gs_auth_service_account_private_key=${GCSAccountPrivateKey}
 HIVE_SITE_CONF_fs_gs_proxy_address=${GCSProxyAddress}
 enablePaimonHms=${enablePaimonHms}
-
diff --git a/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl b/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl
index ecbf735216b5c4..d48d497bafa039 100644
--- a/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl
+++ b/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl
@@ -32,6 +32,8 @@ HIVE_SITE_CONF_hive_stats_column_autogather=false
 HIVE_SITE_CONF_hive_exec_parallel=true
 
 CORE_CONF_fs_defaultFS=hdfs://${IP_HOST}:${FS_PORT}
+CORE_CONF_fs_jfs_impl=io.juicefs.JuiceFileSystem
+CORE_CONF_juicefs_cluster_meta=${JFS_CLUSTER_META}
 CORE_CONF_hadoop_http_staticuser_user=root
 CORE_CONF_hadoop_proxyuser_hue_hosts=*
 CORE_CONF_hadoop_proxyuser_hue_groups=*
@@ -62,4 +64,3 @@ HADOOP_HEAPSIZE=4096
 
 NEED_LOAD_DATA=${NEED_LOAD_DATA}
 LOAD_PARALLEL=${LOAD_PARALLEL}
-
diff --git a/docker/thirdparties/docker-compose/hive/hive-2x_settings.env b/docker/thirdparties/docker-compose/hive/hive-2x_settings.env
index 9045bb91683dc5..d076b9dbb5bc52 100644
--- a/docker/thirdparties/docker-compose/hive/hive-2x_settings.env
+++ b/docker/thirdparties/docker-compose/hive/hive-2x_settings.env
@@ -24,3 +24,9 @@ export FS_PORT=8020 # should be same as hive2HmsPort in regression-conf.groovy
 export HMS_PORT=9083 # should be same as hive2HmsPort in regression-conf.groovy
 export HS_PORT=10000 # should be same as hive2ServerPort in regression-conf.groovy
 export PG_PORT=5432 # should be same as hive2PgPort in regression-conf.groovy
+
+# JuiceFS metadata endpoint for property `juicefs.cluster.meta`.
+# CI can override this env, e.g.:
+#   export JFS_CLUSTER_META="mysql://user:pwd@(127.0.0.1:3316)/juicefs_meta"
+# Defaults to mysql_57 (3316) because the external pipeline always starts mysql, but not redis.
+export JFS_CLUSTER_META="${JFS_CLUSTER_META:-mysql://root:123456@(127.0.0.1:3316)/juicefs_meta}"
diff --git a/docker/thirdparties/docker-compose/hive/hive-3x.yaml.tpl b/docker/thirdparties/docker-compose/hive/hive-3x.yaml.tpl
index 5118b6bd65d638..d6e4b1cfba52ef 100644
--- a/docker/thirdparties/docker-compose/hive/hive-3x.yaml.tpl
+++ b/docker/thirdparties/docker-compose/hive/hive-3x.yaml.tpl
@@ -92,6 +92,7 @@ services:
       - "${HMS_PORT}"
     volumes:
       - ./scripts:/mnt/scripts
+      - /tmp/jfs-bucket:/tmp/jfs-bucket
    depends_on:
      hive-metastore-postgresql:
        condition: service_healthy
diff --git a/docker/thirdparties/docker-compose/hive/hive-3x_settings.env b/docker/thirdparties/docker-compose/hive/hive-3x_settings.env
index bf2bc4424f8e83..b0af3bc172bc41 100644
--- a/docker/thirdparties/docker-compose/hive/hive-3x_settings.env
+++ b/docker/thirdparties/docker-compose/hive/hive-3x_settings.env
@@ -25,6 +25,11 @@ export HMS_PORT=9383 # should be same as hive3HmsPort in regression-conf.groovy
 export HS_PORT=13000 # should be same as hive3ServerPort in regression-conf.groovy
 export PG_PORT=5732 # should be same as hive3PgPort in regression-conf.groovy
 
+# JuiceFS metadata endpoint for property `juicefs.cluster.meta`.
+# CI can override this env, e.g.:
+#   export JFS_CLUSTER_META="mysql://user:pwd@(127.0.0.1:3316)/juicefs_meta"
+export JFS_CLUSTER_META="${JFS_CLUSTER_META:-mysql://root:123456@(127.0.0.1:3316)/juicefs_meta}"
+
 # prepare for paimon hms test,control load paimon hms data or not
 export enablePaimonHms="false"
 # hms on s3/oss/obs/cos
@@ -44,4 +49,4 @@ export GCSProjectId=""
 export GCSAccountEmail=""
 export GCSAccountPrivateKeyId=""
 export GCSAccountPrivateKey=""
-export GCSProxyAddress=""
\ No newline at end of file
+export GCSProxyAddress=""
diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index d2ac9fa17a19dc..69d5af071b78bd 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -27,9 +27,21 @@ for file in "${AUX_LIB}"/*.tar.gz; do
 done
 ls "${AUX_LIB}/"
 
-# copy auxiliary jars to hive lib, avoid jars copy
+# Keep existing behavior for Hive metastore classpath.
 cp -r "${AUX_LIB}"/* /opt/hive/lib/
 
+# Add JuiceFS jar into Hadoop classpath for `hadoop fs jfs://...`.
+shopt -s nullglob
+juicefs_jars=("${AUX_LIB}"/juicefs-hadoop-*.jar)
+if (( ${#juicefs_jars[@]} > 0 )); then
+    for target in /opt/hadoop-3.2.1/share/hadoop/common/lib /opt/hadoop/share/hadoop/common/lib; do
+        if [[ -d "${target}" ]]; then
+            cp -f "${juicefs_jars[@]}" "${target}"/
+        fi
+    done
+fi
+shopt -u nullglob
+
 # start metastore
 nohup /opt/hive/bin/hive --service metastore &
diff --git a/docker/thirdparties/juicefs-helpers.sh b/docker/thirdparties/juicefs-helpers.sh
new file mode 100644
index 00000000000000..8ea41189ec3dbf
--- /dev/null
+++ b/docker/thirdparties/juicefs-helpers.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Shared JuiceFS helper functions used by build and docker scripts.
+ +JUICEFS_DEFAULT_VERSION="${JUICEFS_DEFAULT_VERSION:-1.3.1}" +JUICEFS_HADOOP_MAVEN_REPO="${JUICEFS_HADOOP_MAVEN_REPO:-https://repo1.maven.org/maven2/io/juicefs/juicefs-hadoop}" + +juicefs_find_hadoop_jar_by_globs() { + local jar_glob="" + local matched_jar="" + for jar_glob in "$@"; do + matched_jar=$(compgen -G "${jar_glob}" | head -n 1 || true) + if [[ -n "${matched_jar}" ]]; then + echo "${matched_jar}" + return 0 + fi + done + return 1 +} + +juicefs_detect_hadoop_version() { + local juicefs_jar="$1" + local default_version="${2:-${JUICEFS_DEFAULT_VERSION}}" + if [[ -z "${juicefs_jar}" ]]; then + echo "${default_version}" + return 0 + fi + juicefs_jar=$(basename "${juicefs_jar}") + juicefs_jar=${juicefs_jar#juicefs-hadoop-} + echo "${juicefs_jar%.jar}" +} + +juicefs_hadoop_jar_download_url() { + local juicefs_version="$1" + local jar_name="juicefs-hadoop-${juicefs_version}.jar" + echo "${JUICEFS_HADOOP_MAVEN_REPO}/${juicefs_version}/${jar_name}" +} + +juicefs_download_hadoop_jar_to_cache() { + local juicefs_version="$1" + local cache_dir="$2" + local jar_name="juicefs-hadoop-${juicefs_version}.jar" + local target_jar="${cache_dir}/${jar_name}" + local download_url + download_url=$(juicefs_hadoop_jar_download_url "${juicefs_version}") + + mkdir -p "${cache_dir}" + if [[ -s "${target_jar}" ]]; then + echo "${target_jar}" + return 0 + fi + + echo "Downloading JuiceFS Hadoop jar ${juicefs_version} from ${download_url}" >&2 + if command -v curl >/dev/null 2>&1; then + if curl -fL --retry 3 --retry-delay 2 --connect-timeout 10 -o "${target_jar}" "${download_url}"; then + echo "${target_jar}" + return 0 + fi + elif command -v wget >/dev/null 2>&1; then + if wget -q "${download_url}" -O "${target_jar}"; then + echo "${target_jar}" + return 0 + fi + fi + + rm -f "${target_jar}" + return 1 +} diff --git a/docker/thirdparties/run-thirdparties-docker.sh b/docker/thirdparties/run-thirdparties-docker.sh index 8731880a6bc001..cd8540d5d328cc 100755 --- a/docker/thirdparties/run-thirdparties-docker.sh +++ b/docker/thirdparties/run-thirdparties-docker.sh @@ -25,6 +25,7 @@ set -eo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" . "${ROOT}/custom_settings.env" +. "${ROOT}/juicefs-helpers.sh" usage() { echo " @@ -230,6 +231,178 @@ reserve_ports() { fi } +JFS_META_FORMATTED=0 +DORIS_ROOT="${DORIS_ROOT:-$(cd "${ROOT}/../.." 
&>/dev/null && pwd)}" +JUICEFS_RUNTIME_ROOT="${ROOT}/juicefs" + +JUICEFS_LOCAL_BIN="${JUICEFS_RUNTIME_ROOT}/bin/juicefs" + +find_juicefs_hadoop_jar() { + local -a jar_globs=( + "${JUICEFS_RUNTIME_ROOT}/lib/juicefs-hadoop-[0-9]*.jar" + "${ROOT}/docker-compose/hive/scripts/auxlib/juicefs-hadoop-[0-9]*.jar" + "${DORIS_ROOT}/thirdparty/installed/juicefs_libs/juicefs-hadoop-[0-9]*.jar" + "${DORIS_ROOT}/output/fe/lib/juicefs/juicefs-hadoop-[0-9]*.jar" + "${DORIS_ROOT}/output/be/lib/java_extensions/juicefs/juicefs-hadoop-[0-9]*.jar" + "${DORIS_ROOT}/../../../clusterEnv/*/Cluster*/fe/lib/juicefs/juicefs-hadoop-[0-9]*.jar" + "${DORIS_ROOT}/../../../clusterEnv/*/Cluster*/be/lib/java_extensions/juicefs/juicefs-hadoop-[0-9]*.jar" + "/mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/*/Cluster*/fe/lib/juicefs/juicefs-hadoop-[0-9]*.jar" + "/mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/*/Cluster*/be/lib/java_extensions/juicefs/juicefs-hadoop-[0-9]*.jar" + ) + juicefs_find_hadoop_jar_by_globs "${jar_globs[@]}" +} + +detect_juicefs_version() { + local juicefs_jar + juicefs_jar=$(find_juicefs_hadoop_jar || true) + juicefs_detect_hadoop_version "${juicefs_jar}" "${JUICEFS_DEFAULT_VERSION}" +} + +download_juicefs_hadoop_jar() { + local juicefs_version="$1" + local cache_dir="${JUICEFS_RUNTIME_ROOT}/lib" + juicefs_download_hadoop_jar_to_cache "${juicefs_version}" "${cache_dir}" +} + +install_juicefs_cli() { + local juicefs_version="$1" + local cache_dir="${JUICEFS_RUNTIME_ROOT}/bin" + local archive_name="juicefs-${juicefs_version}-linux-amd64.tar.gz" + local download_url="https://github.com/juicedata/juicefs/releases/download/v${juicefs_version}/${archive_name}" + local tmp_dir + local extracted_bin + + mkdir -p "${cache_dir}" + tmp_dir=$(mktemp -d "${cache_dir}/tmp.XXXXXX") + + echo "Downloading JuiceFS CLI ${juicefs_version} from ${download_url}" >&2 + if ! 
+    if ! curl -fL --retry 3 --retry-delay 2 -o "${tmp_dir}/${archive_name}" "${download_url}"; then
+        rm -rf "${tmp_dir}"
+        echo "ERROR: failed to download JuiceFS CLI from ${download_url}" >&2
+        return 1
+    fi
+
+    tar -xzf "${tmp_dir}/${archive_name}" -C "${tmp_dir}"
+    extracted_bin=$(find "${tmp_dir}" -maxdepth 2 -type f -name juicefs | head -n 1)
+    if [[ -z "${extracted_bin}" ]]; then
+        rm -rf "${tmp_dir}"
+        echo "ERROR: failed to locate extracted JuiceFS CLI in ${archive_name}" >&2
+        return 1
+    fi
+
+    install -m 0755 "${extracted_bin}" "${JUICEFS_LOCAL_BIN}"
+    rm -rf "${tmp_dir}"
+}
+
+resolve_juicefs_cli() {
+    local juicefs_version
+
+    if command -v juicefs >/dev/null 2>&1; then
+        command -v juicefs
+        return 0
+    fi
+
+    if [[ -x "${JUICEFS_LOCAL_BIN}" ]]; then
+        echo "${JUICEFS_LOCAL_BIN}"
+        return 0
+    fi
+
+    juicefs_version=$(detect_juicefs_version)
+    install_juicefs_cli "${juicefs_version}" || return 1
+    echo "${JUICEFS_LOCAL_BIN}"
+}
+
+ensure_juicefs_meta_database() {
+    local jfs_meta="$1"
+    local meta_db
+    local mysql_container
+
+    if [[ "${jfs_meta}" != *"@(127.0.0.1:3316)/"* && "${jfs_meta}" != *"@(localhost:3316)/"* ]]; then
+        return 0
+    fi
+
+    meta_db="${jfs_meta##*/}"
+    meta_db="${meta_db%%\?*}"
+
+    if command -v mysql >/dev/null 2>&1; then
+        mysql -h127.0.0.1 -P3316 -uroot -p123456 -e "CREATE DATABASE IF NOT EXISTS \`${meta_db}\`;"
+        return 0
+    fi
+
+    mysql_container=$(sudo docker ps --format '{{.Names}}' | grep -E "(^|-)${CONTAINER_UID}mysql_57(-[0-9]+)?$" | head -n 1 || true)
+    if [[ -n "${mysql_container}" ]]; then
+        sudo docker exec "${mysql_container}" \
+            mysql -uroot -p123456 -e "CREATE DATABASE IF NOT EXISTS \`${meta_db}\`;"
+    fi
+}
+
+run_juicefs_cli() {
+    local juicefs_cli
+    juicefs_cli=$(resolve_juicefs_cli)
+    "${juicefs_cli}" "$@"
+}
+
+ensure_juicefs_hadoop_jar_for_hive() {
+    local auxlib_dir="${ROOT}/docker-compose/hive/scripts/auxlib"
+    local source_jar
+    local juicefs_version
+
+    source_jar=$(find_juicefs_hadoop_jar || true)
+    if [[ -z "${source_jar}" ]]; then
+        juicefs_version=$(detect_juicefs_version)
+        source_jar=$(download_juicefs_hadoop_jar "${juicefs_version}" || true)
+    fi
+
+    if [[ -z "${source_jar}" ]]; then
+        echo "WARN: skip syncing juicefs-hadoop jar for hive, not found and download failed."
+        return 0
+    fi
+
+    mkdir -p "${auxlib_dir}"
+    cp -f "${source_jar}" "${auxlib_dir}/"
+    echo "Synced JuiceFS Hadoop jar to hive auxlib: $(basename "${source_jar}")"
+}
+
+prepare_juicefs_meta_for_hive() {
+    local jfs_meta="$1"
+    local jfs_cluster_name="${2:-cluster}"
+    if [[ -z "${jfs_meta}" || "${jfs_meta}" != mysql://* ]]; then
+        return 0
+    fi
+    if [[ "${JFS_META_FORMATTED}" -eq 1 ]]; then
+        return 0
+    fi
+
+    local bucket_dir="${JFS_BUCKET_DIR:-/tmp/jfs-bucket}"
+    sudo mkdir -p "${bucket_dir}"
+    sudo chmod 777 "${bucket_dir}"
+
+    # For local mysql_57 metadata DSN, ensure metadata database exists.
+    ensure_juicefs_meta_database "${jfs_meta}"
+
+    if run_juicefs_cli status "${jfs_meta}" >/dev/null 2>&1; then
+        echo "JuiceFS metadata is already formatted."
+        JFS_META_FORMATTED=1
+        return 0
+    fi
+
+    # Clean stale bucket data before formatting. When meta is not formatted,
+    # any leftover data in the bucket directory is orphaned from a previous run
+    # and will cause "juicefs format" to fail with "Storage ... is not empty".
+    if [[ -d "${bucket_dir}" ]]; then
+        echo "Cleaning stale JuiceFS bucket directory: ${bucket_dir}"
+        sudo rm -rf "${bucket_dir:?}"/*
+    fi
+
+    if ! run_juicefs_cli \
+        format --storage file --bucket "${bucket_dir}" "${jfs_meta}" "${jfs_cluster_name}"; then
+        # If format reports conflict on rerun, verify by status and continue.
+        run_juicefs_cli status "${jfs_meta}" >/dev/null
+    fi
+
+    JFS_META_FORMATTED=1
+}
+
 start_es() {
     # elasticsearch
     cp "${ROOT}"/docker-compose/elasticsearch/es.yaml.tpl "${ROOT}"/docker-compose/elasticsearch/es.yaml
@@ -603,6 +776,12 @@ if [[ $need_prepare_hive_data -eq 1 ]]; then
     bash "${ROOT}/docker-compose/hive/scripts/prepare-hive-data.sh"
 fi
 
+if [[ "${STOP}" -ne 1 ]]; then
+    if [[ "${RUN_HIVE2}" -eq 1 ]] || [[ "${RUN_HIVE3}" -eq 1 ]]; then
+        ensure_juicefs_hadoop_jar_for_hive
+    fi
+fi
+
 declare -A pids
 
 if [[ "${RUN_ES}" -eq 1 ]]; then
@@ -723,6 +902,17 @@ for compose in "${!pids[@]}"; do
     fi
 done
 
+if [[ "${STOP}" -ne 1 ]]; then
+    if [[ "${RUN_HIVE2}" -eq 1 ]]; then
+        . "${ROOT}"/docker-compose/hive/hive-2x_settings.env
+        prepare_juicefs_meta_for_hive "${JFS_CLUSTER_META}" "cluster"
+    fi
+    if [[ "${RUN_HIVE3}" -eq 1 ]]; then
+        . "${ROOT}"/docker-compose/hive/hive-3x_settings.env
+        prepare_juicefs_meta_for_hive "${JFS_CLUSTER_META}" "cluster"
+    fi
+fi
+
 echo "docker started"
 sudo docker ps -a --format "{{.ID}} | {{.Image}} | {{.Status}}"
 echo "all dockers started successfully"
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/BrokerDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/BrokerDesc.java
index c77937cc8c3568..2833cf977d3442 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/BrokerDesc.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/BrokerDesc.java
@@ -141,12 +141,12 @@ public TFileType getFileType() {
             case S3:
                 return TFileType.FILE_S3;
             case HDFS:
+            case JFS:
                 return TFileType.FILE_HDFS;
             case STREAM:
                 return TFileType.FILE_STREAM;
             case BROKER:
             case OFS:
-            case JFS:
             default:
                 return TFileType.FILE_BROKER;
         }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
index c0fc647fd43f18..17250c5cb90b74 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
@@ -617,8 +617,9 @@ private void analyzeBrokerDesc(Map<String, String> copiedProps) throws UserException {
          * - Centralize HDFS URI parsing logic
          * - Add validation in FE to reject incomplete or malformed configs
          */
-        if (null != brokerDesc.getStorageType() && brokerDesc.getStorageType()
-                .equals(StorageBackend.StorageType.HDFS)) {
+        if (null != brokerDesc.getStorageType() && (brokerDesc.getStorageType()
+                .equals(StorageBackend.StorageType.HDFS)
+                || brokerDesc.getStorageType().equals(StorageBackend.StorageType.JFS))) {
             String defaultFs = HdfsPropertiesUtils.extractDefaultFsFromPath(filePath);
             brokerDesc.getBackendConfigProperties().put(HdfsProperties.HDFS_DEFAULT_FS_NAME, defaultFs);
         }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Resource.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Resource.java
index de0c5542b3eaab..db0659c4b08644 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Resource.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Resource.java
@@ -61,6 +61,9 @@ public enum ResourceType {
         AI;
 
         public static ResourceType fromString(String resourceType) {
+            if ("jfs".equalsIgnoreCase(resourceType) || "juicefs".equalsIgnoreCase(resourceType)) {
+                return HDFS;
+            }
             for (ResourceType type : ResourceType.values()) {
                 if (type.name().equalsIgnoreCase(resourceType)) {
                    return type;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/ResourceMgr.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/ResourceMgr.java
index 81ce3b3009bdb6..1135a8a35faa81 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/ResourceMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/ResourceMgr.java
@@ -75,7 +75,8 @@ public ResourceMgr() {
     public void createResource(CreateResourceCommand command) throws DdlException {
         CreateResourceInfo info = command.getInfo();
         if (info.getResourceType() == ResourceType.UNKNOWN) {
-            throw new DdlException("Only support SPARK, ODBC_CATALOG ,JDBC, S3_COOLDOWN, S3, HDFS and HMS resource.");
+            throw new DdlException(
+                    "Only support SPARK, ODBC_CATALOG, JDBC, S3_COOLDOWN, S3, HDFS(JFS/JUICEFS), and HMS resource.");
         }
         Resource resource = Resource.fromCommand(command);
         if (createResource(resource, info.isIfNotExists())) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HdfsProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HdfsProperties.java
index 1ef4c5f921d119..f2077b45ad79ae 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HdfsProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/HdfsProperties.java
@@ -88,7 +88,7 @@ public class HdfsProperties extends HdfsCompatibleProperties {
 
     private static final String DFS_NAME_SERVICES_KEY = "dfs.nameservices";
 
-    private static final Set<String> supportSchema = ImmutableSet.of("hdfs", "viewfs");
+    private static final Set<String> supportSchema = ImmutableSet.of("hdfs", "viewfs", "jfs");
 
     /**
      * The final HDFS configuration map that determines the effective settings.
@@ -143,7 +143,8 @@ private void extractUserOverriddenHdfsConfig(Map<String, String> origProps) {
         }
         userOverriddenHdfsConfig = new HashMap<>();
         origProps.forEach((key, value) -> {
-            if (key.startsWith("hadoop.") || key.startsWith("dfs.") || key.startsWith("fs.")) {
+            if (key.startsWith("hadoop.") || key.startsWith("dfs.") || key.startsWith("fs.")
+                    || key.startsWith("juicefs.")) {
                 userOverriddenHdfsConfig.put(key, value);
             }
         });
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java b/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java
index 0686f977d4d1dc..1b6404a0b41033 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/SchemaTypeMapper.java
@@ -58,7 +58,9 @@ public enum SchemaTypeMapper {
     //todo Support for this type is planned but not yet implemented.
     OFS("ofs", StorageProperties.Type.BROKER, FileSystemType.OFS, TFileType.FILE_BROKER),
     GFS("gfs", StorageProperties.Type.BROKER, FileSystemType.HDFS, TFileType.FILE_BROKER),
-    JFS("jfs", StorageProperties.Type.BROKER, FileSystemType.JFS, TFileType.FILE_BROKER),
+    // JuiceFS is mounted through the Hadoop FileSystem implementation in Doris,
+    // so it should follow the HDFS-compatible path.
+ JFS("jfs", StorageProperties.Type.HDFS, FileSystemType.HDFS, TFileType.FILE_HDFS), VIEWFS("viewfs", StorageProperties.Type.HDFS, FileSystemType.HDFS, TFileType.FILE_HDFS), FILE("file", StorageProperties.Type.LOCAL, FileSystemType.FILE, TFileType.FILE_LOCAL), OSS_HDFS("oss", StorageProperties.Type.OSS_HDFS, FileSystemType.HDFS, TFileType.FILE_HDFS), @@ -158,4 +160,3 @@ public static TFileType fromSchemaToFileType(String schema) { return SCHEMA_TO_FILE_TYPE_MAP.get(schema.toLowerCase()); } } - diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java index 8a73619824cddd..9e7351aa3e4df9 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java @@ -103,16 +103,17 @@ public void testHdfsLocationConvert() throws UserException { } @Test - public void testJFSLocationConvert() { + public void testJfsLocationConvertAsHdfsCompatible() { LocationPath locationPath = LocationPath.of("jfs://test.com"); // FE Assertions.assertTrue(locationPath.getNormalizedLocation().startsWith("jfs://")); // BE String loc = locationPath.toStorageLocation().toString(); Assertions.assertTrue(loc.startsWith("jfs://")); - Assertions.assertEquals(FileSystemType.JFS, locationPath.getFileSystemType()); + // JFS is treated as Hadoop-compatible, so Doris maps it to HDFS file system type. + Assertions.assertEquals(FileSystemType.HDFS, locationPath.getFileSystemType()); Assertions.assertEquals("jfs://test.com", locationPath.getFsIdentifier()); - Assertions.assertEquals(TFileType.FILE_BROKER, locationPath.getTFileTypeForBE()); + Assertions.assertEquals(TFileType.FILE_HDFS, locationPath.getTFileTypeForBE()); } @Disabled("not support in master") diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesTest.java index b8ba275e9cfdf5..6554829f9f711e 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesTest.java @@ -41,6 +41,8 @@ public void testBasicHdfsCreate() throws UserException { Map simpleHdfsProperties = new HashMap<>(); simpleHdfsProperties.put("uri", "hdfs://test/1.orc"); Assertions.assertEquals(HdfsProperties.class, StorageProperties.createPrimary(simpleHdfsProperties).getClass()); + simpleHdfsProperties.put("uri", "jfs://test/1.orc"); + Assertions.assertEquals(HdfsProperties.class, StorageProperties.createPrimary(simpleHdfsProperties).getClass()); Map origProps = createBaseHdfsProperties(); List storageProperties = StorageProperties.createAll(origProps); HdfsProperties hdfsProperties = (HdfsProperties) storageProperties.get(0); @@ -151,6 +153,21 @@ public void testBasicCreateByOldProperties() throws UserException { } + @Test + public void testPassThroughJuicefsProperties() throws UserException { + Map origProps = new HashMap<>(); + origProps.put("hdfs.authentication.type", "simple"); + origProps.put("fs.defaultFS", "jfs://cluster"); + origProps.put("fs.jfs.impl", "io.juicefs.JuiceFileSystem"); + origProps.put("juicefs.cluster.meta", "redis://127.0.0.1:6379/1"); + + StorageProperties properties = StorageProperties.createAll(origProps).get(0); + Assertions.assertEquals(HdfsProperties.class, properties.getClass()); + Map beProperties 
+        Map<String, String> beProperties = properties.getBackendConfigProperties();
+        Assertions.assertEquals("redis://127.0.0.1:6379/1", beProperties.get("juicefs.cluster.meta"));
+        Assertions.assertEquals("jfs://cluster", beProperties.get("fs.defaultFS"));
+    }
+
     // Helper methods to reduce code duplication
     private Map<String, String> createBaseHdfsProperties() {
         Map<String, String> origProps = Maps.newHashMap();
diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesUtilsTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesUtilsTest.java
index eed7360206b1b9..8ed1a4776bbf73 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesUtilsTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/HdfsPropertiesUtilsTest.java
@@ -30,7 +30,7 @@ public class HdfsPropertiesUtilsTest {
 
-    private static final Set<String> supportSchema = ImmutableSet.of("hdfs", "viewfs");
+    private static final Set<String> supportSchema = ImmutableSet.of("hdfs", "viewfs", "jfs");
 
     @Test
     public void testCheckLoadPropsAndReturnUri_success() throws Exception {
@@ -69,6 +69,13 @@ public void testConvertUrlToFilePath_valid() throws Exception {
         Assertions.assertEquals("viewfs://cluster/user/test", result);
     }
 
+    @Test
+    public void testConvertUrlToFilePath_jfs() throws Exception {
+        String uri = "jfs://cluster/user/test";
+        String result = HdfsPropertiesUtils.convertUrlToFilePath(uri, "", supportSchema);
+        Assertions.assertEquals("jfs://cluster/user/test", result);
+    }
+
     @Test
     public void testConvertUrlToFilePath_invalidSchema() {
         String uri = "s3://bucket/file.txt";
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/CreateResourceCommandTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/CreateResourceCommandTest.java
index 954d9e90745a13..e822474d5ceaed 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/CreateResourceCommandTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/CreateResourceCommandTest.java
@@ -46,27 +46,37 @@ public void testValidate(@Mocked Env env, @Mocked AccessControllerManager access
         };
 
         // test validate normal
-        ImmutableMap<String, String> properties = ImmutableMap.of("type", "es", "host", "http://127.0.0.1:29200");
-        CreateResourceInfo info = new CreateResourceInfo(true, false, "test", properties);
-        CreateResourceCommand createResourceCommand = new CreateResourceCommand(info);
-        Assertions.assertDoesNotThrow(() -> createResourceCommand.getInfo().validate());
+        final ImmutableMap<String, String> esProperties =
+                ImmutableMap.of("type", "es", "host", "http://127.0.0.1:29200");
+        final CreateResourceInfo esInfo = new CreateResourceInfo(true, false, "test", esProperties);
+        final CreateResourceCommand esCommand = new CreateResourceCommand(esInfo);
+        Assertions.assertDoesNotThrow(() -> esCommand.getInfo().validate());
+
+        // jfs/juicefs should be treated as HDFS-compatible resource type
+        final ImmutableMap<String, String> jfsProperties =
+                ImmutableMap.of("type", "jfs", "fs.defaultFS", "jfs://cluster");
+        final CreateResourceInfo jfsInfo = new CreateResourceInfo(true, false, "test_jfs", jfsProperties);
+        final CreateResourceCommand jfsCommand = new CreateResourceCommand(jfsInfo);
+        Assertions.assertDoesNotThrow(() -> jfsCommand.getInfo().validate());
 
         // test validate abnormal
         // test properties
-        info = new CreateResourceInfo(false, false, "test", null);
-        CreateResourceCommand createResourceCommand1 = new CreateResourceCommand(info);
+        final CreateResourceInfo nullPropertiesInfo = new CreateResourceInfo(false, false, "test", null);
+        final CreateResourceCommand createResourceCommand1 = new CreateResourceCommand(nullPropertiesInfo);
         Assertions.assertThrows(AnalysisException.class, () -> createResourceCommand1.getInfo().validate());
 
         // test resource type
-        properties = ImmutableMap.of("host", "http://127.0.0.1:29200");
-        info = new CreateResourceInfo(false, false, "test", properties);
-        CreateResourceCommand createResourceCommand2 = new CreateResourceCommand(info);
+        final ImmutableMap<String, String> noTypeProperties = ImmutableMap.of("host", "http://127.0.0.1:29200");
+        final CreateResourceInfo noTypeInfo = new CreateResourceInfo(false, false, "test", noTypeProperties);
+        final CreateResourceCommand createResourceCommand2 = new CreateResourceCommand(noTypeInfo);
         Assertions.assertThrows(AnalysisException.class, () -> createResourceCommand2.getInfo().validate());
 
         // test unsupported resource type
-        properties = ImmutableMap.of("type", "flink", "host", "http://127.0.0.1:29200");
-        info = new CreateResourceInfo(false, false, "test", properties);
-        CreateResourceCommand createResourceCommand3 = new CreateResourceCommand(info);
+        final ImmutableMap<String, String> unsupportedTypeProperties =
+                ImmutableMap.of("type", "flink", "host", "http://127.0.0.1:29200");
+        final CreateResourceInfo unsupportedTypeInfo =
+                new CreateResourceInfo(false, false, "test", unsupportedTypeProperties);
+        final CreateResourceCommand createResourceCommand3 = new CreateResourceCommand(unsupportedTypeInfo);
         Assertions.assertThrows(AnalysisException.class, () -> createResourceCommand3.getInfo().validate());
     }
diff --git a/regression-test/pipeline/external/conf/regression-conf.groovy b/regression-test/pipeline/external/conf/regression-conf.groovy
index bd3e395c9643bf..0d8f60387520ff 100644
--- a/regression-test/pipeline/external/conf/regression-conf.groovy
+++ b/regression-test/pipeline/external/conf/regression-conf.groovy
@@ -97,6 +97,15 @@ hdfsUser = "doris-test"
 hdfsPasswd = ""
 brokerName = "broker_name"
 
+// for JuiceFS (Hadoop-compatible) regression cases
+// first step: enable case execution in the external pipeline
+enableJfsTest=true
+jfsFs = "jfs://cluster"
+jfsImpl = "io.juicefs.JuiceFileSystem"
+jfsMeta = "mysql://root:123456@(127.0.0.1:3316)/juicefs_meta"
+jfsHiveMetastoreUris = "thrift://127.0.0.1:9383"
+jfsHadoopUser = "root"
+
 // broker load test config
 enableBrokerLoad=true
diff --git a/regression-test/suites/external_table_p0/refactor_storage_param/test_jfs_hms_catalog_read.groovy b/regression-test/suites/external_table_p0/refactor_storage_param/test_jfs_hms_catalog_read.groovy
new file mode 100644
index 00000000000000..c87e0e826972fe
--- /dev/null
+++ b/regression-test/suites/external_table_p0/refactor_storage_param/test_jfs_hms_catalog_read.groovy
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_jfs_hms_catalog_read", "p0,external") {
+    String enableJfs = context.config.otherConfigs.get("enableJfsTest")
+    if (enableJfs == null || !enableJfs.equalsIgnoreCase("true")) {
+        logger.info("disable JFS test.")
+        return
+    }
+
+    String enableHive = context.config.otherConfigs.get("enableHiveTest")
+    if (enableHive == null || !enableHive.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return
+    }
+
+    String jfsFs = context.config.otherConfigs.get("jfsFs")
+    if (jfsFs == null || jfsFs.trim().isEmpty()) {
+        logger.info("skip JFS test because jfsFs is empty.")
+        return
+    }
+
+    String jfsImpl = context.config.otherConfigs.get("jfsImpl")
+    if (jfsImpl == null || jfsImpl.trim().isEmpty()) {
+        jfsImpl = "io.juicefs.JuiceFileSystem"
+    }
+    String jfsMeta = context.config.otherConfigs.get("jfsMeta")
+    if (jfsMeta == null || jfsMeta.trim().isEmpty()) {
+        throw new IllegalStateException("jfsMeta must be configured for JFS data IO regression")
+    }
+    String jfsCluster = jfsFs.replaceFirst("^jfs://", "")
+    int slashPos = jfsCluster.indexOf("/")
+    if (slashPos > 0) {
+        jfsCluster = jfsCluster.substring(0, slashPos)
+    }
+    String jfsMetaProperty = ",\n        'juicefs.${jfsCluster}.meta' = '${jfsMeta}'"
+
+    String hdfsUser = context.config.otherConfigs.get("jfsHadoopUser")
+    if (hdfsUser == null || hdfsUser.trim().isEmpty()) {
+        hdfsUser = context.config.otherConfigs.get("hdfsUser")
+    }
+    if (hdfsUser == null || hdfsUser.trim().isEmpty()) {
+        hdfsUser = "root"
+    }
+
+    String hmsUris = context.config.otherConfigs.get("jfsHiveMetastoreUris")
+    if (hmsUris == null || hmsUris.trim().isEmpty()) {
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        String hmsPort = context.config.otherConfigs.get("hive3HmsPort")
+        if (hmsPort == null || hmsPort.trim().isEmpty()) {
+            hmsPort = context.config.otherConfigs.get("hive2HmsPort")
+        }
+        if (externalEnvIp == null || externalEnvIp.trim().isEmpty()
+                || hmsPort == null || hmsPort.trim().isEmpty()) {
+            logger.info("skip JFS test because jfsHiveMetastoreUris is empty and fallback externalEnvIp/hmsPort is invalid.")
+            return
+        }
+        hmsUris = "thrift://${externalEnvIp}:${hmsPort}"
+    }
+    String catalogName = "test_jfs_hms_catalog_read"
+    String dbName = "test_jfs_hms_catalog_read_db"
+    String tableName = "test_jfs_hms_catalog_read_tbl"
+    String jfsDbBasePath = context.config.otherConfigs.get("jfsDbBasePath")
+    if (jfsDbBasePath == null || jfsDbBasePath.trim().isEmpty()) {
+        jfsDbBasePath = "${jfsFs}/doris_jfs/${hdfsUser}"
+    }
+    jfsDbBasePath = jfsDbBasePath.replaceAll('/+$', '')
+    String jfsStagingDir = context.config.otherConfigs.get("jfsStagingDir")
+    if (jfsStagingDir == null || jfsStagingDir.trim().isEmpty()) {
+        jfsStagingDir = "${jfsDbBasePath}/.doris_staging"
+    }
+    jfsStagingDir = jfsStagingDir.replaceAll('/+$', '')
+    String dbLocation = "${jfsDbBasePath}/${dbName}"
+
+    sql """drop catalog if exists ${catalogName}"""
+
+    try {
+        sql """
+            CREATE CATALOG ${catalogName} PROPERTIES (
+                'type' = 'hms',
+                'hive.metastore.uris' = '${hmsUris}',
+                'fs.defaultFS' = '${jfsFs}',
+                'fs.jfs.impl' = '${jfsImpl}',
+                'hadoop.username' = '${hdfsUser}',
+                'hive.staging_dir' = '${jfsStagingDir}'
+                ${jfsMetaProperty}
+            );
+        """
+
+        sql """switch ${catalogName}"""
+        def dbs = sql """show databases"""
+        assertTrue(dbs.size() > 0)
+
+        def hasDb = sql """show databases like '${dbName}'"""
+        if (hasDb.size() > 0) {
`${dbName}`.`${tableName}`""" + sql """drop database if exists `${dbName}`""" + } + sql """ + create database `${dbName}` + properties('location'='${dbLocation}') + """ + sql """use `${dbName}`""" + sql """ + CREATE TABLE `${tableName}` ( + `id` INT, + `name` STRING + ) ENGINE=hive + PROPERTIES ( + 'file_format'='parquet' + ) + """ + sql """insert into `${tableName}` values (1, 'jfs_1'), (2, 'jfs_2')""" + + def cnt = sql """select count(*) from `${tableName}`""" + assertEquals("2", cnt[0][0].toString()) + + def rows = sql """select * from `${tableName}` order by id""" + assertTrue(rows.size() == 2) + assertEquals("1", rows[0][0].toString()) + assertEquals("jfs_1", rows[0][1].toString()) + assertEquals("2", rows[1][0].toString()) + assertEquals("jfs_2", rows[1][1].toString()) + } finally { + sql """switch internal""" + } +} diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index 18a6f5ac4defdc..88a3cdbcfb8495 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -1967,6 +1967,15 @@ build_jindofs() { cp -r ${TP_SOURCE_DIR}/${JINDOFS_SOURCE}/* "${TP_INSTALL_DIR}/jindofs_libs/" } +# juicefs +build_juicefs() { + check_if_archive_exist "${JUICEFS_NAME}" + + rm -rf "${TP_INSTALL_DIR}/juicefs_libs/" + mkdir -p "${TP_INSTALL_DIR}/juicefs_libs/" + cp -r "${TP_SOURCE_DIR}/${JUICEFS_NAME}" "${TP_INSTALL_DIR}/juicefs_libs/" +} + # pugixml build_pugixml() { check_if_source_exist "${PUGIXML_SOURCE}" @@ -2071,6 +2080,7 @@ build_paimon_cpp() { if [[ "${#packages[@]}" -eq 0 ]]; then packages=( jindofs + juicefs odbc openssl libevent @@ -2237,6 +2247,7 @@ cleanup_package_source() { dragonbox) src_var="DRAGONBOX_SOURCE" ;; icu) src_var="ICU_SOURCE" ;; jindofs) src_var="JINDOFS_SOURCE" ;; + juicefs) src_var="JUICEFS_SOURCE" ;; pugixml) src_var="PUGIXML_SOURCE" ;; paimon_cpp) src_var="PAIMON_CPP_SOURCE" ;; aws_sdk) src_var="AWS_SDK_SOURCE" ;; diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh index c46119cfd49f30..87f4132b0ff066 100644 --- a/thirdparty/vars.sh +++ b/thirdparty/vars.sh @@ -547,6 +547,12 @@ JINDOFS_NAME=jindofs-6.10.4-libs-0.1.tar.gz JINDOFS_SOURCE=jindofs-6.10.4-libs-0.1 JINDOFS_MD5SUM="bd30b4c5fe97c4367eeb3bb228b317d9" +# juicefs +JUICEFS_DOWNLOAD="https://repo1.maven.org/maven2/io/juicefs/juicefs-hadoop/1.3.1/juicefs-hadoop-1.3.1.jar" +JUICEFS_NAME=juicefs-hadoop-1.3.1.jar +JUICEFS_SOURCE= +JUICEFS_MD5SUM="f374dfbfbdc4b83417cfea78a6728c54" + # pugixml PUGIXML_DOWNLOAD="https://github.com/zeux/pugixml/releases/download/v1.15/pugixml-1.15.tar.gz" PUGIXML_NAME=pugixml-1.15.tar.gz @@ -640,6 +646,7 @@ export TP_ARCHIVES=( 'DRAGONBOX' 'ICU' 'JINDOFS' + 'JUICEFS' 'PUGIXML' 'PAIMON_CPP' )