changes from testing PR GoogleCloudDataproc#1275

cjac · cjac · commit 989b445b20a2 · 2025-01-28T17:39:48.000-08:00
diff --git a/templates/common/util_functions b/templates/common/util_functions
@@ -7,9 +7,9 @@ function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | c
 # ( version_ge 2.0 2.1 ) evaluates to false
 # ( version_ge 2.2 2.1 ) evaluates to true
 function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
-function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
+function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge "$1" "$2" ; ) # false when "$1" equals "$2"; otherwise the result of version_ge
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
-function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
+function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le "$1" "$2" ; ) # false when "$1" equals "$2"; otherwise the result of version_le
 
 function define_os_comparison_functions() {
 
@@ -500,8 +500,7 @@ function harden_sshd_config() {
     sshd_config_line=$(
       (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
        ssh -Q "${ftr}" ) \
-      | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
-      print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"')
+      | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," - | sed -e "/./s/^/${feature} /")
     grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
     echo "$sshd_config_line" >> /tmp/sshd_config_new
     # TODO: test whether sshd will reload with this change before mv
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
@@ -119,7 +119,7 @@ function set_cuda_runfile_url() {
           ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
           ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
           ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
-          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
   )
 
   # Verify that the file with the indicated combination exists
@@ -200,6 +200,7 @@ function uninstall_local_cuda_repo(){
 }
 
 function install_local_cudnn_repo() {
+  # https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
   is_complete install-local-cudnn-repo && return
 
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
@@ -368,6 +369,7 @@ function install_nvidia_nccl() {
   mark_complete nccl
 }
 
+# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
 function install_nvidia_cudnn() {
   if le_debian10 ; then return ; fi
   is_complete cudnn && return
@@ -435,45 +437,64 @@ function install_nvidia_cudnn() {
 }
 
 function install_pytorch() {
-  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  is_complete pytorch && return
+
   local env
   env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
   local mc3=/opt/conda/miniconda3
   local envpath="${mc3}/envs/${env}"
+  if [[ "${env}" == "base" ]]; then
+    echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
   # Set numa node to 0 for all GPUs
   for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
 
-  readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
-  case "${INCLUDE_PYTORCH^^}" in
-    "1" | "YES" | "TRUE" )
-      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
-      local local_tarball="${workdir}/${build_tarball}"
-      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+  local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+  local local_tarball="${workdir}/${build_tarball}"
+  local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
 
-      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
-      if echo "${output}" | grep -q "${gcs_tarball}" ; then
-        # cache hit - unpack from cache
-        echo "cache hit"
-        mkdir -p "${envpath}"
-        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
-      else
-        local verb=create
-        if test -d "${envpath}" ; then verb=install ; fi
-        cudart_spec="cuda-cudart"
-        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
-        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
-          -c conda-forge -c nvidia -c rapidsai \
-          numba pytorch tensorflow[and-cuda] rapids pyspark \
-          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
-        pushd "${envpath}"
-        tar czf "${local_tarball}" .
-        popd
-        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      fi
-      ;;
-    * ) echo "skip pytorch install" ;;
-  esac
-  touch "${workdir}/complete/pytorch"
+  if [[ "$(hostname -s)" =~ ^test ]] && (( $(nproc) < 32 )) ; then
+    # test nodes with < 32 cores (numeric compare) wait while a .building marker exists in GCS
+    sleep $(( ( RANDOM % 11 ) + 10 ))
+    while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
+      sleep 5m
+    done
+  fi
+
+  output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+  if echo "${output}" | grep -q "${gcs_tarball}" ; then
+    # cache hit - unpack from cache
+    echo "cache hit"
+    mkdir -p "${envpath}"
+    gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+  else
+    touch "${local_tarball}.building"
+    gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+    local verb=create
+    if test -d "${envpath}" ; then verb=install ; fi
+    cudart_spec="cuda-cudart"
+    if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+
+    # Install pytorch and company to this environment
+    "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+      -c conda-forge -c nvidia -c rapidsai \
+      numba pytorch "tensorflow[and-cuda]" rapids pyspark \
+      "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+
+    # Install jupyter kernel in this environment
+    "${envpath}/bin/python3" -m pip install ipykernel
+
+    # package environment and cache in GCS
+    pushd "${envpath}"
+    tar czf "${local_tarball}" .
+    popd
+    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+  fi
+
+  # register the environment as a selectable kernel
+  "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"
+
+  mark_complete pytorch
 }
 
 function add_nonfree_components() {
@@ -508,7 +529,16 @@ function add_repo_nvidia_container_toolkit() {
 
 function add_repo_cuda() {
   if is_debuntu ; then
-    install_cuda_keyring_pkg # 11.7+, 12.0+
+    if version_le "${CUDA_VERSION}" 11.6 ; then
+      local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
+      local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
+      echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
+      | sudo tee "${sources_list_path}"
+      curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
+        -o "${kr_path}"
+    else
+      install_cuda_keyring_pkg # 11.7+, 12.0+
+    fi
   elif is_rocky ; then
     execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
   fi
diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions
@@ -7,14 +7,15 @@ function download_spark_jar() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_SPARK_RAPIDS_VERSION
+  DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
   local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
   if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
@@ -60,7 +60,7 @@ function set_cuda_version() {
   case "${DATAPROC_IMAGE_VERSION}" in
     "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
     "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
-    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
     *   )
       echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
       exit 1
diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions
@@ -12,6 +12,25 @@ function configure_yarn_gpu_resources() {
     'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
 
   set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  # Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below
+  if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
+    fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml"
+    set_hadoop_property 'yarn-site.xml' \
+      'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
+    set_hadoop_property 'yarn-site.xml' \
+      "yarn.scheduler.fair.user-as-default-queue" "false"
+    set_hadoop_property 'yarn-site.xml' \
+      "yarn.scheduler.fair.allocation.file" "${fs_xml}"
+    set_hadoop_property 'yarn-site.xml' \
+      'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+    cat > "${fs_xml}" <<EOF
+<!-- ${fs_xml} -->
+<allocations>
+  <queueMaxAppsDefault>1</queueMaxAppsDefault>
+</allocations>
+EOF
+  fi
 }
 
 function configure_gpu_script() {
@@ -44,9 +63,15 @@ function configure_gpu_script() {
 #
 # Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
+set -e
+resources_json="/dev/shm/nvidia/gpusResources.json"
+if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi
+
+mkdir -p "$(dirname ${resources_json})"
+
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
 EOF
 
   chmod a+rx "${gpus_resources_script}"
@@ -78,14 +103,14 @@ EOF
 # having AQE enabled gives user the best performance.
 spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.resource.gpu.amount=${gpu_count}
-spark.plugins=com.nvidia.spark.SQLPlugin
 spark.executor.cores=${executor_cores}
 spark.executor.memory=${executor_memory_gb}G
 spark.dynamicAllocation.enabled=false
 # please update this config according to your application
 spark.task.resource.gpu.amount=${gpu_amount}
 spark.task.cpus=2
 spark.yarn.unmanagedAM.enabled=false
+spark.plugins=com.nvidia.spark.SQLPlugin
 ###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
 EOF
 }
@@ -97,6 +122,7 @@ function configure_yarn_nodemanager_gpu() {
     'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+
   configure_yarn_nodemanager
 }