Skip to content

Commit 989b445

Browse files
committed
changes from testing PR GoogleCloudDataproc#1275
1 parent 07949a9 commit 989b445

5 files changed

Lines changed: 98 additions & 42 deletions

File tree

templates/common/util_functions

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | c
77
# ( version_ge 2.0 2.1 ) evaluates to false
88
# ( version_ge 2.2 2.1 ) evaluates to true
99
# version_ge A B: succeeds when version A sorts at or after version B under
# `sort -V` (equal versions succeed).  Runs in a subshell with tracing off.
# printf '%s\n' emits the arguments verbatim; `echo -e` would re-interpret
# backslash sequences inside them and break the string comparison below.
function version_ge() ( set +x ; [ "$1" = "$(printf '%s\n' "$1" "$2" | sort -V | tail -n1)" ] ; )
10-
function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
10+
# version_gt A B: succeeds only when A is strictly newer than B.
# Identical strings fail immediately; otherwise the answer is version_ge's.
function version_gt() (
  set +x
  if [ "$1" = "$2" ] ; then
    return 1
  fi
  version_ge "$1" "$2"
)
1111
# version_le A B: succeeds when version A sorts at or before version B under
# `sort -V` (equal versions succeed).  Runs in a subshell with tracing off.
# printf '%s\n' emits the arguments verbatim; `echo -e` would re-interpret
# backslash sequences inside them and break the string comparison below.
function version_le() ( set +x ; [ "$1" = "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" ] ; )
12-
function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
12+
# version_lt A B: succeeds only when A is strictly older than B.
# Identical strings fail immediately; otherwise the answer is version_le's.
function version_lt() (
  set +x
  if [ "$1" = "$2" ] ; then
    return 1
  fi
  version_le "$1" "$2"
)
1313

1414
function define_os_comparison_functions() {
1515

@@ -500,8 +500,7 @@ function harden_sshd_config() {
500500
sshd_config_line=$(
501501
(sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
502502
ssh -Q "${ftr}" ) \
503-
| sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
504-
print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"')
503+
| sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -)
505504
grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
506505
echo "$sshd_config_line" >> /tmp/sshd_config_new
507506
# TODO: test whether sshd will reload with this change before mv

templates/gpu/install_functions

Lines changed: 63 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ function set_cuda_runfile_url() {
119119
["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
120120
["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
121121
["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
122-
["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
122+
["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
123123
)
124124

125125
# Verify that the file with the indicated combination exists
@@ -200,6 +200,7 @@ function uninstall_local_cuda_repo(){
200200
}
201201

202202
function install_local_cudnn_repo() {
203+
# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
203204
is_complete install-local-cudnn-repo && return
204205

205206
pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
@@ -368,6 +369,7 @@ function install_nvidia_nccl() {
368369
mark_complete nccl
369370
}
370371

372+
# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
371373
function install_nvidia_cudnn() {
372374
if le_debian10 ; then return ; fi
373375
is_complete cudnn && return
@@ -435,45 +437,64 @@ function install_nvidia_cudnn() {
435437
}
436438

437439
function install_pytorch() {
438-
if test -f "${workdir}/complete/pytorch" ; then return ; fi
440+
is_complete pytorch && return
441+
439442
local env
440443
env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
441444
local mc3=/opt/conda/miniconda3
442445
local envpath="${mc3}/envs/${env}"
446+
if [[ "${env}" == "base" ]]; then
447+
echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
443448
# Set numa node to 0 for all GPUs
444449
for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
445450

446-
readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
447-
case "${INCLUDE_PYTORCH^^}" in
448-
"1" | "YES" | "TRUE" )
449-
local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
450-
local local_tarball="${workdir}/${build_tarball}"
451-
local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
451+
local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
452+
local local_tarball="${workdir}/${build_tarball}"
453+
local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
452454

453-
output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
454-
if echo "${output}" | grep -q "${gcs_tarball}" ; then
455-
# cache hit - unpack from cache
456-
echo "cache hit"
457-
mkdir -p "${envpath}"
458-
gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
459-
else
460-
local verb=create
461-
if test -d "${envpath}" ; then verb=install ; fi
462-
cudart_spec="cuda-cudart"
463-
if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
464-
"${mc3}/bin/mamba" "${verb}" -n "${env}" \
465-
-c conda-forge -c nvidia -c rapidsai \
466-
numba pytorch tensorflow[and-cuda] rapids pyspark \
467-
"cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
468-
pushd "${envpath}"
469-
tar czf "${local_tarball}" .
470-
popd
471-
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
472-
fi
473-
;;
474-
* ) echo "skip pytorch install" ;;
475-
esac
476-
touch "${workdir}/complete/pytorch"
455+
# -lt compares integers; "<" inside [[ ]] compares strings lexicographically,
# so core counts such as 4 or 8 would not register as fewer than 32 (and 128 would).
if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
456+
# do not build in tests with < 32 cores
457+
sleep $(( ( RANDOM % 11 ) + 10 ))
458+
while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
459+
sleep 5m
460+
done
461+
fi
462+
463+
output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
464+
if echo "${output}" | grep -q "${gcs_tarball}" ; then
465+
# cache hit - unpack from cache
466+
echo "cache hit"
467+
mkdir -p "${envpath}"
468+
gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
469+
else
470+
touch "${local_tarball}.building"
471+
gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
472+
local verb=create
473+
if test -d "${envpath}" ; then verb=install ; fi
474+
cudart_spec="cuda-cudart"
475+
if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
476+
477+
# Install pytorch and company to this environment
478+
"${mc3}/bin/mamba" "${verb}" -n "${env}" \
479+
-c conda-forge -c nvidia -c rapidsai \
480+
numba pytorch tensorflow[and-cuda] rapids pyspark \
481+
"cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
482+
483+
# Install jupyter kernel in this environment
484+
"${envpath}/bin/python3" -m pip install ipykernel
485+
486+
# package environment and cache in GCS
487+
pushd "${envpath}"
488+
tar czf "${local_tarball}" .
489+
popd
490+
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
491+
if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
492+
fi
493+
494+
# register the environment as a selectable kernel
495+
"${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"
496+
497+
mark_complete pytorch
477498
}
478499

479500
function add_nonfree_components() {
@@ -508,7 +529,16 @@ function add_repo_nvidia_container_toolkit() {
508529

509530
function add_repo_cuda() {
510531
if is_debuntu ; then
511-
install_cuda_keyring_pkg # 11.7+, 12.0+
532+
if version_le "${CUDA_VERSION}" 11.6 ; then
533+
local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
534+
local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
535+
echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
536+
| sudo tee "${sources_list_path}"
537+
curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
538+
-o "${kr_path}"
539+
else
540+
install_cuda_keyring_pkg # 11.7+, 12.0+
541+
fi
512542
elif is_rocky ; then
513543
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
514544
fi

templates/gpu/spark_functions

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@ function download_spark_jar() {
77

88
function install_spark_rapids() {
99
# Update SPARK RAPIDS config
10-
local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
10+
local DEFAULT_SPARK_RAPIDS_VERSION
11+
DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
1112
local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
1213

1314
# https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
1415
local -r scala_ver="2.12"
1516

1617
if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
17-
local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
18+
DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
1819
fi
1920

2021
readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})

templates/gpu/util_functions

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ function set_cuda_version() {
6060
case "${DATAPROC_IMAGE_VERSION}" in
6161
"2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
6262
"2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
63-
"2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
63+
"2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
6464
* )
6565
echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
6666
exit 1

templates/gpu/yarn_functions

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,25 @@ function configure_yarn_gpu_resources() {
1212
'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
1313

1414
set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
15+
16+
# Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below
17+
if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
18+
fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml"
19+
set_hadoop_property 'yarn-site.xml' \
20+
'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
21+
set_hadoop_property 'yarn-site.xml' \
22+
"yarn.scheduler.fair.user-as-default-queue" "false"
23+
set_hadoop_property 'yarn-site.xml' \
24+
"yarn.scheduler.fair.allocation.file" "${fs_xml}"
25+
set_hadoop_property 'yarn-site.xml' \
26+
'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
27+
cat > "${fs_xml}" <<EOF
28+
<!-- ${fs_xml} -->
29+
<allocations>
30+
<queueMaxAppsDefault>1</queueMaxAppsDefault>
31+
</allocations>
32+
EOF
33+
fi
1534
}
1635

1736
function configure_gpu_script() {
@@ -44,9 +63,15 @@ function configure_gpu_script() {
4463
#
4564
# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
4665

66+
set -e
67+
resources_json="/dev/shm/nvidia/gpusResources.json"
68+
if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi
69+
70+
mkdir -p "$(dirname ${resources_json})"
71+
4772
ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
4873

49-
echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
74+
echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
5075
EOF
5176

5277
chmod a+rx "${gpus_resources_script}"
@@ -78,14 +103,14 @@ EOF
78103
# having AQE enabled gives user the best performance.
79104
spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
80105
spark.executor.resource.gpu.amount=${gpu_count}
81-
spark.plugins=com.nvidia.spark.SQLPlugin
82106
spark.executor.cores=${executor_cores}
83107
spark.executor.memory=${executor_memory_gb}G
84108
spark.dynamicAllocation.enabled=false
85109
# please update this config according to your application
86110
spark.task.resource.gpu.amount=${gpu_amount}
87111
spark.task.cpus=2
88112
spark.yarn.unmanagedAM.enabled=false
113+
spark.plugins=com.nvidia.spark.SQLPlugin
89114
###### END : RAPIDS properties for Spark ${SPARK_VERSION} ######
90115
EOF
91116
}
@@ -97,6 +122,7 @@ function configure_yarn_nodemanager_gpu() {
97122
'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
98123
set_hadoop_property 'yarn-site.xml' \
99124
'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
125+
100126
configure_yarn_nodemanager
101127
}
102128

0 commit comments

Comments
 (0)