@@ -119,7 +119,7 @@ function set_cuda_runfile_url() {
119119 ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
120120 ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
121121 ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
122- ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
122+ ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
123123 )
124124
125125 # Verify that the file with the indicated combination exists
@@ -200,6 +200,7 @@ function uninstall_local_cuda_repo(){
200200}
201201
202202function install_local_cudnn_repo() {
203+ # https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
203204 is_complete install-local-cudnn-repo && return
204205
205206 pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
@@ -368,6 +369,7 @@ function install_nvidia_nccl() {
368369 mark_complete nccl
369370}
370371
372+ # https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
371373function install_nvidia_cudnn() {
372374 if le_debian10 ; then return ; fi
373375 is_complete cudnn && return
@@ -435,45 +437,64 @@ function install_nvidia_cudnn() {
435437}
436438
437439function install_pytorch() {
438- if test -f "${workdir}/complete/pytorch" ; then return ; fi
440+ is_complete pytorch && return
441+
439442 local env
440443 env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
441444 local mc3=/opt/conda/miniconda3
442445 local envpath="${mc3}/envs/${env}"
446+ if [[ "${env}" == "base" ]]; then
447+ echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
443448 # Set numa node to 0 for all GPUs
444449 for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
445450
446- readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
447- case "${INCLUDE_PYTORCH^^}" in
448- "1" | "YES" | "TRUE" )
449- local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
450- local local_tarball="${workdir}/${build_tarball}"
451- local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
451+ local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
452+ local local_tarball="${workdir}/${build_tarball}"
453+ local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
452454
453- output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
454- if echo "${output}" | grep -q "${gcs_tarball}" ; then
455- # cache hit - unpack from cache
456- echo "cache hit"
457- mkdir -p "${envpath}"
458- gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
459- else
460- local verb=create
461- if test -d "${envpath}" ; then verb=install ; fi
462- cudart_spec="cuda-cudart"
463- if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
464- "${mc3}/bin/mamba" "${verb}" -n "${env}" \
465- -c conda-forge -c nvidia -c rapidsai \
466- numba pytorch tensorflow[and-cuda] rapids pyspark \
467- "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
468- pushd "${envpath}"
469- tar czf "${local_tarball}" .
470- popd
471- gcloud storage cp "${local_tarball}" "${gcs_tarball}"
472- fi
473- ;;
474- * ) echo "skip pytorch install" ;;
475- esac
476- touch "${workdir}/complete/pytorch"
455+  if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
456+ # do not build in tests with < 32 cores
457+ sleep $(( ( RANDOM % 11 ) + 10 ))
458+ while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
459+ sleep 5m
460+ done
461+ fi
462+
463+ output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
464+ if echo "${output}" | grep -q "${gcs_tarball}" ; then
465+ # cache hit - unpack from cache
466+ echo "cache hit"
467+ mkdir -p "${envpath}"
468+ gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
469+ else
470+ touch "${local_tarball}.building"
471+ gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
472+ local verb=create
473+ if test -d "${envpath}" ; then verb=install ; fi
474+ cudart_spec="cuda-cudart"
475+ if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
476+
477+ # Install pytorch and company to this environment
478+ "${mc3}/bin/mamba" "${verb}" -n "${env}" \
479+ -c conda-forge -c nvidia -c rapidsai \
480+ numba pytorch tensorflow[and-cuda] rapids pyspark \
481+ "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
482+
483+ # Install jupyter kernel in this environment
484+ "${envpath}/bin/python3" -m pip install ipykernel
485+
486+ # package environment and cache in GCS
487+ pushd "${envpath}"
488+ tar czf "${local_tarball}" .
489+ popd
490+ gcloud storage cp "${local_tarball}" "${gcs_tarball}"
491+    if gcloud storage ls "${gcs_tarball}.building" > /dev/null 2>&1 ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
492+ fi
493+
494+ # register the environment as a selectable kernel
495+ "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"
496+
497+ mark_complete pytorch
477498}
478499
479500function add_nonfree_components() {
@@ -508,7 +529,16 @@ function add_repo_nvidia_container_toolkit() {
508529
509530function add_repo_cuda() {
510531 if is_debuntu ; then
511- install_cuda_keyring_pkg # 11.7+, 12.0+
532+ if version_le "${CUDA_VERSION}" 11.6 ; then
533+ local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
534+ local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
535+ echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
536+ | sudo tee "${sources_list_path}"
537+      curl -fsSL "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
538+ -o "${kr_path}"
539+ else
540+ install_cuda_keyring_pkg # 11.7+, 12.0+
541+ fi
512542 elif is_rocky ; then
513543 execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
514544 fi
0 commit comments