|
| 1 | +--- gather_azhpc_vm_diagnostics.sh.orig 2026-02-05 15:20:05.410458535 +1100 |
| 2 | ++++ gather_azhpc_vm_diagnostics.sh 2026-02-11 18:24:23.902965560 +1100 |
| 3 | +@@ -1,4 +1,7 @@ |
| 4 | + #!/bin/bash |
| 5 | ++{{ ansible_managed | comment(prefix="", postfix="") | trim }} |
| 6 | ++{{ "system_role:hpc" | comment(prefix="", postfix="") | trim }} |
| 7 | ++ |
| 8 | + # Azure HPC Diagnostics Tool |
| 9 | + # Gathers Diagnostic info from guest VM |
| 10 | + # |
| 11 | +@@ -51,16 +54,11 @@ |
| 12 | + # Copyright (c) Microsoft Corporation. |
| 13 | + # Licensed under the MIT license. |
| 14 | + |
| 15 | +- |
| 16 | +- |
| 17 | + #################################################################################################### |
| 18 | + # Begin Constants |
| 19 | + #################################################################################################### |
| 20 | + |
| 21 | + STREAM_URL='https://azhpcstor.blob.core.windows.net/diagtool-binaries/stream.tgz' |
| 22 | +-LSVMBUS_URL='https://raw.githubusercontent.com/torvalds/linux/master/tools/hv/lsvmbus' |
| 23 | +-HPC_DIAG_URL='https://raw.githubusercontent.com/Azure/azhpc-diagnostics/main/Linux/src/gather_azhpc_vm_diagnostics.sh' |
| 24 | +-SCRIPT_DIR="$( cd "$( dirname "$0" )" >/dev/null 2>&1 && pwd )" |
| 25 | + SYSFS_PATH=/sys # store as a variable so it is mockable |
| 26 | + ETC_PATH=/etc |
| 27 | + PROC_PATH=/proc |
| 28 | +@@ -74,15 +72,7 @@ |
| 29 | + CPU_LIST=(["Standard_HB120rs_v2"]="0 1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105,109,113,117" |
| 30 | + ["Standard_HB60rs"]="0 1,5,9,13,17,21,25,29,33,37,41,45,49,53,57") |
| 31 | + RELEASE_DATE=20220316 # update upon each release |
| 32 | +-COMMIT_HASH=$( |
| 33 | +- ( |
| 34 | +- command -v git >/dev/null && |
| 35 | +- cd "$SCRIPT_DIR" && |
| 36 | +- git config --get remote.origin.url | grep -q 'Azure/azhpc-diagnostics.git$' && |
| 37 | +- git rev-parse HEAD 2>/dev/null |
| 38 | +- ) || |
| 39 | +- echo 'Unknown') |
| 40 | +-VERSION_INFO="$RELEASE_DATE-$COMMIT_HASH" |
| 41 | ++VERSION_INFO="$RELEASE_DATE-unknown" |
| 42 | + |
| 43 | + HELP_MESSAGE=" |
| 44 | + Usage: $0 [OPTION] |
| 45 | +@@ -100,8 +90,8 @@ |
| 46 | + Execution Mode: |
| 47 | + --gpu-level=GPU_LEVEL dcgmi run level (default is 1) |
| 48 | + --mem-level=MEM_LEVEL set to 1 to run stream test (default is 0) |
| 49 | +- --no-update do not prompt for auto-update |
| 50 | +- --offline skips steps that require Internet access |
| 51 | ++ --no-update Does nothing, auto-update functionality has been elided. |
| 52 | ++ --online Run steps that require Internet access |
| 53 | + |
| 54 | + For more information on this script and the data it gathers, visit its Github: |
| 55 | + |
| 56 | +@@ -217,10 +207,9 @@ |
| 57 | + echo "${CPU_LIST[$1]}" |
| 58 | + } |
| 59 | + |
| 60 | ++COLUMNS=80 |
| 61 | + if tput cols >/dev/null 2>/dev/null && (( $(tput cols) < 80 )); then |
| 62 | + COLUMNS=$(tput cols) |
| 63 | +-else |
| 64 | +- COLUMNS=80 |
| 65 | + fi |
| 66 | + |
| 67 | + print_enclosed() { |
| 68 | +@@ -244,24 +233,6 @@ |
| 69 | + echo '' |
| 70 | + } |
| 71 | + |
| 72 | +-check_for_updates() { |
| 73 | +- local message="You are not running the latest release of this tool. Switch to latest version?" |
| 74 | +- |
| 75 | +- local tmpfile |
| 76 | +- tmpfile=$(mktemp) |
| 77 | +- curl -s "$HPC_DIAG_URL" >"$tmpfile" || return 1 |
| 78 | +- if ! cmp --silent "$0" "$tmpfile"; then |
| 79 | +- if prompt "$message"; then |
| 80 | +- mv "$tmpfile" "$0" |
| 81 | +- bash "$0" "$RUNTIME_OPTIONS" |
| 82 | +- exit $? |
| 83 | +- else |
| 84 | +- return 0 |
| 85 | +- fi |
| 86 | +- fi |
| 87 | +- rm "$tmpfile" |
| 88 | +-} |
| 89 | +- |
| 90 | + get_metadata() { |
| 91 | + local path="$1" |
| 92 | + curl -s -H Metadata:true "http://169.254.169.254/metadata/instance/$path?api-version=2021-03-01&format=text" |
| 93 | +@@ -876,11 +847,11 @@ |
| 94 | + print_divider |
| 95 | + print_enclosed "NOTICES:" |
| 96 | + print_divider |
| 97 | +- print_enclosed This tool generates and bundles together various logs and diagnostic information. It, however, DOES NOT TRANSMIT any of said data. It is left to the user to choose to transmit this data to Microsoft. |
| 98 | ++ print_enclosed This tool generates and bundles together various logs and diagnostic information. It, however, DOES NOT TRANSMIT any of said data. It is left to the user to choose to transmit this data to Red Hat. |
| 99 | + print_divider |
| 100 | +- print_enclosed Some of this info, such as IP addresses, may be Personally Identifiable Information. It is up to the user to redact any sensitive info from the output 'if' necessary before sending it to Microsoft. |
| 101 | ++ print_enclosed Some of this info, such as IP addresses, may be Personally Identifiable Information. It is up to the user to redact any sensitive info from the output 'if' necessary before sending it to Red Hat. |
| 102 | + print_divider |
| 103 | +- print_enclosed This tool invokes various 3rd party tools 'if' they are present on the system Please review them and their EULAs at: |
| 104 | ++ print_enclosed This tool invokes various 3rd party tools 'if' they are present on the system. Please review them and their EULAs at: |
| 105 | + print_enclosed "https://github.com/Azure/azhpc-diagnostics" |
| 106 | + print_divider |
| 107 | + print_enclosed WARNING: THINK BEFORE YOU RUN THIS |
| 108 | +@@ -1018,9 +989,6 @@ |
| 109 | + print_enclosed 'Placing diagnostic files in the following location:' |
| 110 | + print_enclosed "$DIAG_DIR.tar.gz" |
| 111 | + print_divider |
| 112 | +- print_enclosed If you have already opened a support request, you can take the tarball and follow this link to upload it: |
| 113 | +- print_enclosed 'https://portal.azure.com/#blade/Microsoft_Azure_Support/HelpAndSupportBlade/managesupportrequest' |
| 114 | +- print_divider |
| 115 | + tar czf "$DIAG_DIR.tar.gz" -C "$DIAG_DIR_LOC" "$VM_ID.$TIMESTAMP" 2>/dev/null && rm -r "$DIAG_DIR" |
| 116 | + } |
| 117 | + |
| 118 | +@@ -1030,9 +998,11 @@ |
| 119 | + |
| 120 | + GPU_LEVEL=1 |
| 121 | + MEM_LEVEL=0 |
| 122 | ++OFFLINE=true |
| 123 | + DISPLAY_HELP=false |
| 124 | +-# should be /opt/azurehpc/diagnostics |
| 125 | +-DIAG_DIR_LOC="$SCRIPT_DIR" |
| 126 | ++DISPLAY_VERSION=false |
| 127 | ++# the templated path below is expected to render as /var/hpc/azure/diagnostics |
| 128 | ++DIAG_DIR_LOC="{{ __hpc_azure_runtime_dir }}/diagnostics" |
| 129 | + |
| 130 | + # save options |
| 131 | + RUNTIME_OPTIONS=$* |
| 132 | +@@ -1063,8 +1033,8 @@ |
| 133 | + validate_run_level "$1" |
| 134 | + MEM_LEVEL="$1" |
| 135 | + ;; |
| 136 | +- --no-update) DISABLE_UPDATE=true;; |
| 137 | +- --offline) OFFLINE=true;; |
| 138 | ++ --no-update) ;; # still accepted, but a no-op: auto-update was removed |
| 139 | ++ --online) OFFLINE=false;; |
| 140 | + --tuning) TUNING=true;; |
| 141 | + -V|--version) DISPLAY_VERSION=true;; |
| 142 | + esac |
| 143 | +@@ -1081,10 +1051,6 @@ |
| 144 | + # End Option Parsing |
| 145 | + #################################################################################################### |
| 146 | + |
| 147 | +-if [ "$OFFLINE" != true ] && [ "$DISABLE_UPDATE" != true ] && ! [[ $- =~ 's' ]]; then |
| 148 | +- check_for_updates |
| 149 | +-fi |
| 150 | +- |
| 151 | + if [ ! "${BASH_SOURCE[0]}" -ef "$0" ]; then |
| 152 | + # This lets us load all functions for unit testing. |
| 153 | + # We wouldn't want people sourcing this script anyway. |
0 commit comments