52 changes: 52 additions & 0 deletions aci-preupgrade-validation-script.py
@@ -5962,6 +5962,57 @@ def configpush_shard_check(tversion, **kwargs):

return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)


@check_wrapper(check_title='/tmp directory disk space for snapshot storage during upgrade')
def tmp_dir_snapshot_storage_check(tversion, **kwargs):
    result = FAIL_UF
    headers = ['Fault', 'Pod', 'Node', 'Mount Point', 'Current Usage %', 'Recommended Action']
    data = []
    unformatted_headers = ['Fault', 'Fault DN', 'Recommended Action']
    unformatted_data = []
    recommended_action = 'Contact Cisco TAC for assistance. The /tmp directory may need cleanup or the upgrade may require special handling.'
    doc_url = 'https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#tmp-directory-disk-space-for-snapshot-storage'

    if not tversion:
        return Result(result=MANUAL, msg=TVER_MISSING)

    if tversion.older_than("6.1(4a)"):
        dn_regex = node_regex + r'/.+p-\[(?P<mountpoint>.+)\]-f'
        desc_regex = r'is (?P<usage>\d{2,3}%) full'

        # Query for F1527, F1528, or F1529 disk usage faults
        faultInsts = icurl('class',
                           'faultInst.json?query-target-filter=or(eq(faultInst.code,"F1527"),eq(faultInst.code,"F1528"),eq(faultInst.code,"F1529"))')

        for faultInst in faultInsts:
            fc = faultInst['faultInst']['attributes']['code']
            dn = re.search(dn_regex, faultInst['faultInst']['attributes']['dn'])
            desc = re.search(desc_regex, faultInst['faultInst']['attributes']['descr'])

            # Only flag /tmp directory issues for this check
            if dn and desc and dn.group('mountpoint') == '/tmp':
                data.append([fc, dn.group('pod'), dn.group('node'), dn.group('mountpoint'),
                             desc.group('usage'), recommended_action])
            elif dn and dn.group('mountpoint') == '/tmp':
                # If the DN parses but the description does not, still report the fault
                unformatted_data.append([fc, faultInst['faultInst']['attributes']['dn'], recommended_action])

        if not data and not unformatted_data:
            result = PASS
    else:
        result = NA

    return Result(
        result=result,
        headers=headers,
        data=data,
        unformatted_headers=unformatted_headers,
        unformatted_data=unformatted_data,
        recommended_action=recommended_action,
        doc_url=doc_url,
    )


# ---- Script Execution ----


@@ -6069,6 +6120,7 @@ class CheckManager:
scalability_faults_check,
fabric_port_down_check,
equipment_disk_limits_exceeded,
tmp_dir_snapshot_storage_check,

# Configurations
vpc_paired_switches_check,
37 changes: 36 additions & 1 deletion docs/docs/validations.md
@@ -190,7 +190,8 @@ Items | Defect | This Script
[Observer Database Size][d25] | CSCvw45531 | :white_check_mark: | :no_entry_sign:
[Stale pconsRA Object][d26] | CSCwp22212 | :warning:{title="Deprecated"} | :no_entry_sign:
[ISIS DTEPs Byte Size][d27] | CSCwp15375 | :white_check_mark: | :no_entry_sign:
[Policydist configpushShardCont Crash][d28] | CSCwp95515 | :white_check_mark: |
[Policydist configpushShardCont Crash][d28] | CSCwp95515 | :white_check_mark: | :no_entry_sign:
[/tmp Directory Disk Space for Snapshot Storage][d29] | CSCwo96334 | :white_check_mark: | :no_entry_sign:

[d1]: #ep-announce-compatibility
[d2]: #eventmgr-db-size-defect-susceptibility
@@ -220,6 +221,7 @@ Items | Defect | This Script
[d26]: #stale-pconsra-object
[d27]: #isis-dteps-byte-size
[d28]: #policydist-configpushshardcont-crash
[d29]: #tmp-directory-disk-space-for-snapshot-storage


## General Check Details
@@ -2604,6 +2606,38 @@ Due to [CSCwp95515][59], upgrading to an affected version while having any `conf
If any instances of `configpushShardCont` are flagged by this script, Cisco TAC must be contacted to identify and resolve the underlying issue before performing the upgrade.


### /tmp Directory Disk Space for Snapshot Storage

In ACI versions prior to 6.1(4), the APIC stores database snapshots in the `/tmp` directory during the upgrade process. If `/tmp` has insufficient free space (typically indicated by disk space faults F1527, F1528, or F1529), the upgrade may fail because the required snapshot files cannot be created.

Due to [CSCwo96334][62], starting from ACI version 6.1(4), snapshots are stored in the `/data` directory instead of `/tmp`, which provides more available disk space and resolves this issue.
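
The gate on the target version is a strict comparison against 6.1(4a). Below is a minimal sketch of that logic using the script's own `AciVersion` class; the `importlib` import path mirrors the test suite, and it assumes the script file is on `sys.path`:

```python
import importlib

# The script filename contains dashes, so a plain `import` cannot load it
script = importlib.import_module("aci-preupgrade-validation-script")

# Affected target version: the check queries the faults and may FAIL
print(script.AciVersion("6.1(3z)").older_than("6.1(4a)"))  # True

# Fixed target version: the check short-circuits to NA
print(script.AciVersion("6.1(4a)").older_than("6.1(4a)"))  # False
```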

This check monitors `/tmp` directory utilization on the APICs by querying for the following faults, which are parsed as shown in the sketch after this list:

- **F1527** (Minor): Storage unit is 75-84% full
- **F1528** (Major): Storage unit is 85-89% full
- **F1529** (Critical): Storage unit is 90-100% full
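
For illustration, here is a standalone sketch of the parsing step the check applies to each fault record. The two regular expressions are the ones used by the check; `node_regex` is inlined here on the assumption that it captures `pod` and `node` as named groups, and the sample attributes are copied from the test data:

```python
import re

# node_regex is inlined for a self-contained example (assumed value)
node_regex = r'topology/pod-(?P<pod>\d+)/node-(?P<node>\d+)'
dn_regex = node_regex + r'/.+p-\[(?P<mountpoint>.+)\]-f'
desc_regex = r'is (?P<usage>\d{2,3}%) full'

# Sample faultInst attributes, copied from faultInst_tmp_pos.json
attrs = {
    "code": "F1527",
    "descr": "Storage unit /tmp on Node 1 of pod 1 is 80% full",
    "dn": "topology/pod-1/node-1/sys/ch/p-[/tmp]-fault-F1527",
}

dn = re.search(dn_regex, attrs["dn"])
desc = re.search(desc_regex, attrs["descr"])
# Only /tmp faults are flagged by this check
if dn and desc and dn.group('mountpoint') == '/tmp':
    print(dn.group('pod'), dn.group('node'), desc.group('usage'))  # 1 1 80%
```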

**Impact:**

If `/tmp` is at or above 75% utilization when upgrading to versions prior to 6.1(4), the upgrade may fail when attempting to create database snapshots. This can result in:

- Upgrade workflow failure
- Inability to complete APIC database conversion
- Potential need for manual cleanup and upgrade retry

**Recommended Action:**

If this check flags high `/tmp` utilization:

1. Contact Cisco TAC for assistance before proceeding with the upgrade
2. Work with TAC to identify and remove unnecessary files from `/tmp`
3. Consider upgrading to ACI 6.1(4) or later, where snapshots use the `/data` directory instead
4. Ensure at least 25-30% free space in `/tmp` before attempting an upgrade to a pre-6.1(4) version (a quick way to measure this is sketched after this list)
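
To gauge how close `/tmp` is to these thresholds on a given node, here is a quick standard-library sketch, run locally on the node being checked (or any Linux host); the 75% figure mirrors the F1527 minor threshold:

```python
import shutil

# Compare /tmp utilization against the F1527 minor-fault threshold (75%)
usage = shutil.disk_usage("/tmp")
pct_used = 100.0 * (usage.total - usage.free) / usage.total
print("/tmp is %.0f%% full" % pct_used)
if pct_used >= 75:
    print("At or above the F1527 threshold; clean up before a pre-6.1(4) upgrade")
```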

**Note:** This check only applies when upgrading to versions older than 6.1(4). For upgrades to 6.1(4) or later, this check returns N/A as the issue is resolved in those versions.


[0]: https://github.com/datacenter/ACI-Pre-Upgrade-Validation-Script
[1]: https://www.cisco.com/c/dam/en/us/td/docs/Website/datacenter/apicmatrix/index.html
[2]: https://www.cisco.com/c/en/us/support/switches/nexus-9000-series-switches/products-release-notes-list.html
@@ -2664,5 +2698,6 @@
[57]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp22212
[58]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp15375
[59]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp95515
[60]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#Inter
[61]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#EnablePolicyCompression
[62]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwo96334
32 changes: 32 additions & 0 deletions tests/checks/tmp_dir_snapshot_storage_check/faultInst_mixed.json
@@ -0,0 +1,32 @@
[
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1529",
                "descr": "Storage unit /tmp on Node 1 of pod 1 is 92% full",
                "dn": "topology/pod-1/node-1/sys/ch/p-[/tmp]-fault-F1529"
            }
        }
    },
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1527",
                "descr": "Storage unit /firmware on Node 2 of pod 1 is 76% full",
                "dn": "topology/pod-1/node-2/sys/ch/p-[/firmware]-fault-F1527"
            }
        }
    },
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1528",
                "descr": "Storage unit /techsupport on Node 3 of pod 1 is 85% full",
                "dn": "topology/pod-1/node-3/sys/ch/p-[/techsupport]-fault-F1528"
            }
        }
    }
]
22 changes: 22 additions & 0 deletions tests/checks/tmp_dir_snapshot_storage_check/faultInst_non_tmp.json
@@ -0,0 +1,22 @@
[
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1527",
                "descr": "Storage unit /firmware on Node 1 of pod 1 is 78% full",
                "dn": "topology/pod-1/node-1/sys/ch/p-[/firmware]-fault-F1527"
            }
        }
    },
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1528",
                "descr": "Storage unit /techsupport on Node 2 of pod 1 is 88% full",
                "dn": "topology/pod-1/node-2/sys/ch/p-[/techsupport]-fault-F1528"
            }
        }
    }
]
32 changes: 32 additions & 0 deletions tests/checks/tmp_dir_snapshot_storage_check/faultInst_tmp_pos.json
@@ -0,0 +1,32 @@
[
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1527",
                "descr": "Storage unit /tmp on Node 1 of pod 1 is 80% full",
                "dn": "topology/pod-1/node-1/sys/ch/p-[/tmp]-fault-F1527"
            }
        }
    },
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1528",
                "descr": "Storage unit /tmp on Node 2 of pod 1 is 87% full",
                "dn": "topology/pod-1/node-2/sys/ch/p-[/tmp]-fault-F1528"
            }
        }
    },
    {
        "faultInst": {
            "attributes": {
                "cause": "threshold-crossed",
                "code": "F1529",
                "descr": "Storage unit /tmp on Node 3 of pod 1 is 95% full",
                "dn": "topology/pod-1/node-3/sys/ch/p-[/tmp]-fault-F1529"
            }
        }
    }
]
137 changes: 137 additions & 0 deletions tests/checks/tmp_dir_snapshot_storage_check/test_tmp_dir_snapshot_storage_check.py
@@ -0,0 +1,137 @@
import os
import pytest
import logging
import importlib
from helpers.utils import read_data

script = importlib.import_module("aci-preupgrade-validation-script")

log = logging.getLogger(__name__)
dir = os.path.dirname(os.path.abspath(__file__))

test_function = "tmp_dir_snapshot_storage_check"

# icurl queries
faultInst_api = 'faultInst.json?query-target-filter=or(eq(faultInst.code,"F1527"),eq(faultInst.code,"F1528"),eq(faultInst.code,"F1529"))'


@pytest.mark.parametrize(
    "icurl_outputs, tversion, expected_result",
    [
        # ===== AFFECTED VERSIONS (< 6.1(4a)) =====
        # Older 4.x version, no /tmp faults
        (
            {faultInst_api: []},
            "4.2(7f)",
            script.PASS,
        ),
        # 5.x version, no /tmp faults
        (
            {faultInst_api: []},
            "5.2(8f)",
            script.PASS,
        ),
        # 6.0.x version, no /tmp faults
        (
            {faultInst_api: []},
            "6.0(5a)",
            script.PASS,
        ),
        # Just before fix version 6.1(3z), no /tmp faults
        (
            {faultInst_api: []},
            "6.1(3z)",
            script.PASS,
        ),
        # 4.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "4.2(7t)",
            script.FAIL_UF,
        ),
        # 5.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "5.2(8f)",
            script.FAIL_UF,
        ),
        # 6.0.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.0(2h)",
            script.FAIL_UF,
        ),
        # Just before fix version 6.1(3z) with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.1(3z)",
            script.FAIL_UF,
        ),
        # Affected version with only non-/tmp faults (should PASS)
        (
            {faultInst_api: read_data(dir, "faultInst_non_tmp.json")},
            "5.2(6a)",
            script.PASS,
        ),
        # Affected version with mixed /tmp and non-/tmp faults (should FAIL_UF)
        (
            {faultInst_api: read_data(dir, "faultInst_mixed.json")},
            "6.0(3a)",
            script.FAIL_UF,
        ),
        # 3.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "3.2(10e)",
            script.FAIL_UF,
        ),
        # 4.x version with only non-/tmp faults (should PASS)
        (
            {faultInst_api: read_data(dir, "faultInst_non_tmp.json")},
            "4.2(7f)",
            script.PASS,
        ),
        # 6.0.x version with mixed faults
        (
            {faultInst_api: read_data(dir, "faultInst_mixed.json")},
            "6.0(5h)",
            script.FAIL_UF,
        ),
        # ===== FIXED VERSIONS (>= 6.1(4a)) =====
        # Exact fix version 6.1(4a) with /tmp faults (should be NA)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.1(4a)",
            script.NA,
        ),
        # Exact fix version 6.1(4a) without faults (should be NA)
        (
            {faultInst_api: []},
            "6.1(4a)",
            script.NA,
        ),
        # Later 6.1.x version with /tmp faults (should be NA)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.1(5a)",
            script.NA,
        ),
        # 6.2.x version with /tmp faults (should be NA)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.2(1a)",
            script.NA,
        ),
        # Future 7.x version with /tmp faults (should be NA)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "7.0(1a)",
            script.NA,
        ),
    ],
)
def test_logic(run_check, mock_icurl, tversion, expected_result):
    result = run_check(
        tversion=script.AciVersion(tversion) if tversion else None,
    )
    assert result.result == expected_result