From a7facb919bf4d534f7c877c0de0c9e6fdf3aee28 Mon Sep 17 00:00:00 2001 From: Ivan Pepelnjak Date: Thu, 29 Jan 2026 11:43:52 +0100 Subject: [PATCH] Integrate internal SSH readiness checks with Ansible checks This commit refactors the readiness checks to integrate the internal SSH readiness checks with the Ansible checks (for example, the check for the first Junos interface): Data structure changes: * The devices that require readiness checks MUST have the netlab_ready group variable, which should include the value 'ansible' for devices with Ansible checks. This commit modifies device definitions for all devices using readiness checks * The 'ansible' output module creates netlab_ready_ansible and netlab_ready_ssh groups netlab initial changes: * The 'ready' module got its own run function which is invoked with the args.ready option * The 'ready' module first collects the nodes based on their wait-for-ready requirements, executes internal readiness checks, and starts the Ansible 'device-ready' playbook if needed * The internal readiness checks can be disabled in topology defaults (defaults.netlab.initial.ready._check_ variable) * deploy.run function calls ready.run function as one of the first steps * deploy.run and ready.run functions use log.section_header for improved logging functionality Ansible-related changes: * The 'device-ready.ansible' playbook is split into two plays (SSH readiness and Ansible checks) * The new ansible groups are used in the 'device-ready.ansible' playbook to limit the hosts involved in each play * The 'wait-for-ready' task list is no longer included in the initial-config.ansible playbook -- the readiness check is performed solely in the device-ready.ansible playbook * The 'wait-for-ready' task list no longer performs the generic readiness checks (ssh was the only generic check). 
The generic checks are performed as plays in the 'device-ready.ansible' playbook --- docs/netlab/initial.md | 11 ++++-- netsim/ansible/device-ready.ansible | 13 ++++++- netsim/ansible/initial-config.ansible | 1 - netsim/ansible/tasks/wait-for-ready.yml | 11 ------ netsim/cli/initial/__init__.py | 9 +---- netsim/cli/initial/deploy.py | 25 ++++++------ netsim/cli/initial/ready.py | 51 +++++++++++++++++++------ netsim/cli/initial/utils.py | 22 ++++++++--- netsim/defaults/netlab.yml | 4 ++ netsim/devices/cumulus_nvue.yml | 1 + netsim/devices/fortios.yml | 1 + netsim/devices/junos.yml | 5 +++ netsim/devices/netscaler.yml | 3 +- netsim/devices/nxos.yml | 3 +- netsim/devices/sros.yml | 1 + netsim/devices/vjunos-router.yml | 1 - netsim/devices/vjunos-switch.yml | 1 - netsim/devices/vmx.yml | 2 - netsim/devices/vptx.yml | 1 - netsim/devices/vsrx.yml | 1 - netsim/devices/vyos.yml | 1 + netsim/outputs/ansible.py | 6 ++- 22 files changed, 111 insertions(+), 63 deletions(-) diff --git a/docs/netlab/initial.md b/docs/netlab/initial.md index eb0aa1651f..9f176afa51 100644 --- a/docs/netlab/initial.md +++ b/docs/netlab/initial.md @@ -71,13 +71,18 @@ All other arguments are passed directly to ansible-playbook ## Wait for Devices to Become Ready -Some devices are not ready immediately after they complete the boot process. For example, Cisco Nexus OS or Juniper vPTX need another minute to realize they have data-plane interfaces. +**netlab initial** starts with a device readiness check to ensure the lab devices are ready for configuration deployment. If you want to execute just this part of the process, use the `--ready` option. + +There are several reasons a device might not be ready when the virtualization providers finish their job: -Likewise, the virtualization provider might prematurely report that the devices are ready. 
For example, *containerlab* does not wait for VMs running in containers to complete their boot process (see [](clab-vrnetlab) for more details).[^vssh] +* A virtualization provider might prematurely report that the devices are ready. For example, *containerlab* does not wait for VMs running in containers to complete their boot process (see [](clab-vrnetlab) for more details).[^vssh] _netlab_ checks the reachability of SSH servers for all containers that are configured via SSH. +* Some devices are not ready even after their SSH servers start accepting incoming sessions. For example, Cisco Nexus OS or Juniper vPTX requires around a minute to detect data-plane interfaces. In such cases, _netlab_ uses a device-specific Ansible task list to verify that the devices are ready for configuration. [^vssh]: Vagrant waits for all devices to become reachable via SSH before reporting them ready. -**netlab initial** starts with a device readiness check to ensure the lab devices are ready for configuration deployment. If you want to execute just this part of the process, use the `--ready` option. +```{tip} +_netlab_ uses internal (Python) code to check the reachability of SSH servers. If you want to check the SSH servers from an Ansible playbook, set the **‌defaults.netlab.initial.ready.ssh** [topology default](topo-defaults) to **‌ansible** (preferably using a [user defaults file](defaults-user-file)). 
+``` ## Initial Device Configurations diff --git a/netsim/ansible/device-ready.ansible b/netsim/ansible/device-ready.ansible index 19a46eaca1..fffc7cbed1 100644 --- a/netsim/ansible/device-ready.ansible +++ b/netsim/ansible/device-ready.ansible @@ -1,9 +1,18 @@ #!/usr/bin/env ansible-playbook --- -- name: Deploy initial device configuration - hosts: all:!unprovisioned +- name: Wait for SSH servers + hosts: netlab_ready_ssh:!unprovisioned strategy: "{{ netlab_strategy|default('linear') }}" gather_facts: false + tags: [ ready_ssh ] + tasks: + - import_tasks: tasks/readiness-check/ssh.yml + +- name: Wait for device-specific conditions + hosts: netlab_ready_ansible:!unprovisioned + strategy: "{{ netlab_strategy|default('linear') }}" + gather_facts: false + tags: [ ready_ansible ] tasks: - name: Set variables that cannot be set with VARS set_fact: diff --git a/netsim/ansible/initial-config.ansible b/netsim/ansible/initial-config.ansible index 95da69e92d..8ef9e52aec 100755 --- a/netsim/ansible/initial-config.ansible +++ b/netsim/ansible/initial-config.ansible @@ -14,7 +14,6 @@ node_files: "{{ lookup('env','PWD') }}/node_files" tags: [ always ] - - import_tasks: tasks/wait-for-ready.yml - import_tasks: tasks/initial-config.yml # diff --git a/netsim/ansible/tasks/wait-for-ready.yml b/netsim/ansible/tasks/wait-for-ready.yml index b9aaac7041..0fe590687e 100644 --- a/netsim/ansible/tasks/wait-for-ready.yml +++ b/netsim/ansible/tasks/wait-for-ready.yml @@ -3,17 +3,6 @@ # --- - block: - - name: "Generic readiness tests" - include_tasks: "{{ ready_script }}" - vars: - netlab_device_type: "{{ item }}" - params: - paths: "{{ paths_ready.dirs }}" - files: "{{ paths_ready.files }}" - ready_script: "{{ lookup('first_found',params,errors='ignore') }}" - when: ready_script is string and ready_script != '' - loop: "{{ netlab_ready|default([])}}" - - name: Find device readiness script set_fact: ready_script: "{{ lookup('first_found',params,errors='ignore') }}" diff --git 
a/netsim/cli/initial/__init__.py b/netsim/cli/initial/__init__.py index 4af3d51849..8a4f0b0b5e 100644 --- a/netsim/cli/initial/__init__.py +++ b/netsim/cli/initial/__init__.py @@ -10,7 +10,7 @@ from ...utils import log from ...utils import status as _status from .. import ansible, error_and_exit, external_commands, get_message, lab_status_change, load_snapshot -from . import configs, deploy, utils +from . import configs, deploy, ready, utils def run_initial(cli_args: typing.List[str]) -> None: @@ -30,13 +30,8 @@ def run_initial(cli_args: typing.List[str]) -> None: configs.run(topology,args,cwd) return elif args.ready: - rest += utils.ansible_args(args) - ansible.check_version() - ansible.playbook('device-ready.ansible',rest) - if topology: - lab_status_change(topology,'devices are ready') + ready.run(topology,args,rest) else: - rest += utils.ansible_args(args) ansible.check_version() deploy.run(topology,args,rest) diff --git a/netsim/cli/initial/deploy.py b/netsim/cli/initial/deploy.py index c5f53f63e4..5d15460955 100644 --- a/netsim/cli/initial/deploy.py +++ b/netsim/cli/initial/deploy.py @@ -14,7 +14,7 @@ from ...data import get_empty_box from ...providers import execute_node from ...utils import log, strings -from .. import _nodeset, ansible, error_and_exit, external_commands, get_message, lab_status_change +from .. import ansible, error_and_exit, external_commands, get_message, lab_status_change from . 
import configs, ready, utils @@ -100,16 +100,14 @@ def run(topology: Box, args: argparse.Namespace, rest: list) -> None: deploy_parts = utils.get_deploy_parts(args) deploy_text = ", ".join(deploy_parts) or "complete configuration" - devices.process_config_sw_check(topology) - lab_status_change(topology, f"deploying configuration: {deploy_text}") - - nodeset = _nodeset.parse_nodeset(args.limit, topology) if args.limit else list(topology.nodes.keys()) - nodeset = utils.filter_unprovisioned(nodeset, topology) + nodeset = utils.get_deploy_nodeset(args,topology) if not nodeset: error_and_exit("The specified nodeset is empty, there are no nodes to configure") + devices.process_config_sw_check(topology) + if not args.deploy: - log.info(text="Creating configuration snippets") + log.section_header('Creating',f'Device configuration snippets') configs.create_node_configs( topology=topology, nodeset=nodeset, @@ -121,18 +119,21 @@ def run(topology: Box, args: argparse.Namespace, rest: list) -> None: ) log.exit_on_error() - ready.device_ready(nodeset,topology) - (used_internal, status_internal) = deploy_provider_config(nodeset, topology, args) - if used_internal: - print() + ready.run(topology,args,rest) + log.exit_on_error() + + log.section_header('Config',f'Deploying device configurations') + lab_status_change(topology, f"deploying configuration: {deploy_text}") + + (used_internal, status_internal) = deploy_provider_config(nodeset, topology, args) ansible_skip_list = utils.nodeset_ansible_skip(nodeset, topology, args) if len(ansible_skip_list) != len(nodeset): utils.ansible_skip_group(ansible_skip_list) if used_internal: log.info("Starting Ansible playbook to deploy the rest of the configurations") - status_ansible = deploy_ansible_playbook(topology, rest) + status_ansible = deploy_ansible_playbook(topology,rest + utils.ansible_args(args)) utils.ansible_skip_group([]) else: status_ansible = True diff --git a/netsim/cli/initial/ready.py b/netsim/cli/initial/ready.py index 
ec719aae04..db30b94f35 100644 --- a/netsim/cli/initial/ready.py +++ b/netsim/cli/initial/ready.py @@ -1,6 +1,7 @@ # # netlab initial -- implement standard device readiness checks # +import argparse import concurrent.futures import subprocess import time @@ -11,7 +12,8 @@ from ...augment import devices as a_devices from ...data import append_to_list, get_empty_box from ...utils import log, strings -from .. import error_and_exit, external_commands +from .. import ansible, error_and_exit, external_commands, lab_status_change +from . import utils """ Prepare for SSH readiness check -- copy timeouts and retry counters, check for "sshpass", set up the SSH command @@ -97,7 +99,7 @@ def wait_for_ssh(n_name: str) -> bool: # Try out SSH server o r_data.ssh_ready = True # No errors, we're ready to roll now = time.time() if now > start_time + 5 or log.VERBOSE: # Report progress only if it's worth reporting - strings.print_colored_text('[SSH] ','green') + strings.print_colored_text('[SSH] ','green') print(f'SSH server on node {n_name} (device {n_data.device}) ' +\ f'is ready after {round(now - start_time,1)} seconds',flush=True) return True @@ -112,7 +114,7 @@ def wait_for_ssh(n_name: str) -> bool: # Try out SSH server o now = time.time() if now > start_time + r_data.wait: # Have we exceeded the wait period? r_data.ssh_failed = True - strings.print_colored_text('[SSH] ','red') + strings.print_colored_text('[SSH] ','red') print(f'SSH server on node {n_name} (device {n_data.device}) ' +\ f'is not ready after {round(now - start_time,1)} seconds',flush=True) if log.debug_active('ssh'): # Do we need to report SSH status periodically? 
@@ -134,7 +136,7 @@ def wait_for_ssh(n_name: str) -> bool: # Try out SSH server o setup_ssh_ready_parameters(waitset,topology) log.exit_on_error() start_time = time.time() - log.info(text=f'Checking SSH servers on {",".join(waitset)}') + log.info(text=f'Checking SSH server(s) on {",".join(waitset)}') with concurrent.futures.ThreadPoolExecutor() as executor: if strings.rich_color: @@ -157,8 +159,23 @@ def wait_for_ssh(n_name: str) -> bool: # Try out SSH server o Execute all "wait for device to be ready" steps recognized by "netlab initial". Further steps might have to be executed by Ansible playbooks """ -def device_ready(nodeset: list, topology: Box) -> None: +def internal_device_ready(waitlists: Box, topology: Box) -> Box: global READY_ACTIONS + + # Iterate over known steps, check whether any device needs that, and execute + # the corresponding ready function + for r_step in READY_ACTIONS.keys(): + if r_step not in waitlists: # Nobody asked for this step, move on + continue + if topology.defaults.netlab.initial.ready[r_step] != 'internal': + continue # We're not using the internal code for this step + + READY_ACTIONS[r_step](waitlists[r_step],topology) + waitlists.pop(r_step) + + return waitlists + +def get_waitlists(nodeset: list, topology: Box) -> Box: waitlists = get_empty_box() defaults = topology.defaults @@ -172,9 +189,21 @@ def device_ready(nodeset: list, topology: Box) -> None: for step in ready_steps: append_to_list(waitlists,step,n_name) - # Iterate over known steps, check whether any device needs that, and execute - # the corresponding ready function - for r_step in READY_ACTIONS.keys(): - if r_step not in waitlists: # Nobody asked for this step, move on - continue - READY_ACTIONS[r_step](waitlists[r_step],topology) + return waitlists + +def run(topology: Box, args: argparse.Namespace, rest: list) -> None: + nodeset = utils.get_deploy_nodeset(args,topology) + node_waits = get_waitlists(nodeset,topology) + if node_waits and not log.QUIET: + 
log.section_header('Checking',f'Are lab devices ready to be configured?') + + ansible_waits = internal_device_ready(node_waits,topology) + if ansible_waits: + log.info(text='Checking lab devices with an Ansible playbook') + ansible.check_version() + ansible_limit = ['--tag',','.join(['ready_'+wl for wl in ansible_waits.keys()])] + ansible.playbook('device-ready.ansible',rest + utils.min_ansible_args(args) + ansible_limit) + + if args.ready: + lab_status_change(topology,'devices are ready') + log.info(text='Lab devices are ready to be configured') diff --git a/netsim/cli/initial/utils.py b/netsim/cli/initial/utils.py index d8f7376c5f..923d200c45 100644 --- a/netsim/cli/initial/utils.py +++ b/netsim/cli/initial/utils.py @@ -13,7 +13,7 @@ from ...data import global_vars from ...utils import files as _files from ...utils import log, strings -from .. import common_parse_args, parser_lab_location +from .. import _nodeset, common_parse_args, parser_lab_location # @@ -91,11 +91,17 @@ def common_ansible_args() -> list: """ Build Ansible arguments based on 'netlab initial' parameters """ -def ansible_args(args: argparse.Namespace) -> list: +def min_ansible_args(args: argparse.Namespace) -> list: rest = common_ansible_args() - if args.limit: rest = ['--limit',args.limit] + rest + if args.fast or os.environ.get('NETLAB_FAST_CONFIG',None): + rest = ['-e','netlab_strategy=free'] + rest + + return rest + +def ansible_args(args: argparse.Namespace) -> list: + rest = min_ansible_args(args) if args.initial: rest = ['-t','initial'] + rest @@ -111,9 +117,6 @@ def ansible_args(args: argparse.Namespace) -> list: if args.custom: rest = ['-t','custom'] + rest - if args.fast or os.environ.get('NETLAB_FAST_CONFIG',None): - rest = ['-e','netlab_strategy=free'] + rest - return rest """ @@ -205,6 +208,13 @@ def filter_unprovisioned(nodeset: typing.List[str], topology: Box) -> typing.Lis unprovisioned_members = groups.group_members(topology, 'unprovisioned') return [node for node in nodeset if 
node not in unprovisioned_members] +""" +Get deployment nodeset from args.limit +""" +def get_deploy_nodeset(args: argparse.Namespace, topology: Box) -> list: + nodeset = _nodeset.parse_nodeset(args.limit, topology) if args.limit else list(topology.nodes.keys()) + return filter_unprovisioned(nodeset, topology) + """ ansible_skip_group: Modify Ansible inventory to include _grp_config_skip listing all the hosts that do not need Ansible deployment diff --git a/netsim/defaults/netlab.yml b/netsim/defaults/netlab.yml index db3a31c436..6f5f1f356c 100644 --- a/netsim/defaults/netlab.yml +++ b/netsim/defaults/netlab.yml @@ -12,3 +12,7 @@ create: pickle: tools: ansible: dirs + +initial: + ready: + ssh: internal diff --git a/netsim/devices/cumulus_nvue.yml b/netsim/devices/cumulus_nvue.yml index 6bf7f1a9f8..1a25624846 100644 --- a/netsim/devices/cumulus_nvue.yml +++ b/netsim/devices/cumulus_nvue.yml @@ -29,6 +29,7 @@ group_vars: ansible_network_os: cumulus_nvue ansible_connection: paramiko ansible_python_interpreter: auto_silent + netlab_ready: [ ansible ] features: initial: system_mtu: True diff --git a/netsim/devices/fortios.yml b/netsim/devices/fortios.yml index 433cf4c386..d7137bf24d 100644 --- a/netsim/devices/fortios.yml +++ b/netsim/devices/fortios.yml @@ -32,6 +32,7 @@ group_vars: ansible_httpapi_validate_certs: no ansible_httpapi_port: 80 netlab_console_connection: ssh + netlab_ready: [ ansible ] external: image: none features: diff --git a/netsim/devices/junos.yml b/netsim/devices/junos.yml index 7b604d5600..5929ff7199 100644 --- a/netsim/devices/junos.yml +++ b/netsim/devices/junos.yml @@ -13,6 +13,7 @@ group_vars: ansible_network_os: junos ansible_connection: netconf netlab_console_connection: ssh + netlab_ready: [ ansible ] features: initial: @@ -73,3 +74,7 @@ features: external: image: none + +clab: + group_vars: + netlab_ready: [ ssh, ansible ] diff --git a/netsim/devices/netscaler.yml b/netsim/devices/netscaler.yml index 18ab4f7aa6..60e8021c17 100644 --- 
a/netsim/devices/netscaler.yml +++ b/netsim/devices/netscaler.yml @@ -24,8 +24,7 @@ clab: ansible_ssh_pass: clab@123 ansible_connection: docker netlab_show_command: [ nscli, -U, '127.0.0.1:clab:clab@123', 'show $@' ] -# netlab_ready: [ ssh ] -# netlab_check_command: who + netlab_ready: [ ansible ] external: image: none diff --git a/netsim/devices/nxos.yml b/netsim/devices/nxos.yml index e9cc951e10..741d279823 100644 --- a/netsim/devices/nxos.yml +++ b/netsim/devices/nxos.yml @@ -11,7 +11,7 @@ clab: group_vars: ansible_ssh_pass: admin ansible_user: admin - netlab_ready: [ ssh ] + netlab_ready: [ ssh, ansible ] image: vrnetlab/vr-n9kv:9.3.8 node: kind: cisco_n9kv @@ -24,6 +24,7 @@ group_vars: ansible_connection: network_cli netlab_check_retries: 50 netlab_check_delay: 10 + netlab_ready: [ ansible ] # yamllint disable-line rule:line-length netlab_ssh_args: "-o KexAlgorithms=+diffie-hellman-group14-sha1 -o PubkeyAcceptedKeyTypes=+ssh-rsa -o HostKeyAlgorithms=+ssh-rsa" bfd: # NXOS requires lower default timer values diff --git a/netsim/devices/sros.yml b/netsim/devices/sros.yml index 5e6c495027..f1a2b5f166 100644 --- a/netsim/devices/sros.yml +++ b/netsim/devices/sros.yml @@ -14,6 +14,7 @@ group_vars: ansible_network_os: sros ansible_connection: paramiko_ssh netlab_console_connection: ssh + netlab_ready: [ ansible ] sros_use_openconfig: False netlab_match_protomap: ospf: [ ospf, ospf3 ] diff --git a/netsim/devices/vjunos-router.yml b/netsim/devices/vjunos-router.yml index 7134ab94a9..000976a7c0 100644 --- a/netsim/devices/vjunos-router.yml +++ b/netsim/devices/vjunos-router.yml @@ -27,6 +27,5 @@ clab: ansible_ssh_pass: admin@123 netlab_check_retries: 60 netlab_check_delay: 10 - netlab_ready: [ ssh ] graphite.icon: router diff --git a/netsim/devices/vjunos-switch.yml b/netsim/devices/vjunos-switch.yml index d7d6acc5bc..3bdc63fa93 100644 --- a/netsim/devices/vjunos-switch.yml +++ b/netsim/devices/vjunos-switch.yml @@ -33,6 +33,5 @@ clab: ansible_ssh_pass: admin@123 
netlab_check_retries: 40 netlab_check_delay: 10 - netlab_ready: [ ssh ] graphite.icon: switch diff --git a/netsim/devices/vmx.yml b/netsim/devices/vmx.yml index 4e5b078957..1ef07b4a15 100644 --- a/netsim/devices/vmx.yml +++ b/netsim/devices/vmx.yml @@ -23,8 +23,6 @@ features: clab: image: vrnetlab/vr-vmx:18.2R1.9 build: https://containerlab.dev/manual/kinds/vr-vmx/ - group_vars: - netlab_ready: [ ssh ] node: kind: vr-vmx interface: diff --git a/netsim/devices/vptx.yml b/netsim/devices/vptx.yml index c97a6b2c17..280cf1dc3f 100644 --- a/netsim/devices/vptx.yml +++ b/netsim/devices/vptx.yml @@ -36,6 +36,5 @@ clab: ansible_ssh_pass: admin@123 netlab_check_retries: 40 netlab_check_delay: 10 - netlab_ready: [ ssh ] graphite.icon: switch diff --git a/netsim/devices/vsrx.yml b/netsim/devices/vsrx.yml index f1ca8c4e61..019e245004 100644 --- a/netsim/devices/vsrx.yml +++ b/netsim/devices/vsrx.yml @@ -44,6 +44,5 @@ clab: ansible_ssh_pass: "admin@123" netlab_check_retries: 60 netlab_check_delay: 10 - netlab_ready: [ ssh ] graphite.icon: firewall diff --git a/netsim/devices/vyos.yml b/netsim/devices/vyos.yml index 8442e0ea38..05f77833fb 100644 --- a/netsim/devices/vyos.yml +++ b/netsim/devices/vyos.yml @@ -94,6 +94,7 @@ clab: group_vars: ansible_connection: docker ansible_user: vyos + netlab_ready: [ ansible ] features: stub_loopback: True diff --git a/netsim/outputs/ansible.py b/netsim/outputs/ansible.py index 338b35af71..cd9a9b547a 100644 --- a/netsim/outputs/ansible.py +++ b/netsim/outputs/ansible.py @@ -6,7 +6,7 @@ from box import Box -from ..augment import nodes, plugin +from ..augment import devices, nodes, plugin from ..utils import files as _files from ..utils import log, strings, templates from . 
import _TopologyOutput, check_writeable @@ -84,6 +84,10 @@ def create(topology: Box) -> Box: if node.get(xg,False): # Add device to the extra group if it has the corresponding attribute set inventory[extra_groups[xg]].hosts[name] = {} + ready = devices.get_node_group_var(node,'netlab_ready',topology.defaults) or [] + for r_item in ready: + inventory[f'netlab_ready_{r_item}'].hosts[name] = {} + if 'devices' in defaults: for group in inventory.keys(): if group in defaults.devices: