From 84bdeb49ea3d7ba1679d8b106a395a5fed4a3ccf Mon Sep 17 00:00:00 2001
From: Damian Meden
Date: Thu, 26 Feb 2026 13:29:23 +0000
Subject: [PATCH] test: fix race conditions and improve diagnostics in check_threads.py

The thread_config test intermittently fails on CI with exit code 1
(no matching [TS_MAIN] process found) despite ATS being fully
initialized. The script produced no diagnostic output, making CI
failures impossible to debug.

Root causes:
- p.name() and p.cwd() make fresh system calls even though
  process_iter already pre-fetches attributes into p.info, creating a
  race window where the process can exit between iteration and
  inspection.
- psutil.Process(t.id).name() can throw NoSuchProcess if a thread
  exits between p.threads() and the per-thread lookup.
- On loaded CI machines, transient process visibility issues cause the
  single-attempt scan to miss the ATS process.

Changes:
- Use cached p.info dict instead of p.name()/p.cwd() method calls to
  eliminate the iteration-vs-inspection race.
- Handle psutil.NoSuchProcess and psutil.AccessDenied exceptions in
  process iteration, thread listing, and per-thread name lookup.
- Add retry logic (3 attempts, 2s delay) only for the "process not
  found" case (exit code 1). Real failures (thread count mismatches,
  exit codes 2-11) return immediately and are never masked.
- Print diagnostic output on failure: the searched path and any
  [TS_MAIN] processes found with their cwds.
--- .../gold_tests/thread_config/check_threads.py | 55 +++++++++++++++++-- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/tests/gold_tests/thread_config/check_threads.py b/tests/gold_tests/thread_config/check_threads.py index 61a5414d875..555309c4eae 100755 --- a/tests/gold_tests/thread_config/check_threads.py +++ b/tests/gold_tests/thread_config/check_threads.py @@ -20,26 +20,45 @@ import psutil import argparse import sys +import time def count_threads(ts_path, etnet_threads, accept_threads, task_threads, aio_threads): for p in psutil.process_iter(['name', 'cwd', 'threads']): + # Use cached info from process_iter attrs to avoid race conditions + # where the process exits between iteration and inspection. + try: + proc_name = p.info.get('name', '') + proc_cwd = p.info.get('cwd', '') + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + # Find the pid corresponding to the ats process we started in autest. # It needs to match the process name and the binary path. # If autest can expose the pid of the process this is not needed anymore. - if p.name() == '[TS_MAIN]' and p.cwd() == ts_path: + if proc_name == '[TS_MAIN]' and proc_cwd == ts_path: etnet_check = set() accept_check = set() task_check = set() aio_check = set() - for t in p.threads(): + try: + threads = p.threads() + except (psutil.NoSuchProcess, psutil.AccessDenied): + sys.stderr.write(f'Process {p.pid} disappeared while reading threads.\n') + return 1 + + for t in threads: - # Get the name of the thread. - thread_name = psutil.Process(t.id).name() + # Get the name of the thread. The thread may have exited + # between p.threads() and this call, so handle that. 
+ try: + thread_name = psutil.Process(t.id).name() + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue if thread_name.startswith('[ET_NET'): @@ -103,7 +122,20 @@ def count_threads(ts_path, etnet_threads, accept_threads, task_threads, aio_thre else: return 0 - # Return 1 if no pid is found to match the ats process. + # No matching process found. Print diagnostic info to help debug CI failures. + ts_main_procs = [] + for p in psutil.process_iter(['name', 'cwd']): + try: + if p.info.get('name') == '[TS_MAIN]': + ts_main_procs.append(f' pid={p.pid} cwd={p.info.get("cwd")}') + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + sys.stderr.write(f'No [TS_MAIN] process found with cwd={ts_path}.\n') + if ts_main_procs: + sys.stderr.write('Found [TS_MAIN] processes:\n' + '\n'.join(ts_main_procs) + '\n') + else: + sys.stderr.write('No [TS_MAIN] processes found at all.\n') return 1 @@ -118,7 +150,18 @@ def main(): '-t', '--task-threads', type=int, dest='task_threads', help='expected number of TASK threads', required=True) parser.add_argument('-c', '--aio-threads', type=int, dest='aio_threads', help='expected number of AIO threads', required=True) args = parser.parse_args() - exit(count_threads(args.ts_path, args.etnet_threads, args.accept_threads, args.task_threads, args.aio_threads)) + + max_attempts = 3 + result = 1 + for attempt in range(max_attempts): + result = count_threads(args.ts_path, args.etnet_threads, args.accept_threads, args.task_threads, args.aio_threads) + if result != 1: # Only retry when process not found (exit code 1). + break + if attempt < max_attempts - 1: + sys.stderr.write(f'Attempt {attempt + 1}/{max_attempts}: process not found, retrying in 2s...\n') + time.sleep(2) + + exit(result) if __name__ == '__main__':