@@ -893,6 +893,8 @@ const LPROC = LocalProcess()
893893const LPROCROLE = Ref {Symbol} (:master )
894894const HDR_VERSION_LEN= 16
895895const HDR_COOKIE_LEN= 16
# Most recent status stored for each worker by `setstatus!`, keyed by worker pid.
# The entry for a pid is deleted in `deregister_worker` after the worker-exited
# callbacks have run.
896+ const map_pid_statuses = Dict {Int, Any} ()
# Lock taken around the reads, writes and deletes of `map_pid_statuses`.
897+ const map_pid_statuses_lock = ReentrantLock ()
896898const map_pid_wrkr = Dict {Int, Union{Worker, LocalProcess}} ()
897899const map_sock_wrkr = IdDict ()
898900const map_del_wrkr = Set {Int} ()
@@ -1035,15 +1037,16 @@ for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
10351037segfaulting etc). Chooses and returns a unique key for the callback if `key` is
10361038not specified.
10371039
1038- The callback will be called with the worker ID and the final
1039- `Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an
1040+ The callback will be called with the worker ID, the final
1041+ `Distributed.WorkerState` of the worker, and the last status of the worker as
1042+ set by [`setstatus!`](@ref), e.g. `f(w::Int, state, status)`. `state` is an
10401043enum, a value of `WorkerState_terminated` means a graceful exit and a value of
10411044`WorkerState_exterminated` means the worker died unexpectedly.
10421045
10431046If the callback throws an exception it will be caught and printed.
10441047"""
function add_worker_exited_callback(f::Base.Callable; key=nothing)
    # Worker-exited callbacks are invoked as `f(pid, state, status)`; this is the
    # argument signature handed to `_add_callback` (presumably used there to
    # validate `f` -- confirm against `_add_callback`).
    callback_signature = Tuple{Int, WorkerState, Any}
    return _add_callback(f, key, worker_exited_callbacks; arg_types=callback_signature)
end
10471050
10481051"""
10491052 remove_worker_exited_callback(key)
@@ -1231,6 +1234,59 @@ Identical to [`workers()`](@ref) except that the current worker is filtered out.
12311234"""
12321235other_workers () = filter (!= (myid ()), workers ())
12331236
"""
    setstatus!(x, pid::Int=myid())

Set the status for worker `pid` to `x`. `x` may be any serializable object but
it's recommended to keep it small enough to cheaply send over a network. The
status will be passed to the worker-exited callbacks (see
[`add_worker_exited_callback`](@ref)) when the worker exits.

Statuses are stored on process 1. When called from any other process the
request is forwarded to process 1 and the call blocks until it has been
recorded there. Returns the status that was set.

Throws an `ArgumentError` if `pid` is not in `procs()`.

This can be handy if you want a way to know what a worker is doing at any given
time, or (in combination with a worker-exited callback) for knowing what a
worker was last doing before it died.

# Examples
```julia-repl
julia> DistributedNext.setstatus!("working on dataset 42")
"working on dataset 42"

julia> DistributedNext.getstatus()
"working on dataset 42"
```
"""
function setstatus!(x, pid::Int=myid())
    if pid ∉ procs()
        throw(ArgumentError("Worker $(pid) does not exist, cannot set its status"))
    end

    if myid() == 1
        # Process 1 owns the status table, so write it directly under the lock.
        @lock map_pid_statuses_lock map_pid_statuses[pid] = x
        return x
    else
        # Forward the *requested* pid. Passing `myid()` here would silently
        # overwrite the caller's own status whenever a worker sets the status
        # of a different worker.
        return remotecall_fetch(setstatus!, 1, x, pid)
    end
end
1269+
# Return the status stored for `pid`, or `nothing` if none is stored.
# Uses the non-mutating `get` (not `get!`) so that a lookup never inserts an
# entry: otherwise a read racing with the `delete!` in `deregister_worker` could
# re-create an entry for a dead worker that nothing would ever remove.
_getstatus(pid) = @lock map_pid_statuses_lock get(map_pid_statuses, pid, nothing)
1271+
"""
    getstatus(pid::Int=myid())

Return the status most recently set for worker `pid` with [`setstatus!`](@ref),
or `nothing` if no status was ever explicitly set for it. When called from a
process other than process 1 the lookup is forwarded to process 1.

Throws an `ArgumentError` if `pid` is not in `procs()`.
"""
function getstatus(pid::Int=myid())
    pid in procs() ||
        throw(ArgumentError("Worker $(pid) does not exist, cannot get its status"))

    # Only process 1 holds the status table; every other process asks it.
    return myid() == 1 ? _getstatus(pid) : remotecall_fetch(getstatus, 1, pid)
end
1289+
12341290function cluster_mgmt_from_master_check ()
12351291 if myid () != 1
12361292 throw (ErrorException (" Only process 1 can add and remove workers" ))
@@ -1450,15 +1506,20 @@ function deregister_worker(pg, pid)
14501506 end
14511507 end
14521508
1453- # Call callbacks on the master
14541509 if myid () == 1
1510+ status = _getstatus (pid)
1511+
1512+ # Call callbacks on the master
14551513 for (name, callback) in worker_exited_callbacks
14561514 try
1457- callback (pid, w. state)
1515+ callback (pid, w. state, status )
14581516 catch ex
14591517 @error " Error when running worker-exited callback '$(name) '" exception= (ex, catch_backtrace ())
14601518 end
14611519 end
1520+
1521+ # Delete its status
1522+ @lock map_pid_statuses_lock delete! (map_pid_statuses, pid)
14621523 end
14631524
14641525 return
0 commit comments