From 766f8c30439932a18f59b4cd51421a6379f5488c Mon Sep 17 00:00:00 2001
From: JamesWrigley
Date: Fri, 6 Dec 2024 22:12:27 +0100
Subject: [PATCH] Replace a timeout task with timedwait()

According to a stacktrace from a hung DistributedNext CI job, this task was
causing the process to hang before exiting:

```julia
InterruptException()
_jl_mutex_unlock at C:/workdir/src\threading.c:1012
jl_mutex_unlock at C:/workdir/src\julia_locks.h:80 [inlined]
ijl_task_get_next at C:/workdir/src\scheduler.c:458
poptask at .\task.jl:1163
wait at .\task.jl:1172
task_done_hook at .\task.jl:839
jfptr_task_done_hook_98752.1 at C:\hostedtoolcache\windows\julia\nightly\x64\lib\julia\sys.dll (unknown line)
jl_apply at C:/workdir/src\julia.h:2233 [inlined]
jl_finish_task at C:/workdir/src\task.c:338
start_task at C:/workdir/src\task.c:1274
From worker 82: fatal: error thrown and no exception handler available.Unhandled Task ERROR: InterruptException:
Stacktrace:
 [1] poptask(W::Base.IntrusiveLinkedListSynchronized{Task})
   @ Base .\task.jl:1163
 [2] wait()
   @ Base .\task.jl:1172
 [3] wait(c::Base.GenericCondition{ReentrantLock}; first::Bool)
   @ Base .\condition.jl:141
 [4] wait
   @ .\condition.jl:136 [inlined]
 [5] put_buffered(c::Channel{Any}, v::Int64)
   @ Base .\channels.jl:420
 [6] put!(c::Channel{Any}, v::Int64)
   @ Base .\channels.jl:398
 [7] put!(rv::DistributedNext.RemoteValue, args::Int64)
   @ DistributedNext D:\a\DistributedNext.jl\DistributedNext.jl\src\remotecall.jl:703
 [8] (::DistributedNext.var"#create_worker##11#create_worker##12"{DistributedNext.RemoteValue, Float64})()
   @ DistributedNext D:\a\DistributedNext.jl\DistributedNext.jl\src\cluster.jl:721
```

Replaced it with a call to `timedwait()`, which has the advantage of being a
lot simpler than an extra task.
--- src/cluster.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 2444695..5712451 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -683,11 +683,9 @@ function create_worker(manager, wconfig) send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message) @async manage(w.manager, w.id, w.config, :register) + # wait for rr_ntfy_join with timeout - timedout = false - @async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1)) - wait(rr_ntfy_join) - if timedout + if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out error("worker did not connect within $timeout seconds") end lock(client_refs) do