Do not leave processes running after test ends
Currently a test may leave spawned processes running. The code attempts to combat this problem by killing all server and runner processes with `pkill`, cf. `LocalCluster.CleanUp()`.
This does not work for multiple reasons:
- The test may time out. CTest will then send `SIGKILL` to the test; the resource clean-up does not take place.
- Runners do not die; for some reason OpenMPI's `mpiexec` does not seem to propagate `SIGTERM`, and the OpenMPI code executed in the runner is steadfastly calling `poll()` on sockets that won't ever receive a message because the sending process does not exist anymore. Similarly, ZeroMQ keeps processes alive.
Expected behavior:
- Spawned processes are killed, or die on their own because the launcher and server processes ended.
GDB backtrace of a runner refusing to die and eating 100% CPU:
(gdb) bt fu
#0 0x00007f384257c093 in ?? ()
from /usr/lib/x86_64-linux-gnu/libopen-pal.so.20
No symbol table info available.
#1 0x00007f38425209a9 in opal_progress ()
from /usr/lib/x86_64-linux-gnu/libopen-pal.so.20
No symbol table info available.
#2 0x00007f3843ac8c0d in ompi_request_default_wait ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#3 0x00007f3843b15f90 in ompi_coll_base_bcast_intra_generic ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#4 0x00007f3843b16354 in ompi_coll_base_bcast_intra_binomial ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#5 0x00007f382e47816b in ompi_coll_tuned_bcast_intra_dec_fixed ()
from /usr/lib/x86_64-linux-gnu/openmpi/lib/openmpi/mca_coll_tuned.so
No symbol table info available.
#6 0x00007f3843adbde5 in PMPI_Bcast ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#7 0x00007f3843fb53e0 in first_melissa_init (
comm_=0x55bce8c9f020 <ompi_mpi_comm_world>)
at /home/christoph/melissa-da/api/melissa_api.cxx:573
register_field = false
#8 0x00007f3843fb58b9 in melissa_init_with_index_map (
field_name=0x55bce8a99348 "variableX", local_vect_size=20,
local_hidden_vect_size=0, comm_=0x55bce8c9f020 <ompi_mpi_comm_world>,
local_index_map=0x0, local_index_map_hidden=0x0)
at /home/christoph/melissa-da/api/melissa_api.cxx:644
register_field = false
local_vect_sizes = std::vector of length 0, capacity 0
---Type <return> to continue, or q <return> to quit---
local_hidden_vect_sizes = std::vector of length 0, capacity 0
global_index_map = std::vector of length 0, capacity 5
global_index_map_hidden = std::vector of length -11614890390219, capacity -35182351799431 = {40, 0, 2, 0, 508777072, 32766, 1140544900, 32568,
508777072, 32766, -389418976, 21948, 0, 20, -391539896, 21948,
508777904, 32766, -391565727, 21948, 0, 0, 0, 0, 508778136, 32766,
8064, 1, 0, 0, 2, 1, 16, 20, 40, 2, 0, 0, 0, 0, 0, 0, 0, 0, 5656, 0,
5663, 0, 30591, 0, 5663, 0, 939541568, 32568, 939541576, 32568,
939541576, 32568, 0, 0, 939541600, 32568, 939541608, 32568,
939541608, 32568, 1264538707, 1598835785, -363423456, 21948,
-363423296, 21948, -363423296, 21948, 1229734751, 5525061, 0, 0, 0,
0, 0, 0, 0, 0, -391566956, 21948, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
268435456, 0, 0, 0, 0, 0, 791834928, 1818850658, 1701654116,
1936943468, 1633955169, 1936028719, 1769156468, 1634497901,
1852795252, 1769156401, 1634497901, 1852795252, 49, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 0, 1040, 0, 0, 64, 7, 0, 0, 0, -363425264,
21948, 1088, 0, -112, -1, 16, 0, 66, 64, 2, 0, 0, 0, 0, 0, 124, 119,
110, 93, 0, 0, 0, 0, 1129190952, 32568, 1132059200, 32568,
1125336377, 32568, 508777824, 32766, 1, 0, 508777824, 32766,
-389425504, 21948, 1, 0, -391564173, 21948...}
#9 0x00007f3843fb5584 in melissa_init (field_name=0x55bce8a99348 "variableX",
local_vect_size=20, local_hidden_vect_size=0,
comm_=0x55bce8c9f020 <ompi_mpi_comm_world>)
at /home/christoph/melissa-da/api/melissa_api.cxx:602
No locals.
#10 0x000055bce8a92e61 in main (argc=1, args=0x7ffe1e535698)
at /home/christoph/melissa-da/test/simulation1/simulation.cxx:184
ppid = 5656
pid = 5663
sid = 30591
pg = 5663
---Type <return> to continue, or q <return> to quit---
comm_size = 2
comm_rank = 1
local_vect_size = 20
offsets = std::vector of length 2, capacity 2 = {0, 20}
counts = std::vector of length 2, capacity 2 = {20, 20}
next_offset = 40
state1 = std::vector of length 20, capacity 20 = {0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
is_first_timestep = true
nsteps = 0
ret = 0
__PRETTY_FUNCTION__ = "int main(int, char**)"
strace -p 5663
output:
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
^Cstrace: Process 5663 detached
GDB backtrace of a process refusing to die with low CPU consumption:
(gdb) bt fu
#0 0x00007f7c38834cf9 in __GI___poll (fds=0x7ffddc491ea0, nfds=1, timeout=-1)
at ../sysdeps/unix/sysv/linux/poll.c:29
resultvar = 18446744073709551100
sc_cancel_oldtype = 0
sc_ret = <optimized out>
#1 0x00007f7c384d6bf7 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#2 0x00007f7c384b42ec in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#3 0x00007f7c384d8932 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#4 0x00007f7c384d93f6 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#5 0x00007f7c384f9fa9 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#6 0x00007f7c39811dba in zmq_msg_recv (msg=0x7ffddc492110,
socket=0x5604506372a0, flags=0)
at /home/christoph/melissa-da/test/preload.cxx:252
fn = 0x7f7c384fa200 <zmq_msg_recv>
__func__ = "zmq_msg_recv"
#7 0x00007f7c395ecd76 in ConfigurationConnection::register_runner_id (
this=0x5604506338f0, out_server=0x7f7c39803c00 <server>)
at /home/christoph/melissa-da/api/melissa_api.cxx:459
msg_request = {
_ = '\000' <repeats 42 times>, "e", '\000' <repeats 20 times>}
msg_reply = {
_ = '\000' <repeats 42 times>, "e\000\000\000\000\000\036\000\000\000\000\000\000\000(\000\000\000\000\000\000"}
header = 0x7ffddc4920d8
__PRETTY_FUNCTION__ = "bool ConfigurationConnection::register_runner_id(Server*)"
buf = 0x0
---Type <return> to continue, or q <return> to quit---
request_register_field = false
port_names_size = 0
#8 0x00007f7c395e43ad in first_melissa_init (
comm_=0x56044e5aa020 <ompi_mpi_comm_world>)
at /home/christoph/melissa-da/api/melissa_api.cxx:570
register_field = false
#9 0x00007f7c395e48b9 in melissa_init_with_index_map (
field_name=0x56044e3a4348 "variableX", local_vect_size=20,
local_hidden_vect_size=0, comm_=0x56044e5aa020 <ompi_mpi_comm_world>,
local_index_map=0x0, local_index_map_hidden=0x0)
at /home/christoph/melissa-da/api/melissa_api.cxx:644
register_field = false
local_vect_sizes = std::vector of length 0, capacity 0
local_hidden_vect_sizes = std::vector of length 0, capacity 0
global_index_map = std::vector of length 0, capacity 0
global_index_map_hidden = std::vector of length 0, capacity 5
#10 0x00007f7c395e4584 in melissa_init (field_name=0x56044e3a4348 "variableX",
local_vect_size=20, local_hidden_vect_size=0,
comm_=0x56044e5aa020 <ompi_mpi_comm_world>)
at /home/christoph/melissa-da/api/melissa_api.cxx:602
No locals.
#11 0x000056044e39de61 in main (argc=1, args=0x7ffddc4926f8)
at /home/christoph/melissa-da/test/simulation1/simulation.cxx:184
ppid = 5656
pid = 5661
sid = 30591
pg = 5661
comm_size = 2
comm_rank = 0
local_vect_size = 20
offsets = std::vector of length 2, capacity 2 = {0, 20}
counts = std::vector of length 2, capacity 2 = {20, 20}
---Type <return> to continue, or q <return> to quit---
next_offset = 40
state1 = std::vector of length 20, capacity 20 = {0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
is_first_timestep = true
nsteps = 0
ret = 0
__PRETTY_FUNCTION__ = "int main(int, char**)"
strace -p 5661
output:
strace: Process 5661 attached
restart_syscall(<... resuming interrupted restart_syscall ...>^Cstrace: Process 5661 detached
<detached ...>
Edited by Christoph Conrads