Mentions légales du service

Skip to content

Do not leave processes running after test ends

Currently a test may leave spawned processes running. Attempts are made in the code to combat this problem by killing all server and runner processes with pkill, cf. LocalCluster.CleanUp().

This does not work for multiple reasons:

  • The test may time out. CTest will then send SIGKILL to the test; the resource clean-up does not take place.
  • Runners do not die; for some reason OpenMPI's mpiexec does not seem to propagate SIGTERM and the OpenMPI code executed in the runner is steadfastly calling poll() on sockets that won't ever receive a message because the sending process does not exist anymore. Similarly, ZeroMQ keeps processes alive.

Expected behavior:

  • Spawned processes are killed, or die on their own because the launcher and server processes have ended.

GDB backtrace of a runner refusing to die and eating 100% CPU:

(gdb) bt fu
#0  0x00007f384257c093 in ?? ()
   from /usr/lib/x86_64-linux-gnu/libopen-pal.so.20
No symbol table info available.
#1  0x00007f38425209a9 in opal_progress ()
   from /usr/lib/x86_64-linux-gnu/libopen-pal.so.20
No symbol table info available.
#2  0x00007f3843ac8c0d in ompi_request_default_wait ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#3  0x00007f3843b15f90 in ompi_coll_base_bcast_intra_generic ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#4  0x00007f3843b16354 in ompi_coll_base_bcast_intra_binomial ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#5  0x00007f382e47816b in ompi_coll_tuned_bcast_intra_dec_fixed ()
   from /usr/lib/x86_64-linux-gnu/openmpi/lib/openmpi/mca_coll_tuned.so
No symbol table info available.
#6  0x00007f3843adbde5 in PMPI_Bcast ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
No symbol table info available.
#7  0x00007f3843fb53e0 in first_melissa_init (
    comm_=0x55bce8c9f020 <ompi_mpi_comm_world>)
    at /home/christoph/melissa-da/api/melissa_api.cxx:573
        register_field = false
#8  0x00007f3843fb58b9 in melissa_init_with_index_map (
    field_name=0x55bce8a99348 "variableX", local_vect_size=20, 
    local_hidden_vect_size=0, comm_=0x55bce8c9f020 <ompi_mpi_comm_world>, 
    local_index_map=0x0, local_index_map_hidden=0x0)
    at /home/christoph/melissa-da/api/melissa_api.cxx:644
        register_field = false
        local_vect_sizes = std::vector of length 0, capacity 0
---Type <return> to continue, or q <return> to quit---
        local_hidden_vect_sizes = std::vector of length 0, capacity 0
        global_index_map = std::vector of length 0, capacity 5
        global_index_map_hidden = std::vector of length -11614890390219, capacity -35182351799431 = {40, 0, 2, 0, 508777072, 32766, 1140544900, 32568, 
          508777072, 32766, -389418976, 21948, 0, 20, -391539896, 21948, 
          508777904, 32766, -391565727, 21948, 0, 0, 0, 0, 508778136, 32766, 
          8064, 1, 0, 0, 2, 1, 16, 20, 40, 2, 0, 0, 0, 0, 0, 0, 0, 0, 5656, 0, 
          5663, 0, 30591, 0, 5663, 0, 939541568, 32568, 939541576, 32568, 
          939541576, 32568, 0, 0, 939541600, 32568, 939541608, 32568, 
          939541608, 32568, 1264538707, 1598835785, -363423456, 21948, 
          -363423296, 21948, -363423296, 21948, 1229734751, 5525061, 0, 0, 0, 
          0, 0, 0, 0, 0, -391566956, 21948, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
          268435456, 0, 0, 0, 0, 0, 791834928, 1818850658, 1701654116, 
          1936943468, 1633955169, 1936028719, 1769156468, 1634497901, 
          1852795252, 1769156401, 1634497901, 1852795252, 49, 0, 0, 0, 0, 0, 
          0, 0, 0, 0, 0, 0, 2, 0, 1040, 0, 0, 64, 7, 0, 0, 0, -363425264, 
          21948, 1088, 0, -112, -1, 16, 0, 66, 64, 2, 0, 0, 0, 0, 0, 124, 119, 
          110, 93, 0, 0, 0, 0, 1129190952, 32568, 1132059200, 32568, 
          1125336377, 32568, 508777824, 32766, 1, 0, 508777824, 32766, 
          -389425504, 21948, 1, 0, -391564173, 21948...}
#9  0x00007f3843fb5584 in melissa_init (field_name=0x55bce8a99348 "variableX", 
    local_vect_size=20, local_hidden_vect_size=0, 
    comm_=0x55bce8c9f020 <ompi_mpi_comm_world>)
    at /home/christoph/melissa-da/api/melissa_api.cxx:602
No locals.
#10 0x000055bce8a92e61 in main (argc=1, args=0x7ffe1e535698)
    at /home/christoph/melissa-da/test/simulation1/simulation.cxx:184
        ppid = 5656
        pid = 5663
        sid = 30591
        pg = 5663
---Type <return> to continue, or q <return> to quit---
        comm_size = 2
        comm_rank = 1
        local_vect_size = 20
        offsets = std::vector of length 2, capacity 2 = {0, 20}
        counts = std::vector of length 2, capacity 2 = {20, 20}
        next_offset = 40
        state1 = std::vector of length 20, capacity 20 = {0, 0, 0, 0, 0, 0, 0, 
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
        is_first_timestep = true
        nsteps = 0
        ret = 0
        __PRETTY_FUNCTION__ = "int main(int, char**)"

strace -p 5663 output:

poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
poll([{fd=5, events=POLLIN}, {fd=18, events=POLLIN}], 2, 0) = 0 (Timeout)
^Cstrace: Process 5663 detached

GDB backtrace of a process refusing to die while consuming little CPU:

(gdb) bt fu
#0  0x00007f7c38834cf9 in __GI___poll (fds=0x7ffddc491ea0, nfds=1, timeout=-1)
    at ../sysdeps/unix/sysv/linux/poll.c:29
        resultvar = 18446744073709551100
        sc_cancel_oldtype = 0
        sc_ret = <optimized out>
#1  0x00007f7c384d6bf7 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#2  0x00007f7c384b42ec in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#3  0x00007f7c384d8932 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#4  0x00007f7c384d93f6 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#5  0x00007f7c384f9fa9 in ?? () from /usr/lib/x86_64-linux-gnu/libzmq.so.5
No symbol table info available.
#6  0x00007f7c39811dba in zmq_msg_recv (msg=0x7ffddc492110, 
    socket=0x5604506372a0, flags=0)
    at /home/christoph/melissa-da/test/preload.cxx:252
        fn = 0x7f7c384fa200 <zmq_msg_recv>
        __func__ = "zmq_msg_recv"
#7  0x00007f7c395ecd76 in ConfigurationConnection::register_runner_id (
    this=0x5604506338f0, out_server=0x7f7c39803c00 <server>)
    at /home/christoph/melissa-da/api/melissa_api.cxx:459
        msg_request = {
          _ = '\000' <repeats 42 times>, "e", '\000' <repeats 20 times>}
        msg_reply = {
          _ = '\000' <repeats 42 times>, "e\000\000\000\000\000\036\000\000\000\000\000\000\000(\000\000\000\000\000\000"}
        header = 0x7ffddc4920d8
        __PRETTY_FUNCTION__ = "bool ConfigurationConnection::register_runner_id(Server*)"
        buf = 0x0
---Type <return> to continue, or q <return> to quit---
        request_register_field = false
        port_names_size = 0
#8  0x00007f7c395e43ad in first_melissa_init (
    comm_=0x56044e5aa020 <ompi_mpi_comm_world>)
    at /home/christoph/melissa-da/api/melissa_api.cxx:570
        register_field = false
#9  0x00007f7c395e48b9 in melissa_init_with_index_map (
    field_name=0x56044e3a4348 "variableX", local_vect_size=20, 
    local_hidden_vect_size=0, comm_=0x56044e5aa020 <ompi_mpi_comm_world>, 
    local_index_map=0x0, local_index_map_hidden=0x0)
    at /home/christoph/melissa-da/api/melissa_api.cxx:644
        register_field = false
        local_vect_sizes = std::vector of length 0, capacity 0
        local_hidden_vect_sizes = std::vector of length 0, capacity 0
        global_index_map = std::vector of length 0, capacity 0
        global_index_map_hidden = std::vector of length 0, capacity 5
#10 0x00007f7c395e4584 in melissa_init (field_name=0x56044e3a4348 "variableX", 
    local_vect_size=20, local_hidden_vect_size=0, 
    comm_=0x56044e5aa020 <ompi_mpi_comm_world>)
    at /home/christoph/melissa-da/api/melissa_api.cxx:602
No locals.
#11 0x000056044e39de61 in main (argc=1, args=0x7ffddc4926f8)
    at /home/christoph/melissa-da/test/simulation1/simulation.cxx:184
        ppid = 5656
        pid = 5661
        sid = 30591
        pg = 5661
        comm_size = 2
        comm_rank = 0
        local_vect_size = 20
        offsets = std::vector of length 2, capacity 2 = {0, 20}
        counts = std::vector of length 2, capacity 2 = {20, 20}
---Type <return> to continue, or q <return> to quit---
        next_offset = 40
        state1 = std::vector of length 20, capacity 20 = {0, 0, 0, 0, 0, 0, 0, 
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
        is_first_timestep = true
        nsteps = 0
        ret = 0
        __PRETTY_FUNCTION__ = "int main(int, char**)"

strace -p 5661 output:

strace: Process 5661 attached
restart_syscall(<... resuming interrupted restart_syscall ...>^Cstrace: Process 5661 detached
 <detached ...>
Edited by Christoph Conrads