[restored issue] - segfault with cuda+mpi
Experiments on sirocco14,15 with
starpu@26f1c06e48e7fa595e71fe3c7139349a04def056
and
chameleon@8145265f24e8013275c701e1a3401e8a87ada9b9
. /home/pruvost/set_env_chameleon_mpi_cuda.sh openmpi cuda
mpiexec -np 2 ./new-testing/snew-testing -o gemm -m 64000 -n 8000 -k 64000 -t 30 -g 2 -b 1600 -P 2
==== backtrace (tid: 113725) ====
0 0x0000000000036280 killpg() ???:0
1 0x000000000045e92f cti_pack_data() /home/pruvost/git/chameleon/runtime/starpu/interface/cham_tile_interface.c:260
2 0x000000000002b0c1 _starpu_mpi_early_data_cb() /home/pruvost/git/starpu/mpi/src/mpi/starpu_mpi_mpi.c:870
3 0x000000000012ad5e _starpu_data_acquire_fetch_data_callback() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:161
4 0x000000000010389d _starpu_create_request_to_fetch_data() /home/pruvost/git/starpu/src/datawizard/coherency.c:532
5 0x00000000001046ad _starpu_fetch_data_on_node() /home/pruvost/git/starpu/src/datawizard/coherency.c:768
6 0x000000000012ac4e _starpu_data_acquire_launch_fetch() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:135
7 0x000000000012adc7 _starpu_data_acquire_continuation_non_blocking() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:170
8 0x000000000012ae35 starpu_data_acquire_cb_pre_sync_callback() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:185
9 0x000000000012b6d2 starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:259
10 0x000000000012b891 starpu_data_acquire_on_node_cb_sequential_consistency_quick() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:281
11 0x000000000012b902 starpu_data_acquire_on_node_cb_sequential_consistency() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:288
12 0x000000000012b968 starpu_data_acquire_on_node_cb() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:295
13 0x000000000012b9db starpu_data_acquire_cb() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:304
14 0x000000000002651a _starpu_mpi_submit_ready_request() /home/pruvost/git/starpu/mpi/src/mpi/starpu_mpi_mpi.c:242
15 0x000000000012ad5e _starpu_data_acquire_fetch_data_callback() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:161
16 0x0000000000103ae0 _starpu_create_request_to_fetch_data() /home/pruvost/git/starpu/src/datawizard/coherency.c:575
17 0x00000000001046ad _starpu_fetch_data_on_node() /home/pruvost/git/starpu/src/datawizard/coherency.c:768
18 0x000000000012ac4e _starpu_data_acquire_launch_fetch() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:135
19 0x000000000012adc7 _starpu_data_acquire_continuation_non_blocking() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:170
20 0x000000000012ae35 starpu_data_acquire_cb_pre_sync_callback() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:185
21 0x000000000012b6d2 starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:259
22 0x00000000000091e3 _starpu_mpi_isend_irecv_common() /home/pruvost/git/starpu/mpi/src/starpu_mpi.c:49
23 0x0000000000009bcb _starpu_mpi_irecv_common() /home/pruvost/git/starpu/mpi/src/starpu_mpi.c:186
24 0x0000000000009ebd starpu_mpi_irecv_detached() /home/pruvost/git/starpu/mpi/src/starpu_mpi.c:211
25 0x00000000000137e1 _starpu_mpi_exchange_data_before_execution() /home/pruvost/git/starpu/mpi/src/starpu_mpi_task_insert.c:123
26 0x00000000000158b1 _starpu_mpi_task_build_v() /home/pruvost/git/starpu/mpi/src/starpu_mpi_task_insert.c:526
27 0x0000000000015bda _starpu_mpi_task_insert_v() /home/pruvost/git/starpu/mpi/src/starpu_mpi_task_insert.c:595
28 0x0000000000015eec starpu_mpi_insert_task() /home/pruvost/git/starpu/mpi/src/starpu_mpi_task_insert.c:642
29 0x0000000000489179 INSERT_TASK_slacpyx() /home/pruvost/git/chameleon/build-cuda/runtime/starpu/codelets/codelet_slacpy.c:76
30 0x00000000004891e4 INSERT_TASK_slacpy() /home/pruvost/git/chameleon/build-cuda/runtime/starpu/codelets/codelet_slacpy.c:98
31 0x0000000000440c34 chameleon_psgemm_summa() /home/pruvost/git/chameleon/build-cuda/compute/psgemm.c:131
32 0x00000000004414e2 chameleon_psgemm() /home/pruvost/git/chameleon/build-cuda/compute/psgemm.c:310
33 0x000000000041e9aa CHAMELEON_sgemm_Tile_Async() /home/pruvost/git/chameleon/build-cuda/compute/sgemm.c:462
34 0x000000000041e4e8 CHAMELEON_sgemm_Tile() /home/pruvost/git/chameleon/build-cuda/compute/sgemm.c:312
35 0x000000000040d46f testing_sgemm() /home/pruvost/git/chameleon/build-cuda/new-testing/testing_sgemm.c:89
36 0x0000000000407a0b main() /home/pruvost/git/chameleon/build-cuda/new-testing/testing_sauxiliary.c:543
37 0x00000000000223d5 __libc_start_main() ???:0
38 0x0000000000406d99 _start() ???:0
=================================
mpiexec -np 2 ./new-testing/snew-testing -o gemm -m 64000 -n 9600 -k 64000 -t 30 -g 2 -b 1600 -P 2
==== backtrace (tid: 60838) ====
0 0x0000000000036280 killpg() ???:0
1 0x000000000045e92f cti_pack_data() /home/pruvost/git/chameleon/runtime/starpu/interface/cham_tile_interface.c:260
2 0x000000000002b0c1 _starpu_mpi_early_data_cb() /home/pruvost/git/starpu/mpi/src/mpi/starpu_mpi_mpi.c:870
3 0x000000000012ad5e _starpu_data_acquire_fetch_data_callback() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:161
4 0x000000000010389d _starpu_create_request_to_fetch_data() /home/pruvost/git/starpu/src/datawizard/coherency.c:532
5 0x00000000001046ad _starpu_fetch_data_on_node() /home/pruvost/git/starpu/src/datawizard/coherency.c:768
6 0x000000000012ac4e _starpu_data_acquire_launch_fetch() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:135
7 0x000000000012adc7 _starpu_data_acquire_continuation_non_blocking() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:170
8 0x0000000000093d0c _starpu_notify_data_dependencies() /home/pruvost/git/starpu/src/core/dependencies/data_concurrency.c:618
9 0x0000000000104d32 _starpu_release_data_on_node() /home/pruvost/git/starpu/src/datawizard/coherency.c:864
10 0x000000000012cc85 starpu_data_release_on_node() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:498
11 0x000000000012cde5 starpu_data_release() /home/pruvost/git/starpu/src/datawizard/user_interactions.c:519
12 0x000000000000c7d0 _starpu_mpi_release_req_data() /home/pruvost/git/starpu/mpi/src/starpu_mpi_coop_sends.c:58
13 0x000000000002abf2 _starpu_mpi_handle_request_termination() /home/pruvost/git/starpu/mpi/src/mpi/starpu_mpi_mpi.c:839
14 0x000000000002c1d3 _starpu_mpi_test_detached_requests() /home/pruvost/git/starpu/mpi/src/mpi/starpu_mpi_mpi.c:989
15 0x000000000002f3cb _starpu_mpi_progress_thread_func() /home/pruvost/git/starpu/mpi/src/mpi/starpu_mpi_mpi.c:1286
16 0x0000000000007dd5 start_thread() pthread_create.c:0
17 0x00000000000fdead __clone() ???:0
=================================
Warning: this issue was restored from a backup and its content may have been altered. In particular, all comments have been lost.