Bug#888879: rheolef FTBFS on several architectures: test runs forever

Adrian Bunk bunk at debian.org
Tue Jan 30 20:24:18 UTC 2018


Source: rheolef
Version: 6.7-5
Severity: serious

https://buildd.debian.org/status/package.php?p=rheolef&suite=sid

...
      mpirun -np 1 ./form_mass_bdr_tst -app P2 -weight yz -I my_cube_TP-5-v2 left >/dev/null 2>/dev/null
      mpirun -np 2 ./form_mass_bdr_tst -app P2 -weight yz -I my_cube_TP-5-v2 left >/dev/null 2>/dev/null
      mpirun -np 3 ./form_mass_bdr_tst -app P2 -weight yz -I my_cube_TP-5-v2 left >/dev/null 2>/dev/null
      mpirun -np 1 ./form_mass_bdr_tst -app P2 -weight yz -I my_cube_TP-5-v2 right >/dev/null 2>/dev/null
      mpirun -np 2 ./form_mass_bdr_tst -app P2 -weight yz -I my_cube_TP-5-v2 right >/dev/null 2>/dev/null
E: Build killed with signal TERM after 150 minutes of inactivity


I've reproduced this on i386: two processes run forever at 100% CPU
(I aborted them after 6 hours on a fast CPU).
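
For reference, a minimal way to reproduce and inspect the hang (a sketch,
assuming the failing test from the build log above is run from its test
directory; <pid> stands for each of the two spinning form_mass_bdr_tst
processes reported by top/ps):

      mpirun -np 2 ./form_mass_bdr_tst -app P2 -weight yz -I my_cube_TP-5-v2 right

      # in a second terminal, once per spinning process:
      gdb -batch -p <pid> -ex 'thread apply all bt'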

Backtraces of the two spinning processes. Note that they are blocked in
different MPI_Allreduce calls: one in an all_reduce issued from
rheolef::mpi_assembly_begin (reached via geo_rep::get at geo_mpi_get.cc:965),
the other in an all_reduce issued directly from geo_rep::get at
geo_mpi_get.cc:942, so the collective calls of the two ranks appear to
have diverged.

First process (main thread LWP 29002):

Thread 3 (Thread 0xf50ffb40 (LWP 29032)):
#0  0xf7ed6db9 in __kernel_vsyscall ()
#1  0xf70fabd3 in __GI___poll (fds=0xf47005d0, nfds=2, timeout=3600000) at ../sysdeps/unix/sysv/linux/poll.c:29
#2  0xf5caed4a in poll (__timeout=3600000, __nfds=2, __fds=0xf47005d0) at /usr/include/i386-linux-gnu/bits/poll2.h:46
#3  poll_dispatch (base=0x578eb9c0, tv=0xf50f9bfc) at poll.c:165
#4  0xf5ca59e9 in opal_libevent2022_event_base_loop (base=<optimized out>, flags=<optimized out>) at event.c:1630
#5  0xf5c6b3bd in progress_engine (obj=0x578eb950) at runtime/opal_progress_threads.c:105
#6  0xf5df6316 in start_thread (arg=0xf50ffb40) at pthread_create.c:465
#7  0xf7105296 in clone () at ../sysdeps/unix/sysv/linux/i386/clone.S:108

Thread 2 (Thread 0xf5ac5b40 (LWP 29031)):
#0  0xf7ed6db9 in __kernel_vsyscall ()
#1  0xf71053fa in __GI_epoll_pwait (epfd=7, events=0x578ea930, maxevents=32, timeout=-1, set=0x0)
    at ../sysdeps/unix/sysv/linux/epoll_pwait.c:42
#2  0xf710569a in epoll_wait (epfd=7, events=0x578ea930, maxevents=32, timeout=-1)
    at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
#3  0xf5ca199a in epoll_dispatch (base=0x578ea7a0, tv=0x0) at epoll.c:407
#4  0xf5ca59e9 in opal_libevent2022_event_base_loop (base=<optimized out>, flags=<optimized out>) at event.c:1630
#5  0xf5af23eb in progress_engine (obj=0x578ea7a0) at src/util/progress_threads.c:52
#6  0xf5df6316 in start_thread (arg=0xf5ac5b40) at pthread_create.c:465
#7  0xf7105296 in clone () at ../sysdeps/unix/sysv/linux/i386/clone.S:108

Thread 1 (Thread 0xf5b4fe00 (LWP 29002)):
#0  0xf7ed67f5 in ?? ()
#1  0xf7ed6b43 in __vdso_clock_gettime ()
#2  0xf7112961 in __GI___clock_gettime (clock_id=1, tp=0xffb74194) at ../sysdeps/unix/clock_gettime.c:115
#3  0xf5cc3297 in opal_timer_linux_get_usec_clock_gettime () at timer_linux_component.c:197
#4  0xf5c669c3 in opal_progress () at runtime/opal_progress.c:197
#5  0xf74b5e05 in sync_wait_st (sync=<optimized out>) at ../opal/threads/wait_sync.h:80
#6  ompi_request_default_wait_all (count=2, requests=0xffb742e4, statuses=0x0) at request/req_wait.c:221
#7  0xf750640d in ompi_coll_base_allreduce_intra_recursivedoubling (sbuf=0x57951030, rbuf=0x57a9b400, count=2, 
    dtype=0xf7565140 <ompi_mpi_unsigned>, op=0xf7573e60 <ompi_mpi_op_sum>, comm=0xf7569520 <ompi_mpi_comm_world>, 
    module=0x57976fa0) at base/coll_base_allreduce.c:225
#8  0xe991f640 in ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf=0x57951030, rbuf=0x57a9b400, count=2, 
    dtype=0xf7565140 <ompi_mpi_unsigned>, op=0xf7573e60 <ompi_mpi_op_sum>, comm=0xf7569520 <ompi_mpi_comm_world>, 
    module=0x57976fa0) at coll_tuned_decision_fixed.c:66
#9  0xf74c5b77 in PMPI_Allreduce (sendbuf=0x57951030, recvbuf=0x57a9b400, count=2, 
    datatype=0xf7565140 <ompi_mpi_unsigned>, op=0xf7573e60 <ompi_mpi_op_sum>, comm=0xf7569520 <ompi_mpi_comm_world>)
    at pallreduce.c:107
#10 0xf7b476cf in boost::mpi::detail::all_reduce_impl<unsigned int, std::plus<unsigned int> > (comm=..., 
    in_values=0x57951030, n=n at entry=2, out_values=0x57a9b400) at /usr/include/boost/mpi/collectives/all_reduce.hpp:36
#11 0xf7b58fc0 in boost::mpi::all_reduce<unsigned int, std::plus<unsigned int> > (out_values=<optimized out>, n=2, 
    in_values=<optimized out>, comm=..., op=...) at /usr/include/boost/mpi/collectives/all_reduce.hpp:93
#12 rheolef::mpi_assembly_begin<std::multimap<unsigned int, unsigned int, std::less<unsigned int>, rheolef::heap_allocator<std::pair<unsigned int, unsigned int> > >, rheolef::disarray_rep<rheolef::index_set, rheolef::distributed, std::allocator<rheolef::index_set> >::message_type, rheolef::apply_iterator<std::_Rb_tree_iterator<std::pair<unsigned int const, unsigned int> >, rheolef::first_op<std::pair<unsigned int const, unsigned int> > > > (stash=..., first_stash_idx=..., 
    last_stash_idx=..., ownership=..., receive=..., send=...) at ../../include/rheolef/mpi_assembly_begin.h:113
#13 0xf7b5a346 in rheolef::disarray_rep<rheolef::index_set, rheolef::distributed, std::allocator<rheolef::index_set> >::dis_entry_assembly_begin<rheolef::index_set_add_op<rheolef::index_set> > (this=0x57acab70, my_set_op=...)
    at ../../include/rheolef/disarray_mpi.icc:223
#14 rheolef::disarray<rheolef::index_set, rheolef::distributed, std::allocator<rheolef::index_set> >::dis_entry_assembly_begin (this=<optimized out>) at ../../include/rheolef/disarray.h:592
#15 rheolef::disarray<rheolef::index_set, rheolef::distributed, std::allocator<rheolef::index_set> >::dis_entry_assembly (this=<optimized out>) at ../../include/rheolef/disarray.h:594
#16 rheolef::geo_rep<double, rheolef::distributed>::set_element_side_index (this=<optimized out>, 
    side_dim=<optimized out>) at geo_mpi_get.cc:461
#17 0xf7b5f25a in rheolef::geo_rep<double, rheolef::distributed>::get (this=<optimized out>, ips=...)
    at geo_mpi_get.cc:965
#18 0xf7b60a48 in rheolef::geo_rep<double, rheolef::distributed>::load (this=<optimized out>, filename=..., comm=...)
    at geo_mpi_get.cc:989
#19 0xf7b3030a in rheolef::geo_load<double, rheolef::distributed> (name=...) at geo.cc:172
#20 0x56592bf8 in rheolef::geo_basic<double, rheolef::distributed>::geo_basic (comm=..., 
    name="\360\265\225W\030\000\000\000\030\000\000\000\227 at YV\320<ZV\002\000\000\000\000\060\bn\003\000\000\000y+YV at K\267\377\000\000\000\000\000\360\035\367\000\000\000\000\000\000\000\000\203w\002\367\000\360\035\367\000\360\035\367\000\000\000\000\203w\002\367\003\000\000\000\324K\267\377\344K\267\377dK\267\377\003\000\000\000\324K\267\377\000\360\035\367\352w\356\367\001\000\000\000\000\000\000\000\000\360\035\367\000\000\000\000\000\000\000\000Tl\324\tEf\254c", '\000' <repeats 12 times>, "\320K\267\377\000\200\355\367\354\202\355\367\350\210\355\367\003\000\000\000\320<ZV\003\000\000\000\310 at YV\000\000\000\000\371 at YV`+YV\003\000\000\000\324K\267\377"..., this=0xffb74ac0)
    at ../../include/rheolef/geo.h:1460
#21 main (argc=<optimized out>, argv=<optimized out>) at space_tst.cc:26


Second process (main thread LWP 29003):

Thread 3 (Thread 0xf51ffb40 (LWP 29033)):
#0  0xf7fb7db9 in __kernel_vsyscall ()
#1  0xf71dbbd3 in __GI___poll (fds=0xf48005d0, nfds=2, timeout=3600000) at ../sysdeps/unix/sysv/linux/poll.c:29
#2  0xf5d8fd4a in poll (__timeout=3600000, __nfds=2, __fds=0xf48005d0) at /usr/include/i386-linux-gnu/bits/poll2.h:46
#3  poll_dispatch (base=0x57eef9c0, tv=0xf51f9bfc) at poll.c:165
#4  0xf5d869e9 in opal_libevent2022_event_base_loop (base=<optimized out>, flags=<optimized out>) at event.c:1630
#5  0xf5d4c3bd in progress_engine (obj=0x57eef950) at runtime/opal_progress_threads.c:105
#6  0xf5ed7316 in start_thread (arg=0xf51ffb40) at pthread_create.c:465
#7  0xf71e6296 in clone () at ../sysdeps/unix/sysv/linux/i386/clone.S:108

Thread 2 (Thread 0xf5ba6b40 (LWP 29030)):
#0  0xf7fb7db9 in __kernel_vsyscall ()
#1  0xf71e63fa in __GI_epoll_pwait (epfd=7, events=0x57eee930, maxevents=32, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:42
#2  0xf71e669a in epoll_wait (epfd=7, events=0x57eee930, maxevents=32, timeout=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
#3  0xf5d8299a in epoll_dispatch (base=0x57eee7a0, tv=0x0) at epoll.c:407
#4  0xf5d869e9 in opal_libevent2022_event_base_loop (base=<optimized out>, flags=<optimized out>) at event.c:1630
#5  0xf5bd33eb in progress_engine (obj=0x57eee7a0) at src/util/progress_threads.c:52
#6  0xf5ed7316 in start_thread (arg=0xf5ba6b40) at pthread_create.c:465
#7  0xf71e6296 in clone () at ../sysdeps/unix/sysv/linux/i386/clone.S:108

Thread 1 (Thread 0xf5c30e00 (LWP 29003)):
#0  0xf7fb77f5 in ?? ()
#1  0xf7fb7b43 in __vdso_clock_gettime ()
#2  0xf71f3961 in __GI___clock_gettime (clock_id=1, tp=0xffa8c4b4) at ../sysdeps/unix/clock_gettime.c:115
#3  0xf5da4297 in opal_timer_linux_get_usec_clock_gettime () at timer_linux_component.c:197
#4  0xf5d479c3 in opal_progress () at runtime/opal_progress.c:197
#5  0xf7596e05 in sync_wait_st (sync=<optimized out>) at ../opal/threads/wait_sync.h:80
#6  ompi_request_default_wait_all (count=2, requests=0xffa8c604, statuses=0x0) at request/req_wait.c:221
#7  0xf75e740d in ompi_coll_base_allreduce_intra_recursivedoubling (sbuf=0x5809b980, rbuf=0x580805b0, count=139, 
    dtype=0xf7646140 <ompi_mpi_unsigned>, op=0xf7655660 <ompi_mpi_op_max>, comm=0xf764a520 <ompi_mpi_comm_world>, module=0x57f74810)
    at base/coll_base_allreduce.c:225
#8  0xf1a05640 in ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf=0x5809b980, rbuf=0x580805b0, count=139, dtype=0xf7646140 <ompi_mpi_unsigned>, 
    op=0xf7655660 <ompi_mpi_op_max>, comm=0xf764a520 <ompi_mpi_comm_world>, module=0x57f74810) at coll_tuned_decision_fixed.c:66
#9  0xf75a6b77 in PMPI_Allreduce (sendbuf=0x5809b980, recvbuf=0x580805b0, count=139, datatype=0xf7646140 <ompi_mpi_unsigned>, 
    op=0xf7655660 <ompi_mpi_op_max>, comm=0xf764a520 <ompi_mpi_comm_world>) at pallreduce.c:107
#10 0xf7c2862f in boost::mpi::detail::all_reduce_impl<unsigned int, boost::mpi::maximum<unsigned int> > (comm=..., in_values=0x5809b980, n=139, 
    out_values=0x580805b0) at /usr/include/boost/mpi/collectives/all_reduce.hpp:36
#11 0xf7c4019f in boost::mpi::all_reduce<unsigned int, boost::mpi::maximum<unsigned int> > (out_values=<optimized out>, n=<optimized out>, 
    in_values=<optimized out>, comm=..., op=...) at /usr/include/boost/mpi/collectives/all_reduce.hpp:93
#12 rheolef::geo_rep<double, rheolef::distributed>::get (this=<optimized out>, ips=...) at geo_mpi_get.cc:942
#13 0xf7c41a48 in rheolef::geo_rep<double, rheolef::distributed>::load (this=<optimized out>, filename=..., comm=...) at geo_mpi_get.cc:989
#14 0xf7c1130a in rheolef::geo_load<double, rheolef::distributed> (name=...) at geo.cc:172
#15 0x5658abf8 in rheolef::geo_basic<double, rheolef::distributed>::geo_basic (comm=..., 
    name="\000\327\365W\030\000\000\000\030\000\000\000\227\300XV\320\274YV\002\000\000\000\000\240x1\003\000\000\000y\253XV\360\313\250\377\000\000\000\000\000\000,\367\000\000\000\000\000\000\000\000\203\207\020\367\000\000,\367\000\000,\367\000\000\000\000\203\207\020\367\003\000\000\000\204\314\250\377\224\314\250\377\024\314\250\377\003\000\000\000\204\314\250\377\000\000,\367\352\207\374\367\001\000\000\000\000\000\000\000\000\000,\367\000\000\000\000\000\000\000\000\276\"\333\071\257HBI", '\000' <repeats 12 times>, "\200\314\250\377\000\220\373\367\354\222\373\367\350\230\373\367\003\000\000\000\320\274YV\003\000\000\000\310\300XV\000\000\000\000\371\300XV`\253XV\003\000\000\000\204\314\250\377"..., 
    this=0xffa8cb70) at ../../include/rheolef/geo.h:1460
#16 main (argc=<optimized out>, argv=<optimized out>) at space_tst.cc:26


