[Top][All Lists]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Libunwind-devel] A patch to libunwind that fixes some aborts we are seeing

From: Jim Galarowicz
Subject: [Libunwind-devel] A patch to libunwind that fixes some aborts we are seeing
Date: Tue, 23 Feb 2010 09:29:52 -0600
User-agent: Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv: Gecko/20100111 Thunderbird/3.0.1


I'm sending this information to the devel list to see whether our fix would be a candidate for inclusion in the mainline source tree. The fix was developed by Don Maghrak, a member of the OpenSpeedShop team, to solve aborts we were seeing when trying to unwind call stacks from Intel-compiled MVAPICH MPI applications.

Here is the patch that is applied to the source as of 01/23/10:

*** libunwind-20100123/src/x86_64/Gis_signal_frame.c 2010-02-08 11:34:10.000000000 -0500
--- libunwind-0.99-X/src/x86_64/Gis_signal_frame.c 2009-05-12 15:27:21.000000000 -0500
*** 38,43 ****
--- 38,44 ----
    void *arg;
    int ret;

+   c->validate = 1;
    as = c->dwarf.as;
    a = unw_get_accessors (as);
    arg = c->dwarf.as_arg;
*** libunwind-20100123/src/x86_64/Gstep.c 2010-02-08 11:34:10.000000000 -0500
--- libunwind-0.99-X/src/x86_64/Gstep.c 2009-05-12 15:28:27.000000000 -0500
*** 39,44 ****
--- 39,47 ----
       c, (unsigned long long) c->dwarf.ip);

    /* Try DWARF-based unwinding... */
+   /* need to validate here too.  Intel compiler generated code
+    * crashes with segv and sigbus on large mvapich jobs. */
+   c->validate = 1;
    ret = dwarf_step (&c->dwarf);

    if (ret < 0 && ret != -UNW_ENOINFO)

Two slightly different aborts occurred before the patch was applied and no longer occur with it. Their backtraces are shown below:

Loaded symbols for /usr/lib64/
Core was generated by `./smg2000 -n 40 40 40'.
Program terminated with signal 11, Segmentation fault.
[New process 5386]
[New process 5447]
#0 0x00002aaaad094729 in access_mem () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
(gdb) where
#0 0x00002aaaad094729 in access_mem () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#1 0x00002aaaad0925fd in dwarf_get () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#2 0x00002aaaad092458 in apply_reg_state () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#3 0x00002aaaad0929a3 in _ULx86_64_dwarf_find_save_locs () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#4 0x00002aaaad0935d5 in _ULx86_64_dwarf_step () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#5 0x00002aaaad0954bd in _ULx86_64_step () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#6 0x00002aaaaacce31a in OpenSS_GetStackTraceFromContext (signal_context=0x7fffffffcbf0, skip_signal_frames=0, skip_frames=0, max_frames=100, stacktrace_size=0x7fffffffca7c, stacktrace=0x7fffffffc750) at OpenSS_GetStackTraceFromContext.c:180
#7 0x00002aaaaaccb5b5 in usertimeTimerHandler (context=0x7fffffffcbf0) at runtime.c:195
#8 0x00002aaaaaccdbca in signalHandler (signal=27, info=0x7fffffffcd20, ptr=0x7fffffffcbf0) at OpenSS_Timer.c:117
#9 0x00002aaaaaedb66c in monitor_signal_handler (sig=27, info=0x7fffffffcd20, context=0x7fffffffcbf0) at signal.c:193
#10 <signal handler called>
#11 0x00002aaaabfbb882 in _int_free (av=0x2aaaac29b9c0, mem=0x11f3780) at malloc.c:4770
#12 0x000000000041fac6 in hypre_StructGridDestroy (grid=0x2aaaac29b9c0) at struct_grid.c:77
#13 0x000000000041f55e in hypre_ComputePkgDestroy (compute_pkg=0x2aaaac29b9c0) at computation.c:338
#14 0x000000000040f6d7 in hypre_CyclicReductionDestroy (cyc_red_vdata=0x2aaaac29b9c0) at cyclic_reduction.c:1198
#15 0x00000000004048c4 in hypre_SMGRelaxDestroy (relax_vdata=0x2aaaac29b9c0) at smg_relax.c:210
#16 0x0000000000403d5a in hypre_SMGDestroy (smg_vdata=0x2aaaac29b9c0) at smg.c:75
#17 0x00000000004048b2 in hypre_SMGRelaxDestroy (relax_vdata=0x2aaaac29b9c0) at smg_relax.c:210
#18 0x0000000000403d5a in hypre_SMGDestroy (smg_vdata=0x2aaaac29b9c0) at smg.c:75
#19 0x0000000000402f76 in HYPRE_StructSMGDestroy (solver=0x2aaaac29b9c0) at HYPRE_struct_smg.c:36
#20 0x0000000000402e09 in main (argc=5, argv=0x7fffffffd4e8) at smg2000.c:520

Core was generated by `./smg2000 -n 40 40 40'.
Program terminated with signal 11, Segmentation fault.
[New process 32040]
[New process 32107]
#0 0x00002aaaabf7c215 in raise (sig=<value optimized out>) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
64        return INLINE_SYSCALL (tgkill, 3, pid, selftid, sig);
(gdb) where
#0 0x00002aaaabf7c215 in raise (sig=<value optimized out>) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
#1 0x00002aaaaaedd63c in monitor_signal_handler (sig=11, info=0x7fffffffad30, context=0x7fffffffac00) at signal.c:186
#2 <signal handler called>
#3 0x00002aaaac9b4729 in access_mem () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#4 0x00002aaaac9b25fd in dwarf_get () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#5 0x00002aaaac9b2458 in apply_reg_state () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#6 0x00002aaaac9b29a3 in _ULx86_64_dwarf_find_save_locs () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#7 0x00002aaaac9b35d5 in _ULx86_64_dwarf_step () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#8 0x00002aaaac9b54bd in _ULx86_64_step () from /g/g24/jeg/chaos_4_x86_64_ib/opt/OSS-mrnet/lib64/
#9 0x00002aaaaacceb1e in OpenSS_GetStackTraceFromContext (signal_context=0x7fffffffc200, skip_signal_frames=0, skip_frames=0, max_frames=100, stacktrace_size=0x7fffffffc01c, stacktrace=0x7fffffffbcf0) at OpenSS_GetStackTraceFromContext.c:180
#10 0x00002aaaaaccc242 in hwctimePAPIHandler (EventSet=0, address=0x2aaaabc0dae0, overflow_vector=1, context=0x7fffffffc200)
    at runtime.c:189
#11 0x00002aaaacbdd89b in _papi_hwi_dispatch_overflow_signal (papiContext=0x7fffffffc0e0, address=46912514349792, isHardware=0x7fffffffc0fc, overflow_bit=1, genOverflowBit=0, t=0x7fffffffc0f0) at extras.c:334
#12 0x00002aaaacbd3912 in _papi_hwd_dispatch_timer (signal=<value optimized out>, si=<value optimized out>,
    context=<value optimized out>) at linux.c:447
#13 0x00002aaaaaedd66c in monitor_signal_handler (sig=27, info=0x7fffffffc330, context=0x7fffffffc200) at signal.c:193
#14 <signal handler called>
#15 0x00002aaaabc0dae0 in _intel_fast_memcpy.A () from /usr/local/tools/icc-11.1.046/lib/
#16 0x00002aaaab761358 in smpi_net_lookup () at mpid_smpi.c:1405
#17 0x00002aaaab7609c8 in MPID_SMP_Check_incoming () at mpid_smpi.c:1360
#18 0x00002aaaab77d039 in MPID_DeviceCheck (blocking=24639728) at viacheck.c:289
#19 0x00002aaaab75e78f in MPID_RecvComplete (request=0x177f8f0, status=0x2aaaaebe2ae8, error_code=0x320) at mpid_recv.c:89
#20 0x00002aaaab78b8d6 in PMPI_Waitall (count=24639728, array_of_requests=0x2aaaaebe2ae8, array_of_statuses=0x320) at waitall.c:190
#21 0x000000000042e7cd in hypre_FinalizeCommunication (comm_handle=0x177f8f0) at communication.c:667
#22 0x000000000041f576 in hypre_FinalizeIndtComputations (comm_handle=0x177f8f0) at computation.c:402
#23 0x0000000000406f19 in hypre_SMGResidual (residual_vdata=0x177f8f0, A=0x2aaaaebe2ae8, x=0x320, b=0x2, r=0x13) at smg_residual.c:247
#24 0x0000000000404d4b in hypre_SMGRelax (relax_vdata=0x177f8f0, A=0x2aaaaebe2ae8, b=0x320, x=0x2) at smg_relax.c:322
#25 0x0000000000409129 in hypre_SMGSolve (smg_vdata=0x177f8f0, A=0x2aaaaebe2ae8, b=0x320, x=0x2) at smg_solve.c:279
#26 0x0000000000404d67 in hypre_SMGRelax (relax_vdata=0x177f8f0, A=0x2aaaaebe2ae8, b=0x320, x=0x2) at smg_relax.c:325
#27 0x0000000000408176 in hypre_SMGSetupInterpOp (relax_data=0x177f8f0, A=0x2aaaaebe2ae8, b=0x320, x=0x2, PT=0x13, cdir=0, cindex=0x7fffffffd0ac, findex=0x7fffffffd088, stride=0x7fffffffd0a0) at smg_setup_interp.c:200
#28 0x00000000004079ee in hypre_SMGSetup (smg_vdata=0x177f8f0, A=0x2aaaaebe2ae8, b=0x320, x=0x2) at smg_setup.c:337
#29 0x0000000000402f86 in HYPRE_StructSMGSetup (solver=0x177f8f0, A=0x2aaaaebe2ae8, b=0x320, x=0x2) at HYPRE_struct_smg.c:49
#30 0x0000000000402d71 in main (argc=5, argv=0x7fffffffd4d8) at smg2000.c:501

Jim G.

reply via email to

[Prev in Thread] Current Thread [Next in Thread]