Stephane, you should post any crash bugs, together with the stack trace, to the ceph-devel mailing list at ceph-de...@vger.kernel.org.
On Mon, Aug 12, 2013 at 9:02 AM, Stephane Boisvert < stephane.boisv...@gameloft.com> wrote: > Hi, > It seems my OSD processes keep crashing randomly and I don't know > why. It seems to happen when the cluster is trying to re-balance... In > normal usage I didn't notice any crash like that. > > We are running ceph 0.61.7 on an up to date ubuntu 12.04 (all packages > including kernel are current). > > > Anyone have an idea ? > > > TRACE: > > > ceph version 0.61.7 (8f010aff684e820ecc837c25ac77c7a05d7191ff) > 1: /usr/bin/ceph-osd() [0x79219a] > 2: (()+0xfcb0) [0x7fd692da1cb0] > 3: (gsignal()+0x35) [0x7fd69155a425] > 4: (abort()+0x17b) [0x7fd69155db8b] > 5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7fd691eac69d] > 6: (()+0xb5846) [0x7fd691eaa846] > 7: (()+0xb5873) [0x7fd691eaa873] > 8: (()+0xb596e) [0x7fd691eaa96e] > 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char > const*)+0x1df) [0x84303f] > 10: > (PG::RecoveryState::Recovered::Recovered(boost::statechart::state<PG::RecoveryState::Recovered, > PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na>, > (boost::statechart::history_mode)0>::my_context)+0x38f) [0x6d932f] > 11: (boost::statechart::state<PG::RecoveryState::Recovered, > PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na>, > (boost::statechart::history_mode)0>::shallow_construct(boost::intrusive_ptr<PG::RecoveryState::Active> > const&, > boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, > PG::RecoveryState::Initial, std::allocator<void>, > boost::statechart::null_exception_translator>&)+0x5c) [0x6f270c] > 12: 
(PG::RecoveryState::Recovering::react(PG::AllReplicasRecovered > const&)+0xb4) [0x6d9454] > 13: (boost::statechart::simple_state<PG::RecoveryState::Recovering, > PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na>, > (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base > const&, void const*)+0xda) [0x6f296a] > 14: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, > PG::RecoveryState::Initial, std::allocator<void>, > boost::statechart::null_exception_translator>::send_event(boost::statechart::event_base > const&)+0x5b) [0x6e320b] > 15: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, > PG::RecoveryState::Initial, std::allocator<void>, > boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base > const&)+0x11) [0x6e34e1] > 16: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, > PG::RecoveryCtx*)+0x347) [0x69aaf7] > 17: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > > const&, ThreadPool::TPHandle&)+0x2f5) [0x632fc5] > 18: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > > const&, ThreadPool::TPHandle&)+0x12) [0x66e2d2] > 19: (ThreadPool::worker(ThreadPool::WorkThread*)+0x4e6) [0x838476] > 20: (ThreadPool::WorkThread::entry()+0x10) [0x83a2a0] > 21: (()+0x7e9a) [0x7fd692d99e9a] > 22: (clone()+0x6d) [0x7fd691617ccd] > NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed > to interpret this. 
> > --- begin dump of recent events --- > -3> 2013-08-12 15:58:15.561005 7fd683d78700 1 -- > 10.136.48.18:6814/21240 <== osd.56 10.136.48.14:0/17437 44 ==== > osd_ping(ping e8959 stamp 2013-08-12 15:58:15.556022) v2 ==== 47+0+0 > (355096560 0 0) 0xc4e81c0 con 0x12fbeb00 > -2> 2013-08-12 15:58:15.561038 7fd683d78700 1 -- > 10.136.48.18:6814/21240 --> 10.136.48.14:0/17437 -- osd_ping(ping_reply > e8959 stamp 2013-08-12 15:58:15.556022) v2 -- ?+0 0x1683ec40 con 0x12fbeb00 > -1> 2013-08-12 15:58:15.568600 7fd67e56d700 1 -- > 10.136.48.18:6813/21240 --> osd.44 10.136.48.15:6820/25671 -- > osd_sub_op(osd.20.0:1293 25.328 > 699ac328/rbd_data.ae2732ae8944a.0000000000240828/head//25 [push] v 8424'11 > snapset=0=[]:[] snapc=0=[]) v7 -- ?+0 0x2df0f400 > 0> 2013-08-12 15:58:15.581608 7fd681d74700 -1 *** Caught signal > (Aborted) ** > in thread 7fd681d74700 > > ceph version 0.61.7 (8f010aff684e820ecc837c25ac77c7a05d7191ff) > 1: /usr/bin/ceph-osd() [0x79219a] > 2: (()+0xfcb0) [0x7fd692da1cb0] > 3: (gsignal()+0x35) [0x7fd69155a425] > 4: (abort()+0x17b) [0x7fd69155db8b] > 5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7fd691eac69d] > 6: (()+0xb5846) [0x7fd691eaa846] > 7: (()+0xb5873) [0x7fd691eaa873] > 8: (()+0xb596e) [0x7fd691eaa96e] > 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char > const*)+0x1df) [0x84303f] > 10: > (PG::RecoveryState::Recovered::Recovered(boost::statechart::state<PG::RecoveryState::Recovered, > PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na>, > (boost::statechart::history_mode)0>::my_context)+0x38f) [0x6d932f] > 11: (boost::statechart::state<PG::RecoveryState::Recovered, > PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, 
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na>, > (boost::statechart::history_mode)0>::shallow_construct(boost::intrusive_ptr<PG::RecoveryState::Active> > const&, > boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, > PG::RecoveryState::Initial, std::allocator<void>, > boost::statechart::null_exception_translator>&)+0x5c) [0x6f270c] > 12: (PG::RecoveryState::Recovering::react(PG::AllReplicasRecovered > const&)+0xb4) [0x6d9454] > 13: (boost::statechart::simple_state<PG::RecoveryState::Recovering, > PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, > mpl_::na, mpl_::na, mpl_::na>, > (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base > const&, void const*)+0xda) [0x6f296a] > 14: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, > PG::RecoveryState::Initial, std::allocator<void>, > boost::statechart::null_exception_translator>::send_event(boost::statechart::event_base > const&)+0x5b) [0x6e320b] > 15: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, > PG::RecoveryState::Initial, std::allocator<void>, > boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base > const&)+0x11) [0x6e34e1] > 16: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, > PG::RecoveryCtx*)+0x347) [0x69aaf7] > 17: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > > const&, ThreadPool::TPHandle&)+0x2f5) [0x632fc5] > 18: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > > const&, ThreadPool::TPHandle&)+0x12) [0x66e2d2] > 19: (ThreadPool::worker(ThreadPool::WorkThread*)+0x4e6) [0x838476] > 20: (ThreadPool::WorkThread::entry()+0x10) [0x83a2a0] > 21: (()+0x7e9a) [0x7fd692d99e9a] > 22: (clone()+0x6d) [0x7fd691617ccd] > NOTE: a copy of the 
executable, or `objdump -rdS <executable>` is needed > to interpret this. > > --- logging levels --- > 0/ 5 none > 0/ 1 lockdep > 0/ 1 context > 1/ 1 crush > 1/ 5 mds > 1/ 5 mds_balancer > 1/ 5 mds_locker > 1/ 5 mds_log > 1/ 5 mds_log_expire > 1/ 5 mds_migrator > 0/ 1 buffer > 0/ 1 timer > 0/ 1 filer > 0/ 1 striper > 0/ 1 objecter > 0/ 5 rados > 0/ 5 rbd > 0/ 5 journaler > 0/ 5 objectcacher > 0/ 5 client > 0/ 5 osd > 0/ 5 optracker > 0/ 5 objclass > 1/ 3 filestore > 1/ 3 journal > 0/ 5 ms > 1/ 5 mon > 0/10 monc > 0/ 5 paxos > 0/ 5 tp > 1/ 5 auth > 1/ 5 crypto > 1/ 1 finisher > 1/ 5 heartbeatmap > 1/ 5 perfcounter > 1/ 5 rgw > 1/ 5 hadoop > 1/ 5 javaclient > 1/ 5 asok > 1/ 1 throttle > -2/-2 (syslog threshold) > -1/-1 (stderr threshold) > max_recent 10000 > max_new 1000 > log_file /var/log/ceph/ceph-osd.20.log > --- end dump of recent events --- > > > > -- > *Stéphane Boisvert* GNS-Shop Technical Coordinator 5800 St-Denis > suite 1001 Montreal (QC), H2S 3L5 *MSN:* stephane.boisv...@gameloft.com > *E-mail:* stephane.boisv...@gameloft.com > > _______________________________________________ > ceph-users mailing list > ceph-users@lists.ceph.com > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com > > -- John Wilkins Senior Technical Writer Inktank john.wilk...@inktank.com (415) 425-9599 http://inktank.com
<<Inbox.jpg>>
_______________________________________________ ceph-users mailing list ceph-users@lists.ceph.com http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com