I have 30 or so OSDs (out of 240 in the cluster) that keep crashing. Below is 
the last part of one of the log files showing the crash. Can anyone please help 
me read it to figure out what is going on and how to correct it? When I start 
the OSDs they generally run for 5-30 minutes, and then one by one they start 
dropping out with logs similar to this.
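If it helps narrow things down, I can check which OSD logs contain the same
failed assertion with something like this (a rough sketch, assuming the default
/var/log/ceph/ceph-osd.<id>.log locations as in the log below; adjust the paths
if yours differ):

    # list the OSD logs that hit the same failed assertion
    grep -l 'FAILED ceph_assert(!need_attrs)' /var/log/ceph/ceph-osd.*.log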

Thanks.

   -29> 2020-02-04 06:00:23.447 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459328/459329 n=1 
ec=260267/6574 lis/c 459331/428950 les/c/f 459332/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 crt=443432'5 lcod 0'0 remapped NOTIFY mbc={}] exit 
Started/Stray 1.017145 6 0.000323
   -28> 2020-02-04 06:00:23.447 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459328/459329 n=1 
ec=260267/6574 lis/c 459331/428950 les/c/f 459332/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 crt=443432'5 lcod 0'0 remapped NOTIFY mbc={}] enter 
Started/ReplicaActive
   -27> 2020-02-04 06:00:23.447 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459328/459329 n=1 
ec=260267/6574 lis/c 459331/428950 les/c/f 459332/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 crt=443432'5 lcod 0'0 remapped NOTIFY mbc={}] enter 
Started/ReplicaActive/RepNotRecovering
   -26> 2020-02-04 06:00:23.455 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -25> 2020-02-04 06:00:23.455 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -24> 2020-02-04 06:00:23.455 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -23> 2020-02-04 06:00:23.459 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -22> 2020-02-04 06:00:23.459 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -21> 2020-02-04 06:00:23.459 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -20> 2020-02-04 06:00:23.459 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -19> 2020-02-04 06:00:23.463 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -18> 2020-02-04 06:00:23.463 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -17> 2020-02-04 06:00:23.471 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -16> 2020-02-04 06:00:23.471 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -15> 2020-02-04 06:00:23.471 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459334/459335 n=1 
ec=260267/6574 lis/c 459334/428950 les/c/f 459335/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 luod=0'0 crt=443432'5 lcod 0'0 active+remapped mbc={}] 
exit Started/ReplicaActive/RepNotRecovering 0.021923 2 0.000098
   -14> 2020-02-04 06:00:23.471 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459334/459335 n=1 
ec=260267/6574 lis/c 459334/428950 les/c/f 459335/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 luod=0'0 crt=443432'5 lcod 0'0 active+remapped mbc={}] 
enter Started/ReplicaActive/RepWaitRecoveryReserved
   -13> 2020-02-04 06:00:23.471 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459334/459335 n=1 
ec=260267/6574 lis/c 459334/428950 les/c/f 459335/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 luod=0'0 crt=443432'5 lcod 0'0 active+remapped mbc={}] 
exit Started/ReplicaActive/RepWaitRecoveryReserved 0.000137 1 0.000080
   -12> 2020-02-04 06:00:23.471 7fe300d41700  5 osd.168 pg_epoch: 459335 
pg[6.1217s2( v 443432'5 (0'0,443432'5] local-lis/les=459334/459335 n=1 
ec=260267/6574 lis/c 459334/428950 les/c/f 459335/440468/290442 
459333/459334/459294) 
[2147483647,107,168,2147483647,102]/[81,107,168,89,102]p81(0) r=2 lpr=459334 
pi=[428950,459334)/82 luod=0'0 crt=443432'5 lcod 0'0 active+remapped mbc={}] 
enter Started/ReplicaActive/RepRecovering
   -11> 2020-02-04 06:00:23.471 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
   -10> 2020-02-04 06:00:23.475 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
    -9> 2020-02-04 06:00:23.483 7fe300540700  5 osd.168 pg_epoch: 459335 
pg[6.1961s0( v 436281'1 (0'0,436281'1] local-lis/les=459334/459335 n=0 
ec=260267/6574 lis/c 459334/406589 les/c/f 459335/436403/290442 
459333/459334/459334) 
[168,2147483647,2147483647,196,151]/[168,116,60,196,151]p168(0) r=0 lpr=459334 
pi=[436277,459334)/33 crt=436281'1 lcod 0'0 mlcod 0'0 active+degraded+remapped 
m=30 mbc={0={(0+0)=30},1={(0+2)=30},2={(0+3)=30},3={(1+1)=30},4={(0+0)=30}}] 
exit Started/Primary/Active/Activating 0.034400 28 0.002950
    -8> 2020-02-04 06:00:23.483 7fe300540700  5 osd.168 pg_epoch: 459335 
pg[6.1961s0( v 436281'1 (0'0,436281'1] local-lis/les=459334/459335 n=0 
ec=260267/6574 lis/c 459334/406589 les/c/f 459335/436403/290442 
459333/459334/459334) 
[168,2147483647,2147483647,196,151]/[168,116,60,196,151]p168(0) r=0 lpr=459334 
pi=[436277,459334)/33 crt=436281'1 lcod 0'0 mlcod 0'0 active+degraded+remapped 
m=30 mbc={0={(0+0)=30},1={(0+2)=30},2={(0+3)=30},3={(1+1)=30},4={(0+0)=30}}] 
enter Started/Primary/Active/WaitLocalRecoveryReserved
    -7> 2020-02-04 06:00:23.483 7fe300540700  5 osd.168 pg_epoch: 459335 
pg[6.1961s0( v 436281'1 (0'0,436281'1] local-lis/les=459334/459335 n=0 
ec=260267/6574 lis/c 459334/406589 les/c/f 459335/436403/290442 
459333/459334/459334) 
[168,2147483647,2147483647,196,151]/[168,116,60,196,151]p168(0) r=0 lpr=459334 
pi=[436277,459334)/33 crt=436281'1 lcod 0'0 mlcod 0'0 
active+recovery_wait+degraded+remapped m=30 
mbc={0={(0+0)=30},1={(0+2)=30},2={(0+3)=30},3={(1+1)=30},4={(0+0)=30}}] exit 
Started/Primary/Active/WaitLocalRecoveryReserved 0.000213 1 0.000221
    -6> 2020-02-04 06:00:23.483 7fe300540700  5 osd.168 pg_epoch: 459335 
pg[6.1961s0( v 436281'1 (0'0,436281'1] local-lis/les=459334/459335 n=0 
ec=260267/6574 lis/c 459334/406589 les/c/f 459335/436403/290442 
459333/459334/459334) 
[168,2147483647,2147483647,196,151]/[168,116,60,196,151]p168(0) r=0 lpr=459334 
pi=[436277,459334)/33 crt=436281'1 lcod 0'0 mlcod 0'0 
active+recovery_wait+degraded+remapped m=30 
mbc={0={(0+0)=30},1={(0+2)=30},2={(0+3)=30},3={(1+1)=30},4={(0+0)=30}}] enter 
Started/Primary/Active/WaitRemoteRecoveryReserved
    -5> 2020-02-04 06:00:23.483 7fe300540700  5 osd.168 pg_epoch: 459335 
pg[6.1961s0( v 436281'1 (0'0,436281'1] local-lis/les=459334/459335 n=0 
ec=260267/6574 lis/c 459334/406589 les/c/f 459335/436403/290442 
459333/459334/459334) 
[168,2147483647,2147483647,196,151]/[168,116,60,196,151]p168(0) r=0 lpr=459334 
pi=[436277,459334)/33 crt=436281'1 lcod 0'0 mlcod 0'0 
active+recovery_wait+degraded+remapped m=30 
mbc={0={(0+0)=30},1={(0+2)=30},2={(0+3)=30},3={(1+1)=30},4={(0+0)=30}}] exit 
Started/Primary/Active/WaitRemoteRecoveryReserved 0.002796 5 0.000231
    -4> 2020-02-04 06:00:23.483 7fe300540700  5 osd.168 pg_epoch: 459335 
pg[6.1961s0( v 436281'1 (0'0,436281'1] local-lis/les=459334/459335 n=0 
ec=260267/6574 lis/c 459334/406589 les/c/f 459335/436403/290442 
459333/459334/459334) 
[168,2147483647,2147483647,196,151]/[168,116,60,196,151]p168(0) r=0 lpr=459334 
pi=[436277,459334)/33 crt=436281'1 lcod 0'0 mlcod 0'0 
active+recovery_wait+degraded+remapped m=30 
mbc={0={(0+0)=30},1={(0+2)=30},2={(0+3)=30},3={(1+1)=30},4={(0+0)=30}}] enter 
Started/Primary/Active/Recovering
    -3> 2020-02-04 06:00:23.491 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
    -2> 2020-02-04 06:00:23.519 7fe309d53700  3 osd.168 459335 handle_osd_map 
epochs [459335,459335], i have 459335, src has [403399,459335]
    -1> 2020-02-04 06:00:23.779 7fe300540700 -1 
/build/ceph-14.2.7/src/osd/ECBackend.cc: In function 'void 
ECBackend::do_read_op(ECBackend::ReadOp&)' thread 7fe300540700 time 2020-02-04 
06:00:23.774430
/build/ceph-14.2.7/src/osd/ECBackend.cc: 1742: FAILED ceph_assert(!need_attrs)

 ceph version 14.2.7 (3d58626ebeec02d8385a4cefb92c6cbc3a45bfe8) nautilus 
(stable)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char 
const*)+0x152) [0x5579b9ecce4c]
 2: (ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char 
const*, ...)+0) [0x5579b9ecd027]
 3: (ECBackend::do_read_op(ECBackend::ReadOp&)+0xf76) [0x5579ba295106]
 4: (ECBackend::send_all_remaining_reads(hobject_t const&, 
ECBackend::ReadOp&)+0x4d1) [0x5579ba2a6cf1]
 5: (ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, 
RecoveryMessages*, ZTracer::Trace const&)+0xcf6) [0x5579ba2a7f26]
 6: (ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0xbc) 
[0x5579ba2a8a8c]
 7: (PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x97) 
[0x5579ba17f757]
 8: (PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, 
ThreadPool::TPHandle&)+0x705) [0x5579ba12dee5]
 9: (OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, 
ThreadPool::TPHandle&)+0x1bf) [0x5579b9f4ff2f]
 10: (PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, 
ThreadPool::TPHandle&)+0x62) [0x5579ba206e82]
 11: (OSD::ShardedOpWQ::_process(unsigned int, 
ceph::heartbeat_handle_d*)+0xbf5) [0x5579b9f6ea05]
 12: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x4ac) 
[0x5579ba5714cc]
 13: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x5579ba574690]
 14: (()+0x7fa3) [0x7fe31ff5cfa3]
 15: (clone()+0x3f) [0x7fe31fb0c4cf]

     0> 2020-02-04 06:00:23.787 7fe300540700 -1 *** Caught signal (Aborted) **
 in thread 7fe300540700 thread_name:tp_osd_tp

 ceph version 14.2.7 (3d58626ebeec02d8385a4cefb92c6cbc3a45bfe8) nautilus 
(stable)
 1: (()+0x12730) [0x7fe31ff67730]
 2: (gsignal()+0x10b) [0x7fe31fa4a7bb]
 3: (abort()+0x121) [0x7fe31fa35535]
 4: (ceph::__ceph_assert_fail(char const*, char const*, int, char 
const*)+0x1a3) [0x5579b9ecce9d]
 5: (ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char 
const*, ...)+0) [0x5579b9ecd027]
 6: (ECBackend::do_read_op(ECBackend::ReadOp&)+0xf76) [0x5579ba295106]
 7: (ECBackend::send_all_remaining_reads(hobject_t const&, 
ECBackend::ReadOp&)+0x4d1) [0x5579ba2a6cf1]
 8: (ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, 
RecoveryMessages*, ZTracer::Trace const&)+0xcf6) [0x5579ba2a7f26]
 9: (ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0xbc) 
[0x5579ba2a8a8c]
 10: (PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x97) 
[0x5579ba17f757]
 11: (PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, 
ThreadPool::TPHandle&)+0x705) [0x5579ba12dee5]
 12: (OSD::dequeue_op(boost::intrusive_ptr<PG>, 
boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1bf) [0x5579b9f4ff2f]
 13: (PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, 
ThreadPool::TPHandle&)+0x62) [0x5579ba206e82]
 14: (OSD::ShardedOpWQ::_process(unsigned int, 
ceph::heartbeat_handle_d*)+0xbf5) [0x5579b9f6ea05]
 15: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x4ac) 
[0x5579ba5714cc]
 16: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x5579ba574690]
 17: (()+0x7fa3) [0x7fe31ff5cfa3]
 18: (clone()+0x3f) [0x7fe31fb0c4cf]
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to 
interpret this.

--- logging levels ---
   0/ 5 none
   0/ 1 lockdep
   0/ 1 context
   1/ 1 crush
   1/ 5 mds
   1/ 5 mds_balancer
   1/ 5 mds_locker
   1/ 5 mds_log
   1/ 5 mds_log_expire
   1/ 5 mds_migrator
   0/ 1 buffer
   0/ 1 timer
   0/ 1 filer
   0/ 1 striper
   0/ 1 objecter
   0/ 5 rados
   0/ 5 rbd
   0/ 5 rbd_mirror
   0/ 5 rbd_replay
   0/ 5 journaler
   0/ 5 objectcacher
   0/ 5 client
   1/ 5 osd
   0/ 5 optracker
   0/ 5 objclass
   1/ 3 filestore
   1/ 3 journal
   0/ 0 ms
   1/ 5 mon
   0/10 monc
   1/ 5 paxos
   0/ 5 tp
   1/ 5 auth
   1/ 5 crypto
   1/ 1 finisher
   1/ 1 reserver
   1/ 5 heartbeatmap
   1/ 5 perfcounter
   1/ 5 rgw
   1/ 5 rgw_sync
   1/10 civetweb
   1/ 5 javaclient
   1/ 5 asok
   1/ 1 throttle
   0/ 0 refs
   1/ 5 xio
   1/ 5 compressor
   1/ 5 bluestore
   1/ 5 bluefs
   1/ 3 bdev
   1/ 5 kstore
   4/ 5 rocksdb
   4/ 5 leveldb
   4/ 5 memdb
   1/ 5 kinetic
   1/ 5 fuse
   1/ 5 mgr
   1/ 5 mgrc
   1/ 5 dpdk
   1/ 5 eventtrace
   1/ 5 prioritycache
  -2/-2 (syslog threshold)
  -1/-1 (stderr threshold)
  max_recent     10000
  max_new         1000
  log_file /var/log/ceph/ceph-osd.168.log
--- end dump of recent events ---

-- ray

