Once backfilling finished, the cluster was extremely slow and most OSD logs were full of heartbeat_map errors. When an OSD restarts, it causes a cascade of other OSDs restarting as well. The logs look like this:

 -3> 2019-07-10 18:34:50.046 7f34abf5b700 -1 osd.69 1348581 get_health_metrics reporting 21 slow ops, oldest is osd_op(client.115295041.0:17575966 15.c37fa482 15.c37fa482 (undecoded) ack+ondisk+write+known_if_redirected e1348522)
 -2> 2019-07-10 18:34:50.967 7f34acf5d700 1 heartbeat_map is_healthy 'OSD::osd_op_tp thread 0x7f3493f2b700' had timed out after 90
 -1> 2019-07-10 18:34:50.967 7f34acf5d700 1 heartbeat_map is_healthy 'OSD::osd_op_tp thread 0x7f3493f2b700' had suicide timed out after 150
  0> 2019-07-10 18:34:51.025 7f3493f2b700 -1 *** Caught signal (Aborted) **
 in thread 7f3493f2b700 thread_name:tp_osd_tp
 ceph version 14.2.1 (d555a9489eb35f84f2e1ef49b77e19da9d113972) nautilus (stable)
 1: (()+0xf5d0) [0x7f34b57c25d0]
 2: (pread64()+0x33) [0x7f34b57c1f63]
 3: (KernelDevice::read_random(unsigned long, unsigned long, char*, bool)+0x238) [0x55bfdae5a448]
 4: (BlueFS::_read_random(BlueFS::FileReader*, unsigned long, unsigned long, char*)+0xca) [0x55bfdae1271a]
 5: (BlueRocksRandomAccessFile::Read(unsigned long, unsigned long, rocksdb::Slice*, char*) const+0x20) [0x55bfdae3b440]
 6: (rocksdb::RandomAccessFileReader::Read(unsigned long, unsigned long, rocksdb::Slice*, char*) const+0x960) [0x55bfdb466ba0]
 7: (rocksdb::BlockFetcher::ReadBlockContents()+0x3e7) [0x55bfdb420c27]
 8: (()+0x11146a4) [0x55bfdb40d6a4]
 9: (rocksdb::BlockBasedTable::MaybeLoadDataBlockToCache(rocksdb::FilePrefetchBuffer*, rocksdb::BlockBasedTable::Rep*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::Slice, rocksdb::BlockBasedTable::CachableEntry<rocksdb::Block>*, bool, rocksdb::GetContext*)+0x2cc) [0x55bfdb40f63c]
 10: (rocksdb::DataBlockIter* rocksdb::BlockBasedTable::NewDataBlockIterator<rocksdb::DataBlockIter>(rocksdb::BlockBasedTable::Rep*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::DataBlockIter*, bool, bool, bool, rocksdb::GetContext*, rocksdb::Status, rocksdb::FilePrefetchBuffer*)+0x169) [0x55bfdb41cb29]
 11: (rocksdb::BlockBasedTableIterator<rocksdb::DataBlockIter, rocksdb::Slice>::InitDataBlock()+0xc8) [0x55bfdb41e588]
 12: (rocksdb::BlockBasedTableIterator<rocksdb::DataBlockIter, rocksdb::Slice>::FindKeyForward()+0x8d) [0x55bfdb41e89d]
 13: (()+0x10adde9) [0x55bfdb3a6de9]
 14: (rocksdb::MergingIterator::Next()+0x44) [0x55bfdb4357c4]
 15: (rocksdb::DBIter::FindNextUserEntryInternal(bool, bool)+0x762) [0x55bfdb32a092]
 16: (rocksdb::DBIter::Next()+0x1d6) [0x55bfdb32b6c6]
 17: (RocksDBStore::RocksDBWholeSpaceIteratorImpl::next()+0x2d) [0x55bfdad9fa8d]
 18: (BlueStore::_collection_list(BlueStore::Collection*, ghobject_t const&, ghobject_t const&, int, std::vector<ghobject_t, std::allocator<ghobject_t> >*, ghobject_t*)+0xdf6) [0x55bfdad12466]
 19: (BlueStore::collection_list(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ghobject_t const&, ghobject_t const&, int, std::vector<ghobject_t, std::allocator<ghobject_t> >*, ghobject_t*)+0x9b) [0x55bfdad1393b]
 20: (PG::_delete_some(ObjectStore::Transaction*)+0x1e0) [0x55bfda984120]
 21: (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38) [0x55bfda985598]
 22: (boost::statechart::simple_state<PG::RecoveryState::Deleting, PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x16a) [0x55bfda9c45ca]
 23: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x5a) [0x55bfda9a20ca]
 24: (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PG::RecoveryCtx*)+0x119) [0x55bfda991389]
 25: (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x1b4) [0x55bfda8cb3c4]
 26: (OSD::dequeue_delete(OSDShard*, PG*, unsigned int, ThreadPool::TPHandle&)+0x234) [0x55bfda8cb804]
 27: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x9f4) [0x55bfda8bfb44]
 28: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433) [0x55bfdaeb9e93]
 29: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55bfdaebcf30]
 30: (()+0x7dd5) [0x7f34b57badd5]
 31: (clone()+0x6d) [0x7f34b4680ead]
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.

--- logging levels ---
0/ 5 none 0/ 1 lockdep 0/ 1 context 1/ 1 crush 1/ 5 mds 1/ 5 mds_balancer 1/ 5 mds_locker 1/ 5 mds_log 1/ 5 mds_log_expire 1/ 5 mds_migrator 0/ 1 buffer 0/ 1 timer 0/ 1 filer 0/ 1 striper 0/ 1 objecter 0/ 5 rados 0/ 5 rbd 0/ 5 rbd_mirror 0/ 5 rbd_replay 0/ 5 journaler 0/ 5 objectcacher 0/ 5 client 1/ 5 osd 0/ 5 optracker 0/ 5 objclass 1/ 3 filestore 1/ 3 journal 0/ 0 ms 1/ 5 mon 0/10 monc 1/ 5 paxos 0/ 5 tp 1/ 5 auth 1/ 5 crypto 1/ 1 finisher 1/ 1 reserver 1/ 5 heartbeatmap 1/ 5 perfcounter 1/ 5 rgw 1/ 5 rgw_sync 1/10 civetweb 1/ 5 javaclient 1/ 5 asok 1/ 1 throttle 0/ 0 refs 1/ 5 xio 1/ 5 compressor 1/ 5 bluestore 1/ 5 bluefs 1/ 3 bdev 1/ 5 kstore 4/ 5 rocksdb 4/ 5 leveldb 4/ 5 memdb 1/ 5 kinetic 1/ 5 fuse 1/ 5 mgr 1/ 5 mgrc 1/ 5 dpdk 1/ 5 eventtrace
-2/-2 (syslog threshold) -1/-1 (stderr threshold) max_recent 10000 max_new 1000
log_file /var/log/ceph/ceph-osd.69.log
--- end dump of recent events ---

On Tue, Jul 9, 2019 at 1:38 PM Igor Fedotov <ifedo...@suse.de> wrote:

> This will cap single BlueFS space allocation. Currently it attempts to allocate 70GB, which seems to overflow some 32-bit length fields. With the adjustment such an allocation should be capped at ~700MB.
>
> I doubt there is any relation between this specific failure and the pool, at least at the moment.
>
> In short the history is: the starting OSD tries to flush BlueFS data to disk, detects a lack of space and asks the main device for more - the allocation succeeds, but the returned extent has its length field set to 0.
>
> On 7/9/2019 8:33 PM, Brett Chancellor wrote:
>
> What does bluestore_bluefs_gift_ratio do? I can't find any documentation on it. Also, do you think this could be related to the .rgw.meta pool having too many objects per PG? The disks that die always seem to be backfilling a PG from that pool, and they have ~550k objects per PG.
>
> -Brett
>
> On Tue, Jul 9, 2019 at 1:03 PM Igor Fedotov <ifedo...@suse.de> wrote:
>
>> Please try to set bluestore_bluefs_gift_ratio to 0.0002.
>>
>> On 7/9/2019 7:39 PM, Brett Chancellor wrote:
>>
>> Too large for pastebin. The problem is continually crashing new OSDs. Here is the latest one.
>>
>> On Tue, Jul 9, 2019 at 11:46 AM Igor Fedotov <ifedo...@suse.de> wrote:
>>
>>> Could you please set debug bluestore to 20 and collect a startup log for this specific OSD once again?
>>>
>>> On 7/9/2019 6:29 PM, Brett Chancellor wrote:
>>>
>>> I restarted most of the OSDs with the stupid allocator (6 of them wouldn't start unless the bitmap allocator was set), but I'm still seeing issues with OSDs crashing. Interestingly, it seems that the dying OSDs are always working on a PG from the .rgw.meta pool when they crash.
>>>
>>> Log: https://pastebin.com/yuJKcPvX
>>>
>>> On Tue, Jul 9, 2019 at 5:14 AM Igor Fedotov <ifedo...@suse.de> wrote:
>>>
>>>> Hi Brett,
>>>>
>>>> in Nautilus you can do that via
>>>>
>>>> ceph config set osd.N bluestore_allocator stupid
>>>> ceph config set osd.N bluefs_allocator stupid
>>>>
>>>> See https://ceph.com/community/new-mimic-centralized-configuration-management/ for more details on the new way of setting configuration options.
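(For reference, a sketch of applying the same changes cluster-wide rather than per OSD id, using the Nautilus centralized config; this assumes the daemon class "osd" is accepted as a config target, and the allocator options are only read at startup, so each OSD still needs a restart afterwards:

    # apply to every OSD instead of a single osd.N
    ceph config set osd bluestore_allocator stupid
    ceph config set osd bluefs_allocator stupid
    # cap BlueFS gifts as suggested earlier in the thread; assuming the default
    # ratio of 0.02, 0.0002 is a 100x reduction, which matches the 70GB -> ~700MB
    # figures quoted above
    ceph config set osd bluestore_bluefs_gift_ratio 0.0002

The resulting values can be checked with "ceph config dump" before restarting the OSDs.)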
>>>>
>>>> A known issue with the Stupid allocator is a gradual write request latency increase (occurring within several days after an OSD restart). Seldom observed, though. There were some posts about that behavior on the mailing list this year.
>>>>
>>>> Thanks,
>>>>
>>>> Igor.
>>>>
>>>> On 7/8/2019 8:33 PM, Brett Chancellor wrote:
>>>>
>>>> I'll give that a try. Is it something like...
>>>> ceph tell 'osd.*' bluestore_allocator stupid
>>>> ceph tell 'osd.*' bluefs_allocator stupid
>>>>
>>>> And should I expect any issues doing this?
>>>>
>>>> On Mon, Jul 8, 2019 at 1:04 PM Igor Fedotov <ifedo...@suse.de> wrote:
>>>>
>>>>> I should have read the call stack more carefully... It's not about lacking free space - this is rather the bug from this ticket:
>>>>>
>>>>> http://tracker.ceph.com/issues/40080
>>>>>
>>>>> You should upgrade to v14.2.2 (once it's available) or temporarily switch to the stupid allocator as a workaround.
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Igor
>>>>>
>>>>> On 7/8/2019 8:00 PM, Igor Fedotov wrote:
>>>>>
>>>>> Hi Brett,
>>>>>
>>>>> looks like BlueStore is unable to allocate additional space for BlueFS at the main device. It's either lacking free space or it's too fragmented...
>>>>>
>>>>> Would you share the osd log, please?
>>>>>
>>>>> Also please run "ceph-bluestore-tool --path <substitute with path-to-osd!!!> bluefs-bdev-sizes" and share the output.
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Igor
>>>>>
>>>>> On 7/3/2019 9:59 PM, Brett Chancellor wrote:
>>>>>
>>>>> Hi All! Today I've had 3 OSDs stop themselves, and they are unable to restart, all with the same error. These OSDs are all on different hosts. All are running 14.2.1.
>>>>>
>>>>> I did try the following two commands:
>>>>> - ceph-kvstore-tool bluestore-kv /var/lib/ceph/osd/ceph-80 list > keys
>>>>>   ## This failed with the same error below
>>>>> - ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-80 fsck
>>>>>   ## After a couple of hours returned...
>>>>> 2019-07-03 18:30:02.095 7fe7c1c1ef00 -1 bluestore(/var/lib/ceph/osd/ceph-80) fsck warning: legacy statfs record found, suggest to run store repair to get consistent statistic reports
>>>>> fsck success
>>>>>
>>>>> ## Error when trying to start one of the OSDs
>>>>> -12> 2019-07-03 18:36:57.450 7f5e42366700 -1 *** Caught signal (Aborted) **
>>>>> in thread 7f5e42366700 thread_name:rocksdb:low0
>>>>>
>>>>> ceph version 14.2.1 (d555a9489eb35f84f2e1ef49b77e19da9d113972) nautilus (stable)
>>>>> 1: (()+0xf5d0) [0x7f5e50bd75d0]
>>>>> 2: (gsignal()+0x37) [0x7f5e4f9ce207]
>>>>> 3: (abort()+0x148) [0x7f5e4f9cf8f8]
>>>>> 4: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x55a7aaee96ab]
>>>>> 5: (ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char const*, ...)+0) [0x55a7aaee982a]
>>>>> 6: (interval_set<unsigned long, std::map<unsigned long, unsigned long, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, unsigned long> > > >::insert(unsigned long, unsigned long, unsigned long*, unsigned long*)+0x3c6) [0x55a7ab212a66]
>>>>> 7: (BlueStore::allocate_bluefs_freespace(unsigned long, unsigned long, std::vector<bluestore_pextent_t, mempool::pool_allocator<(mempool::pool_index_t)4, bluestore_pextent_t> >*)+0x74e) [0x55a7ab48253e]
>>>>> 8: (BlueFS::_expand_slow_device(unsigned long, std::vector<bluestore_pextent_t, mempool::pool_allocator<(mempool::pool_index_t)4, bluestore_pextent_t> >&)+0x111) [0x55a7ab59e921]
>>>>> 9: (BlueFS::_allocate(unsigned char, unsigned long, bluefs_fnode_t*)+0x68b) [0x55a7ab59f68b]
>>>>> 10: (BlueFS::_flush_range(BlueFS::FileWriter*, unsigned long, unsigned long)+0xe5) [0x55a7ab59fce5]
>>>>> 11: (BlueFS::_flush(BlueFS::FileWriter*, bool)+0x10b) [0x55a7ab5a1b4b]
>>>>> 12: (BlueRocksWritableFile::Flush()+0x3d) [0x55a7ab5bf84d]
>>>>> 13: (rocksdb::WritableFileWriter::Flush()+0x19e) [0x55a7abbedd0e]
>>>>> 14: (rocksdb::WritableFileWriter::Sync(bool)+0x2e) [0x55a7abbedfee]
>>>>> 15: (rocksdb::CompactionJob::FinishCompactionOutputFile(rocksdb::Status const&, rocksdb::CompactionJob::SubcompactionState*, rocksdb::RangeDelAggregator*, CompactionIterationStats*, rocksdb::Slice const*)+0xbaa) [0x55a7abc3b73a]
>>>>> 16: (rocksdb::CompactionJob::ProcessKeyValueCompaction(rocksdb::CompactionJob::SubcompactionState*)+0x7d0) [0x55a7abc3f150]
>>>>> 17: (rocksdb::CompactionJob::Run()+0x298) [0x55a7abc40618]
>>>>> 18: (rocksdb::DBImpl::BackgroundCompaction(bool*, rocksdb::JobContext*, rocksdb::LogBuffer*, rocksdb::DBImpl::PrepickedCompaction*)+0xcb7) [0x55a7aba7fb67]
>>>>> 19: (rocksdb::DBImpl::BackgroundCallCompaction(rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority)+0xd0) [0x55a7aba813c0]
>>>>> 20: (rocksdb::DBImpl::BGWorkCompaction(void*)+0x3a) [0x55a7aba8190a]
>>>>> 21: (rocksdb::ThreadPoolImpl::Impl::BGThread(unsigned long)+0x264) [0x55a7abc8d9c4]
>>>>> 22: (rocksdb::ThreadPoolImpl::Impl::BGThreadWrapper(void*)+0x4f) [0x55a7abc8db4f]
>>>>> 23: (()+0x129dfff) [0x55a7abd1afff]
>>>>> 24: (()+0x7dd5) [0x7f5e50bcfdd5]
>>>>> 25: (clone()+0x6d) [0x7f5e4fa95ead]
>>>>> NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
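(For reference, a sketch of the two diagnostics touched on above, assuming the OSD data directory is /var/lib/ceph/osd/ceph-80 and the OSD daemon is stopped while the tool runs:

    # report how much space BlueFS currently owns on each device, as requested by Igor above
    ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-80 bluefs-bdev-sizes
    # rewrite the legacy statfs record that fsck warned about; this only cleans up the
    # statistics warning and is unrelated to the abort itself
    ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-80 repair

Per the replies above, the abort in BlueStore::allocate_bluefs_freespace is the allocation bug tracked at http://tracker.ceph.com/issues/40080.)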
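(Regarding the earlier question about the .rgw.meta pool carrying ~550k objects per PG, a quick sanity check is to compare the pool's total object count against its PG count; a sketch, with output formatting varying slightly by release:

    # total object count for the pool
    ceph df detail | grep rgw.meta
    # number of placement groups in the pool
    ceph osd pool get .rgw.meta pg_num

Dividing the object count by pg_num reproduces the per-PG figure quoted above.)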
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com