Glad you got it working and thanks for the logs! Looks like we've seen
this once or twice before so I added them to
https://tracker.ceph.com/issues/38724.
-Greg

On Fri, Apr 26, 2019 at 5:52 PM Elise Burke <[email protected]> wrote:
>
> Thanks for the pointer to ceph-objectstore-tool; it turns out that exporting 
> and removing the PG from all three disks was enough to make them boot! I've 
> exported the three copies of the bad PG, let me know if you'd like me to 
> upload them anywhere for inspection.
>
> All data has been recovered (since I was originally removing the pool that 
> contained pg 25.0 anyway) and all systems are go on my end. Ceph's 
> architecture is very solid.
>
>
> $ ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-6 --op 
> export-remove --pgid 25.0 --file pg_25_0_from_osd_6.bin
> Exporting 25.0 info 25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) 
> local-lis/les=7488/7489 n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 
> 7593/7593/7593)
> Export successful
>  marking collection for removal
> setting '_remove' omap key
> finish_remove_pgs 25.0_head removing 25.0
> Remove successful
>
>
> On Fri, Apr 26, 2019 at 8:33 PM Elise Burke <[email protected]> wrote:
>>
>> Using ceph-objectstore-tool's info op on PG 25.0 (which indeed was the one I 
>> remember having the error) shows this:
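>> (For reference, the command would have been along the lines of
>> `ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-6 --op info --pgid 25.0`;
>> the --data-path here is a guess, any of the three affected disks would do.)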
>>
>> struct_v 10
>> {
>>     "pgid": "25.0",
>>     "last_update": "7592'106",
>>     "last_complete": "7592'106",
>>     "log_tail": "0'0",
>>     "last_user_version": 106,
>>     "last_backfill": "MIN",
>>     "last_backfill_bitwise": 1,
>>     "purged_snaps": [],
>>     "history": {
>>         "epoch_created": 5191,
>>         "epoch_pool_created": 5191,
>>         "last_epoch_started": 7489,
>>         "last_interval_started": 7488,
>>         "last_epoch_clean": 7489,
>>         "last_interval_clean": 7488,
>>         "last_epoch_split": 0,
>>         "last_epoch_marked_full": 0,
>>         "same_up_since": 7593,
>>         "same_interval_since": 7593,
>>         "same_primary_since": 7593,
>>         "last_scrub": "7592'106",
>>         "last_scrub_stamp": "2019-04-25 21:34:52.079721",
>>         "last_deep_scrub": "7485'70",
>>         "last_deep_scrub_stamp": "2019-04-22 10:15:40.532014",
>>         "last_clean_scrub_stamp": "2019-04-19 14:52:44.047548"
>>     },
>>     "stats": {
>>         "version": "7592'105",
>>         "reported_seq": "2621",
>>         "reported_epoch": "7592",
>>         "state": "active+clean+inconsistent",
>>         "last_fresh": "2019-04-25 20:02:55.620028",
>>         "last_change": "2019-04-24 19:52:45.072473",
>>         "last_active": "2019-04-25 20:02:55.620028",
>>         "last_peered": "2019-04-25 20:02:55.620028",
>>         "last_clean": "2019-04-25 20:02:55.620028",
>>         "last_became_active": "2019-04-22 17:55:37.578239",
>>         "last_became_peered": "2019-04-22 17:55:37.578239",
>>         "last_unstale": "2019-04-25 20:02:55.620028",
>>         "last_undegraded": "2019-04-25 20:02:55.620028",
>>         "last_fullsized": "2019-04-25 20:02:55.620028",
>>         "mapping_epoch": 7593,
>>         "log_start": "0'0",
>>         "ondisk_log_start": "0'0",
>>         "created": 5191,
>>         "last_epoch_clean": 7489,
>>         "parent": "0.0",
>>         "parent_split_bits": 0,
>>         "last_scrub": "7592'88",
>>         "last_scrub_stamp": "2019-04-24 19:52:45.072367",
>>         "last_deep_scrub": "7485'70",
>>         "last_deep_scrub_stamp": "2019-04-22 10:15:40.532014",
>>         "last_clean_scrub_stamp": "2019-04-19 14:52:44.047548",
>>         "log_size": 105,
>>         "ondisk_log_size": 105,
>>         "stats_invalid": false,
>>         "dirty_stats_invalid": false,
>>         "omap_stats_invalid": false,
>>         "hitset_stats_invalid": false,
>>         "hitset_bytes_stats_invalid": false,
>>         "pin_stats_invalid": false,
>>         "manifest_stats_invalid": false,
>>         "snaptrimq_len": 0,
>>         "stat_sum": {
>>             "num_bytes": 0,
>>             "num_objects": 9,
>>             "num_object_clones": 0,
>>             "num_object_copies": 27,
>>             "num_objects_missing_on_primary": 0,
>>             "num_objects_missing": 0,
>>             "num_objects_degraded": 0,
>>             "num_objects_misplaced": 0,
>>             "num_objects_unfound": 0,
>>             "num_objects_dirty": 9,
>>             "num_whiteouts": 0,
>>             "num_read": 87,
>>             "num_read_kb": 87,
>>             "num_write": 98,
>>             "num_write_kb": 98,
>>             "num_scrub_errors": 0,
>>             "num_shallow_scrub_errors": 0,
>>             "num_deep_scrub_errors": 0,
>>             "num_objects_recovered": 0,
>>             "num_bytes_recovered": 0,
>>             "num_keys_recovered": 0,
>>             "num_objects_omap": 9,
>>             "num_objects_hit_set_archive": 0,
>>             "num_bytes_hit_set_archive": 0,
>>             "num_flush": 0,
>>             "num_flush_kb": 0,
>>             "num_evict": 0,
>>             "num_evict_kb": 0,
>>             "num_promote": 0,
>>             "num_flush_mode_high": 0,
>>             "num_flush_mode_low": 0,
>>             "num_evict_mode_some": 0,
>>             "num_evict_mode_full": 0,
>>             "num_objects_pinned": 0,
>>             "num_legacy_snapsets": 0,
>>             "num_large_omap_objects": 0,
>>             "num_objects_manifest": 0,
>>             "num_omap_bytes": 0,
>>             "num_omap_keys": 0
>>         },
>>         "up": [],
>>         "acting": [],
>>         "blocked_by": [],
>>         "up_primary": -1,
>>         "acting_primary": -1,
>>         "purged_snaps": []
>>     },
>>     "empty": 0,
>>     "dne": 0,
>>     "incomplete": 1,
>>     "last_epoch_started": 7489,
>>     "hit_set_history": {
>>         "current_last_update": "0'0",
>>         "history": []
>>     }
>> }
>>
>>
>> On Fri, Apr 26, 2019 at 7:35 PM Elise Burke <[email protected]> wrote:
>>>
>>> Thanks for the suggestions. I've uploaded the surprisingly large (1.5G!) 
>>> log file: ceph-post-file: 2d8d22f4-580b-4b57-a13a-f49dade34ba7
>>>
>>> Looks like these are the relevant lines:
>>>
>>>    -52> 2019-04-26 19:23:05.190 7fb2657dc700 20 osd.2 op_wq(2) _process 
>>> empty q, waiting
>>>    -51> 2019-04-26 19:23:05.190 7fb2667de700 20 osd.2 op_wq(0) _process 
>>> OpQueueItem(25.0 PGDelete(25.0 e7648) prio 5 cost 1048576 e7648) queued
>>>    -50> 2019-04-26 19:23:05.190 7fb2717f4700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _kv_finalize_thread sleep
>>>    -49> 2019-04-26 19:23:05.190 7fb2667de700 20 osd.2 op_wq(0) _process 
>>> 25.0 to_process <OpQueueItem(25.0 PGDelete(25.0 e7648) prio 5 cost 1048576 
>>> e7648)> waiting <> waiting_peering {}
>>>    -48> 2019-04-26 19:23:05.190 7fb2667de700 20 osd.2 op_wq(0) _process 
>>> OpQueueItem(25.0 PGDelete(25.0 e7648) prio 5 cost 1048576 e7648) pg 
>>> 0x7c20a00000
>>>    -47> 2019-04-26 19:23:05.190 7fb2667de700 10 osd.2 pg_epoch: 7648 
>>> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489 
>>> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) [] 
>>> r=-1 lpr=7648 DELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}] 
>>> do_peering_event: epoch_sent: 7648 epoch_requested: 7648 DeleteSome
>>>    -46> 2019-04-26 19:23:05.190 7fb2667de700 10 osd.2 pg_epoch: 7648 
>>> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489 
>>> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) [] 
>>> r=-1 lpr=7648 DELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}] 
>>> _delete_some
>>>    -45> 2019-04-26 19:23:05.190 7fb2667de700 15 
>>> bluestore(/var/lib/ceph/osd/ceph-2) collection_list 25.0_head start GHMIN 
>>> end GHMAX max 30
>>>    -44> 2019-04-26 19:23:05.190 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list range 
>>> 0x7f7fffffffffffffe500000000 to 0x7f7fffffffffffffe5ffffffff and 
>>> 0x7f800000000000001900000000 to 0x7f8000000000000019ffffffff
>>>  start GHMIN
>>>    -43> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list pend 
>>> 0x7f7fffffffffffffe5ffffffff
>>>    -42> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key 
>>> 0x7f7fffffffffffffff100020'P!osdmap.7114!='0x0000000000000000ffffffffffffffff'o'
>>>  >= GHMAX
>>>    -41> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid 
>>> #25:00000000::::head# end GHMAX
>>>    -40> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid 
>>> #25:b08b92bd::::head# end GHMAX
>>>    -39> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key 
>>> 0x80800000000000000c1c00000021213dfffffffffffffffeffffffffffffffff'o' >= 
>>> GHMAX
>>>    -38> 2019-04-26 19:23:05.191 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) collection_list 25.0_head start GHMAX 
>>> end GHMAX max 30 = 0, ls.size() = 2, next = GHMAX
>>>    -37> 2019-04-26 19:23:05.191 7fb2667de700 20 osd.2 pg_epoch: 7648 
>>> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489 
>>> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) [] 
>>> r=-1 lpr=7648 DELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}] 
>>> _delete_some [#25:00000000::::head#,#25:b08b92bd::::head#]
>>>    -36> 2019-04-26 19:23:05.191 7fb2667de700 20 osd.2 pg_epoch: 7648 
>>> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489 
>>> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) [] 
>>> r=-1 lpr=7648 DELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}] 
>>> _delete_some finished
>>>    -35> 2019-04-26 19:23:05.191 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) queue_transactions ch 0x7c27d94b40 
>>> 25.0_head
>>>    -34> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _txc_create osr 0x7c274f6c60 = 
>>> 0x7c2ef34c00 seq 3
>>>    -33> 2019-04-26 19:23:05.191 7fb2667de700 15 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove 25.0_head #25:00000000::::head# 
>>> onode 0x7c20aaf200 txc 0x7c2ef34c00
>>>    -32> 2019-04-26 19:23:05.191 7fb2667de700 15 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _do_truncate 25.0_head 
>>> #25:00000000::::head# 0x0
>>>    -31> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore.onode(0x7c20aaf200).flush flush done
>>>    -30> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _do_omap_clear remove range start: 
>>> 0x0000000000b0295a'-' end: 0x0000000000b0295a'~'
>>>    -29> 2019-04-26 19:23:05.191 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove 25.0_head #25:00000000::::head# 
>>> = 0
>>>    -28> 2019-04-26 19:23:05.191 7fb2667de700 15 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 25.0_head
>>>    -27> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore.OnodeSpace(0x7c27d94c98 in 0x7c1f7e10a0) map_any
>>>    -26> 2019-04-26 19:23:05.191 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list range 
>>> 0x7f7fffffffffffffe500000000 to 0x7f7fffffffffffffe5ffffffff and 
>>> 0x7f800000000000001900000000 to 0x7f8000000000000019ffffffff
>>>  start GHMIN
>>>    -25> 2019-04-26 19:23:05.192 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list pend 
>>> 0x7f7fffffffffffffe5ffffffff
>>>    -24> 2019-04-26 19:23:05.192 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key 
>>> 0x7f7fffffffffffffff100020'P!osdmap.7114!='0x0000000000000000ffffffffffffffff'o'
>>>  >= GHMAX
>>>    -23> 2019-04-26 19:23:05.192 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid 
>>> #25:00000000::::head# end GHMAX
>>>    -22> 2019-04-26 19:23:05.192 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid 
>>> #25:b08b92bd::::head# end GHMAX
>>>    -21> 2019-04-26 19:23:05.192 7fb2667de700 20 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key 
>>> 0x80800000000000000c1c00000021213dfffffffffffffffeffffffffffffffff'o' >= 
>>> GHMAX
>>>    -20> 2019-04-26 19:23:05.192 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection oid 
>>> #25:00000000::::head#
>>>    -19> 2019-04-26 19:23:05.192 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection oid 
>>> #25:b08b92bd::::head#
>>>    -18> 2019-04-26 19:23:05.192 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 
>>> #25:b08b92bd::::head# exists in db, not present in ram
>>>    -17> 2019-04-26 19:23:05.192 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 25.0_head is 
>>> non-empty
>>>    -16> 2019-04-26 19:23:05.192 7fb2667de700 10 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 25.0_head = -39
>>>    -15> 2019-04-26 19:23:05.192 7fb2667de700 -1 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _txc_add_transaction error (39) 
>>> Directory not empty not handled on operation 21 (op 1, counting from 0)
>>>    -14> 2019-04-26 19:23:05.192 7fb2667de700  0 
>>> bluestore(/var/lib/ceph/osd/ceph-2) _dump_transaction transaction dump:
>>> {
>>>     "ops": [
>>>         {
>>>             "op_num": 0,
>>>             "op_name": "remove",
>>>             "collection": "25.0_head",
>>>             "oid": "#25:00000000::::head#"
>>>         },
>>>         {
>>>             "op_num": 1,
>>>             "op_name": "rmcoll",
>>>             "collection": "25.0_head"
>>>         }
>>>
>>>    -13> 2019-04-26 19:23:05.199 7fb2667de700 -1 
>>> /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.0/rpm/el7/BUILD/ceph-14.2.0/src/os/bluestore/BlueStore.cc:
>>>  In function 'void 
>>> BlueStore::_txc_add_transaction(BlueStore::TransContext*, 
>>> ObjectStore::Transaction*)' thread 7fb2667de700 time 2019-04-26 
>>> 19:23:05.193826
>>> /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.0/rpm/el7/BUILD/ceph-14.2.0/src/os/bluestore/BlueStore.cc:
>>>  11069: abort()
>>>
>>>  ceph version 14.2.0 (3a54b2b6d167d4a2a19e003a705696d4fe619afc) nautilus 
>>> (stable)
>>>  1: (ceph::__ceph_abort(char const*, int, char const*, std::string 
>>> const&)+0xd8) [0x7c1454ee40]
>>>  2: (BlueStore::_txc_add_transaction(BlueStore::TransContext*, 
>>> ObjectStore::Transaction*)+0x2a85) [0x7c14b2d5f5]
>>>  3: 
>>> (BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>>>  std::vector<ObjectStore::Transaction, 
>>> std::allocator<ObjectStore::Transaction> >&, 
>>> boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x526) 
>>> [0x7c14b2e366]
>>>  4: 
>>> (ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>>>  ObjectStore::Transaction&&, boost::intrusive_ptr<TrackedOp>, 
>>> ThreadPool::TPHandle*)+0x7f) [0x7c1470a81f]
>>>  5: (PG::_delete_some(ObjectStore::Transaction*)+0x83d) [0x7c1476d70d]
>>>  6: (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38) 
>>> [0x7c1476e528]
>>>  7: (boost::statechart::simple_state<PG::RecoveryState::Deleting, 
>>> PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, 
>>> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, 
>>> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, 
>>> mpl_::na, mpl_::na, mpl_::na>, 
>>> (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
>>>  const&, void const*)+0x16a) [0x7c147acc8a]
>>>  8: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, 
>>> PG::RecoveryState::Initial, std::allocator<void>, 
>>> boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base
>>>  const&)+0x5a) [0x7c1478a91a]
>>>  9: (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, 
>>> PG::RecoveryCtx*)+0x119) [0x7c14779c99]
>>>  10: (OSD::dequeue_peering_evt(OSDShard*, PG*, 
>>> std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x1b4) 
>>> [0x7c146b4494]
>>>  11: (OSD::dequeue_delete(OSDShard*, PG*, unsigned int, 
>>> ThreadPool::TPHandle&)+0x234) [0x7c146b48d4]
>>>  12: (OSD::ShardedOpWQ::_process(unsigned int, 
>>> ceph::heartbeat_handle_d*)+0x9f4) [0x7c146a8c14]
>>>  13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433) 
>>> [0x7c14ca0f43]
>>>  14: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x7c14ca3fe0]
>>>  15: (()+0x7dd5) [0x7fb284e3bdd5]
>>>  16: (clone()+0x6d) [0x7fb283d01ead]
>>>
>>>    -12> 2019-04-26 19:23:05.199 7fb280b24700  1 -- 
>>> [v2:192.168.1.152:6800/1887364,v1:192.168.1.152:6801/1887364] <== 
>>> client.244681 v1:192.168.1.50:0/337348516 6 ==== 
>>> osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded) 
>>> ondisk+retry+write+known_if_redirected e7648) v8 ==== 229+0+4194304 
>>> (unknown 2519985655 0 813231621) 0x7c2f3ee840 con 0x7c2f2fac00
>>>    -11> 2019-04-26 19:23:05.199 7fb280b24700 15 osd.2 7648 enqueue_op 
>>> 0x7c300f8420 prio 63 cost 4194304 latency 0.035786 epoch 7648 
>>> osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded) 
>>> ondisk+retry+write+known_if_redirected e7648) v8
>>>    -10> 2019-04-26 19:23:05.199 7fb280b24700 20 osd.2 op_wq(4) _enqueue 
>>> OpQueueItem(12.5es0 PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0 
>>> 12.43dfa65e (undecoded) ondisk+retry+write+known_if_redirected e7648) v8) 
>>> prio 63 cost 4194304 e7648)
>>>     -9> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process 
>>> 12.5es0 to_process <> waiting <> waiting_peering {}
>>>     -8> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process 
>>> OpQueueItem(12.5es0 PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0 
>>> 12.43dfa65e (undecoded) ondisk+retry+write+known_if_redirected e7648) v8) 
>>> prio 63 cost 4194304 e7648) queued
>>>     -7> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process 
>>> 12.5es0 to_process <OpQueueItem(12.5es0 
>>> PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded) 
>>> ondisk+retry+write+known_if_redirected e7648) v8) prio 63 cost 4194304 
>>> e7648)> waiting <> waiting_peering {}
>>>     -6> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process 
>>> OpQueueItem(12.5es0 PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0 
>>> 12.43dfa65e (undecoded) ondisk+retry+write+known_if_redirected e7648) v8) 
>>> prio 63 cost 4194304 e7648) pg 0x7c271c6000
>>>     -5> 2019-04-26 19:23:05.200 7fb2647da700 10 osd.2 7648 dequeue_op 
>>> 0x7c300f8420 prio 63 cost 4194304 latency 0.036428 
>>> osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded) 
>>> ondisk+retry+write+known_if_redirected e7648) v8 pg pg[12.5es0( v 
>>> 7592'250137 (5872'250020,7592'250137] local-lis/les=7488/7489 n=22756 
>>> ec=5088/733 lis/c 7488/7488 les/c/f 7489/7489/5043 7648/7648/7648) 
>>> [2,2147483647,0,7,4]p2(0) r=0 lpr=7648 pi=[7488,7648)/1 crt=7592'250135 
>>> lcod 0'0 mlcod 0'0 peering mbc={}]
>>>     -4> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 7648 share_map 
>>> client.244681 v1:192.168.1.50:0/337348516 7648
>>>     -3> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 7648 should_share_map 
>>> client.244681 v1:192.168.1.50:0/337348516 7648
>>>     -2> 2019-04-26 19:23:05.200 7fb2647da700 10 osd.2 7648 dequeue_op 
>>> 0x7c300f8420 finish
>>>     -1> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process 
>>> empty q, waiting
>>>      0> 2019-04-26 19:23:05.206 7fb2667de700 -1 *** Caught signal (Aborted) **
>>>  in thread 7fb2667de700 thread_name:tp_osd_tp
>>>
>>>
>>> On Fri, Apr 26, 2019 at 3:57 PM Gregory Farnum <[email protected]> wrote:
>>>>
>>>> You'll probably want to generate a log with "debug osd = 20" and
>>>> "debug bluestore = 20", then share that or upload it with
>>>> ceph-post-file, to get more useful info about which PGs are breaking
>>>> (is it actually the ones that were supposed to delete?).
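>>>> For example (just a sketch, assuming the stock config layout), dropping these
>>>> into ceph.conf on the OSD host and restarting the OSD would do it:
>>>>
>>>> [osd]
>>>>     debug osd = 20
>>>>     debug bluestore = 20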
>>>>
>>>> If there's a particular set of PGs you need to rescue, you can also
>>>> look at using the ceph-objectstore-tool to export them off the busted
>>>> OSD stores and import them into OSDs that still work.
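>>>> Roughly (with the OSDs in question stopped; the IDs, pgid, and filename below
>>>> are placeholders):
>>>>
>>>> $ ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-<src> --op export --pgid <pgid> --file pg.export
>>>> $ ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-<dst> --op import --file pg.export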
>>>>
>>>>
>>>> On Fri, Apr 26, 2019 at 12:01 PM Elise Burke <[email protected]> wrote:
>>>> >
>>>> > Hi,
>>>> >
>>>> > I upgraded to Nautilus a week or two ago and things had been mostly 
>>>> > fine. I was interested in trying the device health stats feature and 
>>>> > enabled it. In doing so it created a pool, device_health_metrics, which 
>>>> > contained zero bytes.
>>>> >
>>>> > Unfortunately this pool developed a PG that could not be repaired with 
>>>> > `ceph pg repair`. That's okay, I thought, this pool is empty (zero 
>>>> > bytes), so I'll just remove it and discard the PG entirely.
>>>> >
>>>> > So I did: `ceph osd pool rm device_health_metrics device_health_metrics 
>>>> > --yes-i-really-really-mean-it`
>>>> >
>>>> > Within a few seconds three OSDs had gone missing (this pool was size=3), 
>>>> > and they now crashloop at startup.
>>>> >
>>>> > Any assistance in getting these OSDs up (such as by discarding the 
>>>> > errant PG) would be appreciated. I'm most concerned about the other 
>>>> > pools in the system, as losing three OSDs at once has not been ideal.
>>>> >
>>>> > This is made more difficult as these are in the Bluestore configuration 
>>>> > and were set up with ceph-deploy to bare metal (using LVM mode).
>>>> >
>>>> > Here's the traceback as noted in journalctl:
>>>> >
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -5381> 2019-04-26 
>>>> > 11:01:08.902 7f8a00866d80 -1 Falling back to public interface
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -4241> 2019-04-26 
>>>> > 11:01:41.835 7f8a00866d80 -1 osd.2 7630 log_to_monitors {default=true}
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -3> 2019-04-26 11:01:43.203 
>>>> > 7f89dee53700 -1 bluestore(/var/lib/ceph/osd/ceph-2) _txc_add_transaction 
>>>> > error (39) Directory not empty not handled on operation 21 (op 1, 
>>>> > counting from 0)
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -1> 2019-04-26 11:01:43.209 
>>>> > 7f89dee53700 -1 
>>>> > /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 
>>>> > /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.0/rpm/el7/BUILD/ceph-14.2.0/src/os/bluest
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: ceph version 14.2.0 
>>>> > (3a54b2b6d167d4a2a19e003a705696d4fe619afc) nautilus (stable)
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 1: (ceph::__ceph_abort(char 
>>>> > const*, int, char const*, std::string const&)+0xd8) [0xfc63afe40]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 2: 
>>>> > (BlueStore::_txc_add_transaction(BlueStore::TransContext*, 
>>>> > ObjectStore::Transaction*)+0x2a85) [0xfc698e5f5]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 3: 
>>>> > (BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>>>> >  std::vector<ObjectStore::Transaction, 
>>>> > std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 4: 
>>>> > (ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>>>> >  ObjectStore::Transaction&&, boost::intrusive_ptr<TrackedOp>, 
>>>> > ThreadPool::TPHandle*)+0x7f) [0xfc656b81f
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 5: 
>>>> > (PG::_delete_some(ObjectStore::Transaction*)+0x83d) [0xfc65ce70d]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 6: 
>>>> > (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38) 
>>>> > [0xfc65cf528]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 7: 
>>>> > (boost::statechart::simple_state<PG::RecoveryState::Deleting, 
>>>> > PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, 
>>>> > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 8: 
>>>> > (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, 
>>>> > PG::RecoveryState::Initial, std::allocator<void>, 
>>>> > boost::statechart::null_exception_translator>::process_event(boost
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 9: 
>>>> > (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, 
>>>> > PG::RecoveryCtx*)+0x119) [0xfc65dac99]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 10: 
>>>> > (OSD::dequeue_peering_evt(OSDShard*, PG*, 
>>>> > std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x1b4) 
>>>> > [0xfc6515494]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 11: 
>>>> > (OSD::dequeue_delete(OSDShard*, PG*, unsigned int, 
>>>> > ThreadPool::TPHandle&)+0x234) [0xfc65158d4]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 12: 
>>>> > (OSD::ShardedOpWQ::_process(unsigned int, 
>>>> > ceph::heartbeat_handle_d*)+0x9f4) [0xfc6509c14]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 13: 
>>>> > (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433) 
>>>> > [0xfc6b01f43]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 14: 
>>>> > (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xfc6b04fe0]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 15: (()+0x7dd5) 
>>>> > [0x7f89fd4b0dd5]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 16: (clone()+0x6d) 
>>>> > [0x7f89fc376ead]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 0> 2019-04-26 11:01:43.217 
>>>> > 7f89dee53700 -1 *** Caught signal (Aborted) **
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: in thread 7f89dee53700 
>>>> > thread_name:tp_osd_tp
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: ceph version 14.2.0 
>>>> > (3a54b2b6d167d4a2a19e003a705696d4fe619afc) nautilus (stable)
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 1: (()+0xf5d0) 
>>>> > [0x7f89fd4b85d0]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 2: (gsignal()+0x37) 
>>>> > [0x7f89fc2af207]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 3: (abort()+0x148) 
>>>> > [0x7f89fc2b08f8]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 4: (ceph::__ceph_abort(char 
>>>> > const*, int, char const*, std::string const&)+0x19c) [0xfc63aff04]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 5: 
>>>> > (BlueStore::_txc_add_transaction(BlueStore::TransContext*, 
>>>> > ObjectStore::Transaction*)+0x2a85) [0xfc698e5f5]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 6: 
>>>> > (BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>>>> >  std::vector<ObjectStore::Transaction, 
>>>> > std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 7: 
>>>> > (ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>>>> >  ObjectStore::Transaction&&, boost::intrusive_ptr<TrackedOp>, 
>>>> > ThreadPool::TPHandle*)+0x7f) [0xfc656b81f
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 8: 
>>>> > (PG::_delete_some(ObjectStore::Transaction*)+0x83d) [0xfc65ce70d]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 9: 
>>>> > (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38) 
>>>> > [0xfc65cf528]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 10: 
>>>> > (boost::statechart::simple_state<PG::RecoveryState::Deleting, 
>>>> > PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, 
>>>> > mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::n
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 11: 
>>>> > (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, 
>>>> > PG::RecoveryState::Initial, std::allocator<void>, 
>>>> > boost::statechart::null_exception_translator>::process_event(boos
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 12: 
>>>> > (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, 
>>>> > PG::RecoveryCtx*)+0x119) [0xfc65dac99]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 13: 
>>>> > (OSD::dequeue_peering_evt(OSDShard*, PG*, 
>>>> > std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x1b4) 
>>>> > [0xfc6515494]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 14: 
>>>> > (OSD::dequeue_delete(OSDShard*, PG*, unsigned int, 
>>>> > ThreadPool::TPHandle&)+0x234) [0xfc65158d4]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 15: 
>>>> > (OSD::ShardedOpWQ::_process(unsigned int, 
>>>> > ceph::heartbeat_handle_d*)+0x9f4) [0xfc6509c14]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 16: 
>>>> > (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433) 
>>>> > [0xfc6b01f43]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 17: 
>>>> > (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xfc6b04fe0]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 18: (()+0x7dd5) 
>>>> > [0x7f89fd4b0dd5]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 19: (clone()+0x6d) 
>>>> > [0x7f89fc376ead]
>>>> > Apr 26 11:01:43 databox ceph-osd[1878533]: NOTE: a copy of the 
>>>> > executable, or `objdump -rdS <executable>` is needed to interpret this.
>>>> >
>>>> > Thanks!
>>>> >
>>>> > -Elise
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
