Using ceph-objectstore-tool --op info on PG 25.0 (which was indeed the one I
remember having the error on) gives the output below.
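(For reference, a sketch of the invocation, run with the OSD stopped; the
--data-path shown is the assumed default mount point for osd.2 and may need
adjusting for your deployment:)

    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-2 --pgid 25.0 --op info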

struct_v 10
{
    "pgid": "25.0",
    "last_update": "7592'106",
    "last_complete": "7592'106",
    "log_tail": "0'0",
    "last_user_version": 106,
    "last_backfill": "MIN",
    "last_backfill_bitwise": 1,
    "purged_snaps": [],
    "history": {
        "epoch_created": 5191,
        "epoch_pool_created": 5191,
        "last_epoch_started": 7489,
        "last_interval_started": 7488,
        "last_epoch_clean": 7489,
        "last_interval_clean": 7488,
        "last_epoch_split": 0,
        "last_epoch_marked_full": 0,
        "same_up_since": 7593,
        "same_interval_since": 7593,
        "same_primary_since": 7593,
        "last_scrub": "7592'106",
        "last_scrub_stamp": "2019-04-25 21:34:52.079721",
        "last_deep_scrub": "7485'70",
        "last_deep_scrub_stamp": "2019-04-22 10:15:40.532014",
        "last_clean_scrub_stamp": "2019-04-19 14:52:44.047548"
    },
    "stats": {
        "version": "7592'105",
        "reported_seq": "2621",
        "reported_epoch": "7592",
        "state": "active+clean+inconsistent",
        "last_fresh": "2019-04-25 20:02:55.620028",
        "last_change": "2019-04-24 19:52:45.072473",
        "last_active": "2019-04-25 20:02:55.620028",
        "last_peered": "2019-04-25 20:02:55.620028",
        "last_clean": "2019-04-25 20:02:55.620028",
        "last_became_active": "2019-04-22 17:55:37.578239",
        "last_became_peered": "2019-04-22 17:55:37.578239",
        "last_unstale": "2019-04-25 20:02:55.620028",
        "last_undegraded": "2019-04-25 20:02:55.620028",
        "last_fullsized": "2019-04-25 20:02:55.620028",
        "mapping_epoch": 7593,
        "log_start": "0'0",
        "ondisk_log_start": "0'0",
        "created": 5191,
        "last_epoch_clean": 7489,
        "parent": "0.0",
        "parent_split_bits": 0,
        "last_scrub": "7592'88",
        "last_scrub_stamp": "2019-04-24 19:52:45.072367",
        "last_deep_scrub": "7485'70",
        "last_deep_scrub_stamp": "2019-04-22 10:15:40.532014",
        "last_clean_scrub_stamp": "2019-04-19 14:52:44.047548",
        "log_size": 105,
        "ondisk_log_size": 105,
        "stats_invalid": false,
        "dirty_stats_invalid": false,
        "omap_stats_invalid": false,
        "hitset_stats_invalid": false,
        "hitset_bytes_stats_invalid": false,
        "pin_stats_invalid": false,
        "manifest_stats_invalid": false,
        "snaptrimq_len": 0,
        "stat_sum": {
            "num_bytes": 0,
            "num_objects": 9,
            "num_object_clones": 0,
            "num_object_copies": 27,
            "num_objects_missing_on_primary": 0,
            "num_objects_missing": 0,
            "num_objects_degraded": 0,
            "num_objects_misplaced": 0,
            "num_objects_unfound": 0,
            "num_objects_dirty": 9,
            "num_whiteouts": 0,
            "num_read": 87,
            "num_read_kb": 87,
            "num_write": 98,
            "num_write_kb": 98,
            "num_scrub_errors": 0,
            "num_shallow_scrub_errors": 0,
            "num_deep_scrub_errors": 0,
            "num_objects_recovered": 0,
            "num_bytes_recovered": 0,
            "num_keys_recovered": 0,
            "num_objects_omap": 9,
            "num_objects_hit_set_archive": 0,
            "num_bytes_hit_set_archive": 0,
            "num_flush": 0,
            "num_flush_kb": 0,
            "num_evict": 0,
            "num_evict_kb": 0,
            "num_promote": 0,
            "num_flush_mode_high": 0,
            "num_flush_mode_low": 0,
            "num_evict_mode_some": 0,
            "num_evict_mode_full": 0,
            "num_objects_pinned": 0,
            "num_legacy_snapsets": 0,
            "num_large_omap_objects": 0,
            "num_objects_manifest": 0,
            "num_omap_bytes": 0,
            "num_omap_keys": 0
        },
        "up": [],
        "acting": [],
        "blocked_by": [],
        "up_primary": -1,
        "acting_primary": -1,
        "purged_snaps": []
    },
    "empty": 0,
    "dne": 0,
    "incomplete": 1,
    "last_epoch_started": 7489,
    "hit_set_history": {
        "current_last_update": "0'0",
        "history": []
    }
}


On Fri, Apr 26, 2019 at 7:35 PM Elise Burke <elise.n...@gmail.com> wrote:

> Thanks for the suggestions. I've uploaded the surprisingly large (1.5G!)
> log file: ceph-post-file: 2d8d22f4-580b-4b57-a13a-f49dade34ba7
>
> Looks like these are the relevant lines:
>
>    -52> 2019-04-26 19:23:05.190 7fb2657dc700 20 osd.2 op_wq(2) _process
> empty q, waiting
>    -51> 2019-04-26 19:23:05.190 7fb2667de700 20 osd.2 op_wq(0) _process
> OpQueueItem(25.0 PGDelete(25.0 e7648) prio 5 cost 1048576 e7648) queued
>    -50> 2019-04-26 19:23:05.190 7fb2717f4700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _kv_finalize_thread sleep
>    -49> 2019-04-26 19:23:05.190 7fb2667de700 20 osd.2 op_wq(0) _process
> 25.0 to_process <OpQueueItem(25.0 PGDelete(25.0 e7648) prio 5 cost 1048576
> e7648)> waiting <> waiting_peering {}
>    -48> 2019-04-26 19:23:05.190 7fb2667de700 20 osd.2 op_wq(0) _process
> OpQueueItem(25.0 PGDelete(25.0 e7648) prio 5 cost 1048576 e7648) pg
> 0x7c20a00000
>    -47> 2019-04-26 19:23:05.190 7fb2667de700 10 osd.2 pg_epoch: 7648
> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489
> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) []
> r=-1 lpr=7648 D
> ELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}]
> do_peering_event: epoch_sent: 7648 epoch_requested: 7648 DeleteSome
>    -46> 2019-04-26 19:23:05.190 7fb2667de700 10 osd.2 pg_epoch: 7648
> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489
> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) []
> r=-1 lpr=7648 D
> ELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}]
> _delete_some
>    -45> 2019-04-26 19:23:05.190 7fb2667de700 15
> bluestore(/var/lib/ceph/osd/ceph-2) collection_list 25.0_head start GHMIN
> end GHMAX max 30
>    -44> 2019-04-26 19:23:05.190 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list range
> 0x7f7fffffffffffffe500000000 to 0x7f7fffffffffffffe5ffffffff and
> 0x7f800000000000001900000000 to 0x7f8000000000000019ffffffff
>  start GHMIN
>    -43> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list pend
> 0x7f7fffffffffffffe5ffffffff
>    -42> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key
> 0x7f7fffffffffffffff100020'P!osdmap.7114!='0x0000000000000000ffffffffffffffff'o'
> >= GHMAX
>    -41> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid
> #25:00000000::::head# end GHMAX
>    -40> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid
> #25:b08b92bd::::head# end GHMAX
>    -39> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key
> 0x80800000000000000c1c00000021213dfffffffffffffffeffffffffffffffff'o' >=
> GHMAX
>    -38> 2019-04-26 19:23:05.191 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) collection_list 25.0_head start GHMAX
> end GHMAX max 30 = 0, ls.size() = 2, next = GHMAX
>    -37> 2019-04-26 19:23:05.191 7fb2667de700 20 osd.2 pg_epoch: 7648
> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489
> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) []
> r=-1 lpr=7648 D
> ELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}]
> _delete_some [#25:00000000::::head#,#25:b08b92bd::::head#]
>    -36> 2019-04-26 19:23:05.191 7fb2667de700 20 osd.2 pg_epoch: 7648
> pg[25.0( v 7592'106 (0'0,7592'106] lb MIN (bitwise) local-lis/les=7488/7489
> n=9 ec=5191/5191 lis/c 7488/7488 les/c/f 7489/7489/0 7593/7593/7593) []
> r=-1 lpr=7648 D
> ELETING pi=[7488,7593)/1 crt=7592'106 lcod 0'0 unknown NOTIFY mbc={}]
> _delete_some finished
>    -35> 2019-04-26 19:23:05.191 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) queue_transactions ch 0x7c27d94b40
> 25.0_head
>    -34> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _txc_create osr 0x7c274f6c60 =
> 0x7c2ef34c00 seq 3
>    -33> 2019-04-26 19:23:05.191 7fb2667de700 15
> bluestore(/var/lib/ceph/osd/ceph-2) _remove 25.0_head #25:00000000::::head#
> onode 0x7c20aaf200 txc 0x7c2ef34c00
>    -32> 2019-04-26 19:23:05.191 7fb2667de700 15
> bluestore(/var/lib/ceph/osd/ceph-2) _do_truncate 25.0_head
> #25:00000000::::head# 0x0
>    -31> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore.onode(0x7c20aaf200).flush flush done
>    -30> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _do_omap_clear remove range start:
> 0x0000000000b0295a'-' end: 0x0000000000b0295a'~'
>    -29> 2019-04-26 19:23:05.191 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) _remove 25.0_head #25:00000000::::head#
> = 0
>    -28> 2019-04-26 19:23:05.191 7fb2667de700 15
> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 25.0_head
>    -27> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore.OnodeSpace(0x7c27d94c98 in 0x7c1f7e10a0) map_any
>    -26> 2019-04-26 19:23:05.191 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list range
> 0x7f7fffffffffffffe500000000 to 0x7f7fffffffffffffe5ffffffff and
> 0x7f800000000000001900000000 to 0x7f8000000000000019ffffffff
>  start GHMIN
>    -25> 2019-04-26 19:23:05.192 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list pend
> 0x7f7fffffffffffffe5ffffffff
>    -24> 2019-04-26 19:23:05.192 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key
> 0x7f7fffffffffffffff100020'P!osdmap.7114!='0x0000000000000000ffffffffffffffff'o'
> >= GHMAX
>    -23> 2019-04-26 19:23:05.192 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid
> #25:00000000::::head# end GHMAX
>    -22> 2019-04-26 19:23:05.192 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list oid
> #25:b08b92bd::::head# end GHMAX
>    -21> 2019-04-26 19:23:05.192 7fb2667de700 20
> bluestore(/var/lib/ceph/osd/ceph-2) _collection_list key
> 0x80800000000000000c1c00000021213dfffffffffffffffeffffffffffffffff'o' >=
> GHMAX
>    -20> 2019-04-26 19:23:05.192 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection oid
> #25:00000000::::head#
>    -19> 2019-04-26 19:23:05.192 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection oid
> #25:b08b92bd::::head#
>    -18> 2019-04-26 19:23:05.192 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection
> #25:b08b92bd::::head# exists in db, not present in ram
>    -17> 2019-04-26 19:23:05.192 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 25.0_head is
> non-empty
>    -16> 2019-04-26 19:23:05.192 7fb2667de700 10
> bluestore(/var/lib/ceph/osd/ceph-2) _remove_collection 25.0_head = -39
>    -15> 2019-04-26 19:23:05.192 7fb2667de700 -1
> bluestore(/var/lib/ceph/osd/ceph-2) _txc_add_transaction error (39)
> Directory not empty not handled on operation 21 (op 1, counting from 0)
>    -14> 2019-04-26 19:23:05.192 7fb2667de700  0
> bluestore(/var/lib/ceph/osd/ceph-2) _dump_transaction transaction dump:
> {
>     "ops": [
>         {
>             "op_num": 0,
>             "op_name": "remove",
>             "collection": "25.0_head",
>             "oid": "#25:00000000::::head#"
>         },
>         {
>             "op_num": 1,
>             "op_name": "rmcoll",
>             "collection": "25.0_head"
>         }
>
>    -13> 2019-04-26 19:23:05.199 7fb2667de700 -1
> /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.0/rpm/el7/BUILD/ceph-14.2.0/src/os/bluestore/BlueStore.cc:
> In function 'void BlueStore::_txc_add_transaction(BlueStore::TransContext*,
> ObjectStore::Transaction*)' thread 7fb2667de700 time 2019-04-26
> 19:23:05.193826
> /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.0/rpm/el7/BUILD/ceph-14.2.0/src/os/bluestore/BlueStore.cc:
> 11069: abort()
>
>  ceph version 14.2.0 (3a54b2b6d167d4a2a19e003a705696d4fe619afc) nautilus
> (stable)
>  1: (ceph::__ceph_abort(char const*, int, char const*, std::string
> const&)+0xd8) [0x7c1454ee40]
>  2: (BlueStore::_txc_add_transaction(BlueStore::TransContext*,
> ObjectStore::Transaction*)+0x2a85) [0x7c14b2d5f5]
>  3:
> (BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
> std::vector<ObjectStore::Transaction,
> std::allocator<ObjectStore::Transaction> >&,
> boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x526)
> [0x7c14b2e366]
>  4:
> (ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
> ObjectStore::Transaction&&, boost::intrusive_ptr<TrackedOp>,
> ThreadPool::TPHandle*)+0x7f) [0x7c1470a81f]
>  5: (PG::_delete_some(ObjectStore::Transaction*)+0x83d) [0x7c1476d70d]
>  6: (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38)
> [0x7c1476e528]
>  7: (boost::statechart::simple_state<PG::RecoveryState::Deleting,
> PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na,
> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
> mpl_::na, mpl_::na, mpl_::na>,
> (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
> const&, void const*)+0x16a) [0x7c147acc8a]
>  8: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine,
> PG::RecoveryState::Initial, std::allocator<void>,
> boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base
> const&)+0x5a) [0x7c1478a91a]
>  9: (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>,
> PG::RecoveryCtx*)+0x119) [0x7c14779c99]
>  10: (OSD::dequeue_peering_evt(OSDShard*, PG*,
> std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x1b4)
> [0x7c146b4494]
>  11: (OSD::dequeue_delete(OSDShard*, PG*, unsigned int,
> ThreadPool::TPHandle&)+0x234) [0x7c146b48d4]
>  12: (OSD::ShardedOpWQ::_process(unsigned int,
> ceph::heartbeat_handle_d*)+0x9f4) [0x7c146a8c14]
>  13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433)
> [0x7c14ca0f43]
>  14: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x7c14ca3fe0]
>  15: (()+0x7dd5) [0x7fb284e3bdd5]
>  16: (clone()+0x6d) [0x7fb283d01ead]
>
>    -12> 2019-04-26 19:23:05.199 7fb280b24700  1 -- [v2:
> 192.168.1.152:6800/1887364,v1:192.168.1.152:6801/1887364] <==
> client.244681 v1:192.168.1.50:0/337348516 6 ====
> osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded)
> ondisk+retry+write+known_if_redirected e7648) v8 ==== 229+0+4194304
> (unknown 2519985655 0 813231621) 0x7c2f3ee840 con 0x7c2f2fac00
>    -11> 2019-04-26 19:23:05.199 7fb280b24700 15 osd.2 7648 enqueue_op
> 0x7c300f8420 prio 63 cost 4194304 latency 0.035786 epoch 7648
> osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded)
> ondisk+retry+write+known_if_redirected e7648) v8
>    -10> 2019-04-26 19:23:05.199 7fb280b24700 20 osd.2 op_wq(4) _enqueue
> OpQueueItem(12.5es0 PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0
> 12.43dfa65e (undecoded) ondisk+retry+write+known_if_redirected e7648) v8)
> prio 63 cost 4194304 e7648)
>     -9> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process
> 12.5es0 to_process <> waiting <> waiting_peering {}
>     -8> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process
> OpQueueItem(12.5es0 PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0
> 12.43dfa65e (undecoded) ondisk+retry+write+known_if_redirected e7648) v8)
> prio 63 cost 4194304 e7648) queued
>     -7> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process
> 12.5es0 to_process <OpQueueItem(12.5es0
> PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded)
> ondisk+retry+write+known_if_redirected e7648) v8) prio 63 cost 4194304
> e7648)> waiting <> waiting_peering {}
>     -6> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process
> OpQueueItem(12.5es0 PGOpItem(op=osd_op(client.244681.0:2571786 12.5es0
> 12.43dfa65e (undecoded) ondisk+retry+write+known_if_redirected e7648) v8)
> prio 63 cost 4194304 e7648) pg 0x7c271c6000
>     -5> 2019-04-26 19:23:05.200 7fb2647da700 10 osd.2 7648 dequeue_op
> 0x7c300f8420 prio 63 cost 4194304 latency 0.036428
> osd_op(client.244681.0:2571786 12.5es0 12.43dfa65e (undecoded)
> ondisk+retry+write+known_if_redirected e7648) v8 pg pg[12.5es0( v
> 7592'250137 (5872'250020,7592'250137] local-lis/les=7488/7489 n=22756
> ec=5088/733 lis/c 7488/7488 les/c/f 7489/7489/5043 7648/7648/7648)
> [2,2147483647,0,7,4]p2(0) r=0 lpr=7648 pi=[7488,7648)/1 crt=7592'250135
> lcod 0'0 mlcod 0'0 peering mbc={}]
>     -4> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 7648 share_map
> client.244681 v1:192.168.1.50:0/337348516 7648
>     -3> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 7648
> should_share_map client.244681 v1:192.168.1.50:0/337348516 7648
>     -2> 2019-04-26 19:23:05.200 7fb2647da700 10 osd.2 7648 dequeue_op
> 0x7c300f8420 finish
>     -1> 2019-04-26 19:23:05.200 7fb2647da700 20 osd.2 op_wq(4) _process
> empty q, waiting
>      0> 2019-04-26 19:23:05.206 7fb2667de700 -1 *** Caught signal
> (Aborted) **
>  in thread 7fb2667de700 thread_name:tp_osd_tp
>
>
> On Fri, Apr 26, 2019 at 3:57 PM Gregory Farnum <gfar...@redhat.com> wrote:
>
>> You'll probably want to generate a log with "debug osd = 20" and
>> "debug bluestore = 20", then share that or upload it with
>> ceph-post-file, to get more useful info about which PGs are breaking
>> (is it actually the ones that were supposed to delete?).
>>
>> If there's a particular set of PGs you need to rescue, you can also
>> look at using the ceph-objectstore-tool to export them off the busted
>> OSD stores and import them into OSDs that still work.
>>
>>
>> On Fri, Apr 26, 2019 at 12:01 PM Elise Burke <elise.n...@gmail.com>
>> wrote:
>> >
>> > Hi,
>> >
>> > I upgraded to Nautilus a week or two ago and things had been mostly
>> fine. I was interested in trying the device health stats feature and
>> enabled it. In doing so it created a pool, device_health_metrics, which
>> contained zero bytes.
>> >
>> > Unfortunately this pool developed a PG that could not be repaired with
>> `ceph pg repair`. That's okay, I thought, this pool is empty (zero bytes),
>> so I'll just remove it and discard the PG entirely.
>> >
>> > So I did: `ceph osd pool rm device_health_metrics device_health_metrics
>> --yes-i-really-really-mean-it`
>> >
>> > Within a few seconds three OSDs had gone missing (this pool was size=3)
>> and now crashloop at startup.
>> >
>> > Any assistance in getting these OSDs up (such as by discarding the
>> errant PG) would be appreciated. I'm most concerned about the other pools
>> in the system, as losing three OSDs at once has not been ideal.
>> >
>> > This is made more difficult as these are in the Bluestore configuration
>> and were set up with ceph-deploy to bare metal (using LVM mode).
>> >
>> > Here's the traceback as noted in journalctl:
>> >
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -5381> 2019-04-26
>> 11:01:08.902 7f8a00866d80 -1 Falling back to public interface
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -4241> 2019-04-26
>> 11:01:41.835 7f8a00866d80 -1 osd.2 7630 log_to_monitors {default=true}
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -3> 2019-04-26 11:01:43.203
>> 7f89dee53700 -1 bluestore(/var/lib/ceph/osd/ceph-2) _txc_add_transaction
>> error (39) Directory not empty not handled on operation 21 (op 1, counting
>> from 0)
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: -1> 2019-04-26 11:01:43.209
>> 7f89dee53700 -1
>> /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14
>> > Apr 26 11:01:43 databox ceph-osd[1878533]:
>> /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.0/rpm/el7/BUILD/ceph-14.2.0/src/os/bluest
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: ceph version 14.2.0
>> (3a54b2b6d167d4a2a19e003a705696d4fe619afc) nautilus (stable)
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 1: (ceph::__ceph_abort(char
>> const*, int, char const*, std::string const&)+0xd8) [0xfc63afe40]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 2:
>> (BlueStore::_txc_add_transaction(BlueStore::TransContext*,
>> ObjectStore::Transaction*)+0x2a85) [0xfc698e5f5]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 3:
>> (BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>> std::vector<ObjectStore::Transaction,
>> std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 4:
>> (ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>> ObjectStore::Transaction&&, boost::intrusive_ptr<TrackedOp>,
>> ThreadPool::TPHandle*)+0x7f) [0xfc656b81f
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 5:
>> (PG::_delete_some(ObjectStore::Transaction*)+0x83d) [0xfc65ce70d]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 6:
>> (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38)
>> [0xfc65cf528]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 7:
>> (boost::statechart::simple_state<PG::RecoveryState::Deleting,
>> PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na,
>> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 8:
>> (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine,
>> PG::RecoveryState::Initial, std::allocator<void>,
>> boost::statechart::null_exception_translator>::process_event(boost
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 9:
>> (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>,
>> PG::RecoveryCtx*)+0x119) [0xfc65dac99]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 10:
>> (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>,
>> ThreadPool::TPHandle&)+0x1b4) [0xfc6515494]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 11:
>> (OSD::dequeue_delete(OSDShard*, PG*, unsigned int,
>> ThreadPool::TPHandle&)+0x234) [0xfc65158d4]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 12:
>> (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x9f4)
>> [0xfc6509c14]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 13:
>> (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433)
>> [0xfc6b01f43]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 14:
>> (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xfc6b04fe0]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 15: (()+0x7dd5)
>> [0x7f89fd4b0dd5]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 16: (clone()+0x6d)
>> [0x7f89fc376ead]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 0> 2019-04-26 11:01:43.217
>> 7f89dee53700 -1 *** Caught signal (Aborted) **
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: in thread 7f89dee53700
>> thread_name:tp_osd_tp
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: ceph version 14.2.0
>> (3a54b2b6d167d4a2a19e003a705696d4fe619afc) nautilus (stable)
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 1: (()+0xf5d0)
>> [0x7f89fd4b85d0]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 2: (gsignal()+0x37)
>> [0x7f89fc2af207]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 3: (abort()+0x148)
>> [0x7f89fc2b08f8]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 4: (ceph::__ceph_abort(char
>> const*, int, char const*, std::string const&)+0x19c) [0xfc63aff04]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 5:
>> (BlueStore::_txc_add_transaction(BlueStore::TransContext*,
>> ObjectStore::Transaction*)+0x2a85) [0xfc698e5f5]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 6:
>> (BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>> std::vector<ObjectStore::Transaction,
>> std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 7:
>> (ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
>> ObjectStore::Transaction&&, boost::intrusive_ptr<TrackedOp>,
>> ThreadPool::TPHandle*)+0x7f) [0xfc656b81f
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 8:
>> (PG::_delete_some(ObjectStore::Transaction*)+0x83d) [0xfc65ce70d]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 9:
>> (PG::RecoveryState::Deleting::react(PG::DeleteSome const&)+0x38)
>> [0xfc65cf528]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 10:
>> (boost::statechart::simple_state<PG::RecoveryState::Deleting,
>> PG::RecoveryState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na,
>> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::n
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 11:
>> (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine,
>> PG::RecoveryState::Initial, std::allocator<void>,
>> boost::statechart::null_exception_translator>::process_event(boos
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 12:
>> (PG::do_peering_event(std::shared_ptr<PGPeeringEvent>,
>> PG::RecoveryCtx*)+0x119) [0xfc65dac99]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 13:
>> (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>,
>> ThreadPool::TPHandle&)+0x1b4) [0xfc6515494]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 14:
>> (OSD::dequeue_delete(OSDShard*, PG*, unsigned int,
>> ThreadPool::TPHandle&)+0x234) [0xfc65158d4]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 15:
>> (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x9f4)
>> [0xfc6509c14]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 16:
>> (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x433)
>> [0xfc6b01f43]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 17:
>> (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xfc6b04fe0]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 18: (()+0x7dd5)
>> [0x7f89fd4b0dd5]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: 19: (clone()+0x6d)
>> [0x7f89fc376ead]
>> > Apr 26 11:01:43 databox ceph-osd[1878533]: NOTE: a copy of the
>> executable, or `objdump -rdS <executable>` is needed to interpret this.
>> >
>> > Thanks!
>> >
>> > -Elise
>> > _______________________________________________
>> > ceph-users mailing list
>> > ceph-users@lists.ceph.com
>> > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>
>
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com