Farther back in the logfile, before it reports the missing objects,
I get these messages:
osd.33 pg_epoch: 256084 pg[8.484(unlocked)] enter Initial
osd.33 pg_epoch: 256084 pg[8.484(unlocked)] enter NotTrimming
It looks like it is processing placement group "8.484" on OSD 33. But
when I run "ceph pg 8.484 query" it looks like that placement group is
on two other OSDs [13,44] in a healthy state. Is this just a lingering
copy of the PG that needs to be manually removed?
# ceph pg 8.484 query
{
"state": "active+clean",
"snap_trimq": "[]",
"epoch": 256850,
"up": [
13,
44
],
"acting": [
13,
44
],
"actingbackfill": [
"13",
"44"
],
"info": {
"pgid": "8.484",
"last_update": "256816'16486",
"last_complete": "256816'16486",
"log_tail": "254158'13420",
"last_user_version": 16486,
"last_backfill": "MAX",
"purged_snaps": "[1~3a,44~1,46~1,4d~a,58~4,73~30,ba~11]",
"history": {
"epoch_created": 250794,
"last_epoch_started": 256849,
"last_epoch_clean": 256849,
"last_epoch_split": 0,
"same_up_since": 256848,
"same_interval_since": 256848,
"same_primary_since": 256848,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071"
},
"stats": {
"version": "256816'16486",
"reported_seq": "67588",
"reported_epoch": "256850",
"state": "active+clean",
"last_fresh": "2015-10-11 17:09:46.972382",
"last_change": "2015-10-11 16:30:32.646205",
"last_active": "2015-10-11 17:09:46.972382",
"last_peered": "2015-10-11 17:09:46.972382",
"last_clean": "2015-10-11 17:09:46.972382",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2015-10-11 17:09:46.972382",
"last_undegraded": "2015-10-11 17:09:46.972382",
"last_fullsized": "2015-10-11 17:09:46.972382",
"mapping_epoch": 256846,
"log_start": "254158'13420",
"ondisk_log_start": "254158'13420",
"created": 250794,
"last_epoch_clean": 256849,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071",
"log_size": 3066,
"ondisk_log_size": 3066,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 7075272704,
"num_objects": 1709,
"num_object_clones": 582,
"num_object_copies": 3418,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1709,
"num_whiteouts": 0,
"num_read": 92000,
"num_read_kb": 17658096,
"num_write": 23198,
"num_write_kb": 8868482,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 3928,
"num_bytes_recovered": 16447209472,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
13,
44
],
"acting": [
13,
44
],
"blocked_by": [],
"up_primary": 13,
"acting_primary": 13
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 256849,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0"
},
"history": []
}
},
"peer_info": [
{
"peer": "44",
"pgid": "8.484",
"last_update": "256816'16486",
"last_complete": "256816'16486",
"log_tail": "254158'13420",
"last_user_version": 16486,
"last_backfill": "MAX",
"purged_snaps": "[1~3a,44~1,46~1,4d~a,58~4,73~30,ba~11]",
"history": {
"epoch_created": 250794,
"last_epoch_started": 256849,
"last_epoch_clean": 256849,
"last_epoch_split": 0,
"same_up_since": 256848,
"same_interval_since": 256848,
"same_primary_since": 256848,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071"
},
"stats": {
"version": "256816'16486",
"reported_seq": "67580",
"reported_epoch": "256848",
"state": "active+undersized+degraded",
"last_fresh": "2015-10-11 16:30:10.656475",
"last_change": "2015-10-11 16:30:10.655808",
"last_active": "2015-10-11 16:30:10.656475",
"last_peered": "2015-10-11 16:30:10.656475",
"last_clean": "2015-10-11 11:21:32.645164",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2015-10-11 16:30:10.656475",
"last_undegraded": "2015-10-11 16:30:06.790836",
"last_fullsized": "2015-10-11 16:30:06.790836",
"mapping_epoch": 256846,
"log_start": "254158'13420",
"ondisk_log_start": "254158'13420",
"created": 250794,
"last_epoch_clean": 256847,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071",
"log_size": 3066,
"ondisk_log_size": 3066,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 7075272704,
"num_objects": 1709,
"num_object_clones": 582,
"num_object_copies": 3418,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 1709,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1709,
"num_whiteouts": 0,
"num_read": 92000,
"num_read_kb": 17658096,
"num_write": 23198,
"num_write_kb": 8868482,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 3928,
"num_bytes_recovered": 16447209472,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
13,
44
],
"acting": [
13,
44
],
"blocked_by": [],
"up_primary": 13,
"acting_primary": 13
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 256849,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0"
},
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2015-10-11 16:30:32.551400",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "0\/\/0\/\/-1",
"backfill_info": {
"begin": "0\/\/0\/\/-1",
"end": "0\/\/0\/\/-1",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2015-10-11 16:30:28.201561"
}
],
"agent_state": {}
}
On 2015-10-11 12:19 pm, Chris Taylor wrote:
> I have an OSD that went down while the cluster was recovering from another
> OSD being reweighted. The cluster appears to be stuck in recovery since the
> number of degraded and misplaced objects are not decreasing.
>
> It is a three node cluster in production and the pool size is 2. Ceph version
> 94.3.
>
> Here is a snip of the failing OSD log. The full log file was uploaded with
> ceph-post-file "ceph-post-file: dfcf6dff-11cb-49b0-81b8-60bf8ff898eb".
>
> 2015-10-11 10:45:06.182615 7f9270567900 20 read_log 254342'14799
> (251922'9664) delete d1aa1484/rb.0.ac3386.238e1f29.0000000bb0ad/44//8 by
> unknown.0.0:0 2015-10-11 00:40:34.981049
> 2015-10-11 10:45:06.182629 7f9270567900 20 read_log 254342'14800
> (251922'9665) modify d1aa1484/rb.0.ac3386.238e1f29.0000000bb0ad/head//8 by
> unknown.0.0:0 2015-10-11 00:40:34.981049
> 2015-10-11 10:45:06.182661 7f9270567900 20 read_log 6 divergent_priors
> 2015-10-11 10:45:06.184076 7f9270567900 10 read_log checking for missing
> items over interval (0'0,254342'14800]
> 2015-10-11 10:45:11.861683 7f9270567900 15 read_log missing
> 251925'9669,e9ea1484/rb.0.ac3386.238e1f29.000000187097/head//8
> 2015-10-11 10:45:11.861767 7f9270567900 15 read_log missing
> 251925'9668,e9ea1484/rb.0.ac3386.238e1f29.000000187097/44//8
> 2015-10-11 10:45:11.861823 7f9270567900 15 read_log missing
> 251925'9667,c4ea1484/rb.0.ac3386.238e1f29.00000022717d/head//8
> 2015-10-11 10:45:11.861877 7f9270567900 15 read_log missing
> 251925'9666,c4ea1484/rb.0.ac3386.238e1f29.00000022717d/68//8
> 2015-10-11 10:45:11.924425 7f9270567900 -1 osd/PGLog.cc: In function 'static
> void PGLog::read_log(ObjectStore*, coll_t, coll_t, ghobject_t, const
> pg_info_t&, std::map<eversion_t, hobject_t>&, PGLog::IndexedLog&,
> pg_missing_t&, std::ostringstream&, std::set<std::basic_string<char> >*)'
> thread 7f9270567900 time 2015-10-11 10:45:11.861976
> osd/PGLog.cc: 962: FAILED assert(oi.version == i->first)
>
> cluster d960d672-e035-413d-ba39-8341f4131760
> health HEALTH_WARN
> 54 pgs backfill
> 373 pgs degraded
> 1 pgs recovering
> 336 pgs recovery_wait
> 373 pgs stuck degraded
> 391 pgs stuck unclean
> 43 pgs stuck undersized
> 43 pgs undersized
> recovery 88034/14758314 objects degraded (0.597%)
> recovery 280423/14758314 objects misplaced (1.900%)
> recovery 28/7330234 unfound (0.000%)
> monmap e1: 3 mons at
> {ceph-mon1=10.20.0.11:6789/0,ceph-mon2=10.20.0.12:6789/0,ceph-mon3=10.20.0.13:6789/0}
> election epoch 6010, quorum 0,1,2 ceph-mon1,ceph-mon2,ceph-mon3
> osdmap e256816: 46 osds: 45 up, 45 in; 65 remapped pgs
> pgmap v19715504: 5184 pgs, 4 pools, 28323 GB data, 7158 kobjects
> 57018 GB used, 23027 GB / 80045 GB avail
> 88034/14758314 objects degraded (0.597%)
> 280423/14758314 objects misplaced (1.900%)
> 28/7330234 unfound (0.000%)
> 4790 active+clean
> 326 active+recovery_wait+degraded
> 36 active+undersized+degraded+remapped+wait_backfill
> 18 active+remapped+wait_backfill
> 6 active+recovery_wait+undersized+degraded+remapped
> 4 active+recovery_wait+degraded+remapped
> 3 active+clean+scrubbing+deep
> 1 active+recovering+undersized+degraded+remapped
> client io 11627 kB/s rd, 36433 B/s wr, 10 op/s
>
> Any help would be greatly appreciated!
>
> Thanks,
>
> Chris
>
> _______________________________________________
> ceph-users mailing list
> [email protected]
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com [1]
Links:
------
[1] http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com