Farther back in the logfile, before it reports the missing objects,
I get these messages:
osd.33 pg_epoch: 256084 pg[8.484(unlocked)] enter Initial
osd.33 pg_epoch: 256084 pg[8.484(unlocked)] enter NotTrimming
It looks like it is processing placement group "8.484" on OSD 33. But
when I run "ceph pg 8.484 query" it looks like that placement group is
on two other OSDs [13,44] in a healthy state. Is this just a lingering
copy of the PG that needs to be manually removed?
# ceph pg 8.484 query
{
"state": "active+clean",
"snap_trimq": "[]",
"epoch": 256850,
"up": [
13,
44
],
"acting": [
13,
44
],
"actingbackfill": [
"13",
"44"
],
"info": {
"pgid": "8.484",
"last_update": "256816'16486",
"last_complete": "256816'16486",
"log_tail": "254158'13420",
"last_user_version": 16486,
"last_backfill": "MAX",
"purged_snaps": "[1~3a,44~1,46~1,4d~a,58~4,73~30,ba~11]",
"history": {
"epoch_created": 250794,
"last_epoch_started": 256849,
"last_epoch_clean": 256849,
"last_epoch_split": 0,
"same_up_since": 256848,
"same_interval_since": 256848,
"same_primary_since": 256848,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071"
},
"stats": {
"version": "256816'16486",
"reported_seq": "67588",
"reported_epoch": "256850",
"state": "active+clean",
"last_fresh": "2015-10-11 17:09:46.972382",
"last_change": "2015-10-11 16:30:32.646205",
"last_active": "2015-10-11 17:09:46.972382",
"last_peered": "2015-10-11 17:09:46.972382",
"last_clean": "2015-10-11 17:09:46.972382",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2015-10-11 17:09:46.972382",
"last_undegraded": "2015-10-11 17:09:46.972382",
"last_fullsized": "2015-10-11 17:09:46.972382",
"mapping_epoch": 256846,
"log_start": "254158'13420",
"ondisk_log_start": "254158'13420",
"created": 250794,
"last_epoch_clean": 256849,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071",
"log_size": 3066,
"ondisk_log_size": 3066,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 7075272704,
"num_objects": 1709,
"num_object_clones": 582,
"num_object_copies": 3418,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1709,
"num_whiteouts": 0,
"num_read": 92000,
"num_read_kb": 17658096,
"num_write": 23198,
"num_write_kb": 8868482,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 3928,
"num_bytes_recovered": 16447209472,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
13,
44
],
"acting": [
13,
44
],
"blocked_by": [],
"up_primary": 13,
"acting_primary": 13
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 256849,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0"
},
"history": []
}
},
"peer_info": [
{
"peer": "44",
"pgid": "8.484",
"last_update": "256816'16486",
"last_complete": "256816'16486",
"log_tail": "254158'13420",
"last_user_version": 16486,
"last_backfill": "MAX",
"purged_snaps": "[1~3a,44~1,46~1,4d~a,58~4,73~30,ba~11]",
"history": {
"epoch_created": 250794,
"last_epoch_started": 256849,
"last_epoch_clean": 256849,
"last_epoch_split": 0,
"same_up_since": 256848,
"same_interval_since": 256848,
"same_primary_since": 256848,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071"
},
"stats": {
"version": "256816'16486",
"reported_seq": "67580",
"reported_epoch": "256848",
"state": "active+undersized+degraded",
"last_fresh": "2015-10-11 16:30:10.656475",
"last_change": "2015-10-11 16:30:10.655808",
"last_active": "2015-10-11 16:30:10.656475",
"last_peered": "2015-10-11 16:30:10.656475",
"last_clean": "2015-10-11 11:21:32.645164",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2015-10-11 16:30:10.656475",
"last_undegraded": "2015-10-11 16:30:06.790836",
"last_fullsized": "2015-10-11 16:30:06.790836",
"mapping_epoch": 256846,
"log_start": "254158'13420",
"ondisk_log_start": "254158'13420",
"created": 250794,
"last_epoch_clean": 256847,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "254220'13929",
"last_scrub_stamp": "2015-10-09 20:18:15.856071",
"last_deep_scrub": "251785'8629",
"last_deep_scrub_stamp": "2015-10-04 10:33:37.209878",
"last_clean_scrub_stamp": "2015-10-09 20:18:15.856071",
"log_size": 3066,
"ondisk_log_size": 3066,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 7075272704,
"num_objects": 1709,
"num_object_clones": 582,
"num_object_copies": 3418,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 1709,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1709,
"num_whiteouts": 0,
"num_read": 92000,
"num_read_kb": 17658096,
"num_write": 23198,
"num_write_kb": 8868482,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 3928,
"num_bytes_recovered": 16447209472,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
13,
44
],
"acting": [
13,
44
],
"blocked_by": [],
"up_primary": 13,
"acting_primary": 13
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 256849,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0"
},
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2015-10-11 16:30:32.551400",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "0\/\/0\/\/-1",
"backfill_info": {
"begin": "0\/\/0\/\/-1",
"end": "0\/\/0\/\/-1",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2015-10-11 16:30:28.201561"
}
],
"agent_state": {}
}
On 2015-10-11 12:19 pm, Chris Taylor wrote:
> I have an OSD that went down while the cluster was recovering from another
> OSD being reweighted. The cluster appears to be stuck in recovery since the
> number of degraded and misplaced objects are not decreasing.
>
> It is a three node cluster in production and the pool size is 2. Ceph version
> 94.3.
>
> Here is a snip of the failing OSD log. The full log file was uploaded with
> ceph-post-file "ceph-post-file: dfcf6dff-11cb-49b0-81b8-60bf8ff898eb".
>
> 2015-10-11 10:45:06.182615 7f9270567900 20 read_log 254342'14799
> (251922'9664) delete d1aa1484/rb.0.ac3386.238e1f29.0000000bb0ad/44//8 by
> unknown.0.0:0 2015-10-11 00:40:34.981049
> 2015-10-11 10:45:06.182629 7f9270567900 20 read_log 254342'14800
> (251922'9665) modify d1aa1484/rb.0.ac3386.238e1f29.0000000bb0ad/head//8 by
> unknown.0.0:0 2015-10-11 00:40:34.981049
> 2015-10-11 10:45:06.182661 7f9270567900 20 read_log 6 divergent_priors
> 2015-10-11 10:45:06.184076 7f9270567900 10 read_log checking for missing
> items over interval (0'0,254342'14800]
> 2015-10-11 10:45:11.861683 7f9270567900 15 read_log missing
> 251925'9669,e9ea1484/rb.0.ac3386.238e1f29.000000187097/head//8
> 2015-10-11 10:45:11.861767 7f9270567900 15 read_log missing
> 251925'9668,e9ea1484/rb.0.ac3386.238e1f29.000000187097/44//8
> 2015-10-11 10:45:11.861823 7f9270567900 15 read_log missing
> 251925'9667,c4ea1484/rb.0.ac3386.238e1f29.00000022717d/head//8
> 2015-10-11 10:45:11.861877 7f9270567900 15 read_log missing
> 251925'9666,c4ea1484/rb.0.ac3386.238e1f29.00000022717d/68//8
> 2015-10-11 10:45:11.924425 7f9270567900 -1 osd/PGLog.cc: In function 'static
> void PGLog::read_log(ObjectStore*, coll_t, coll_t, ghobject_t, const
> pg_info_t&, std::map<eversion_t, hobject_t>&, PGLog::IndexedLog&,
> pg_missing_t&, std::ostringstream&, std::set<std::basic_string<char> >*)'
> thread 7f9270567900 time 2015-10-11 10:45:11.861976
> osd/PGLog.cc: 962: FAILED assert(oi.version == i->first)
>
> cluster d960d672-e035-413d-ba39-8341f4131760
> health HEALTH_WARN
> 54 pgs backfill
> 373 pgs degraded
> 1 pgs recovering
> 336 pgs recovery_wait
> 373 pgs stuck degraded
> 391 pgs stuck unclean
> 43 pgs stuck undersized
> 43 pgs undersized
> recovery 88034/14758314 objects degraded (0.597%)
> recovery 280423/14758314 objects misplaced (1.900%)
> recovery 28/7330234 unfound (0.000%)
> monmap e1: 3 mons at
> {ceph-mon1=10.20.0.11:6789/0,ceph-mon2=10.20.0.12:6789/0,ceph-mon3=10.20.0.13:6789/0}
> election epoch 6010, quorum 0,1,2 ceph-mon1,ceph-mon2,ceph-mon3
> osdmap e256816: 46 osds: 45 up, 45 in; 65 remapped pgs
> pgmap v19715504: 5184 pgs, 4 pools, 28323 GB data, 7158 kobjects
> 57018 GB used, 23027 GB / 80045 GB avail
> 88034/14758314 objects degraded (0.597%)
> 280423/14758314 objects misplaced (1.900%)
> 28/7330234 unfound (0.000%)
> 4790 active+clean
> 326 active+recovery_wait+degraded
> 36 active+undersized+degraded+remapped+wait_backfill
> 18 active+remapped+wait_backfill
> 6 active+recovery_wait+undersized+degraded+remapped
> 4 active+recovery_wait+degraded+remapped
> 3 active+clean+scrubbing+deep
> 1 active+recovering+undersized+degraded+remapped
> client io 11627 kB/s rd, 36433 B/s wr, 10 op/s
>
> Any help would be greatly appreciated!
>
> Thanks,
>
> Chris
>
> _______________________________________________
> ceph-users mailing list
> [email protected]
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com [1]
Links:
------
[1] http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com