I have an issue very similar to this thread: http://article.gmane.org/gmane.comp.file-systems.ceph.user/3197. I have 19 unfound objects that are part of a VM image I have already restored from backup. If I query pg 4.30 (the one with the unfound objects), it says it is still querying osd.8, looking for the unfound objects. Because of this, when I run:

# ceph pg 4.30 mark_unfound_lost revert
Error EINVAL: pg has 19 unfound objects but we haven't probed all sources, not marking lost

it refuses to revert them. It has been "querying" osd.8 for almost 2 days now, and there is only about 200GB of data on that OSD, so I don't see why it should take this long. How can I force it either to stop querying osd.8 or to revert the unfound objects?
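
In case it helps to know what I'm looking at: as far as I know the standard commands for inspecting unfound objects are the ones below. list_missing at least shows which 19 objects are affected, but neither seems to offer a way to unstick the "querying" state.

# ceph health detail
# ceph pg 4.30 list_missing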

Here is how I got into this state. I have only 6 OSDs total: 3 on one host (vashti) and 3 on another (zadok). I set the noout flag so I could reboot zadok; zadok was down for 2 minutes. When it came back up, Ceph began recovering the objects that had not yet been replicated. Before recovery finished, osd.6 (on vashti) died with I/O errors on the disk; the whole drive is unrecoverable. Since osd.6 held objects that had not yet had a chance to replicate to any OSD on zadok, those objects were lost. I cannot recover anything further from osd.6.
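
In case the exact sequence matters, setting the flag was just the standard command (this is my understanding of the normal procedure), followed by an ordinary OS reboot of zadok:

# ceph osd set noout
(then rebooted zadok; it was back up after about 2 minutes)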


Here is the output of "ceph pg 4.30 query":

{ "state": "active+recovering+degraded+remapped",
  "epoch": 20364,
  "up": [
        2,
        0],
  "acting": [
        1,
        2],
  "info": { "pgid": "4.30",
      "last_update": "20364'10377395",
      "last_complete": "0'0",
      "log_tail": "20161'10325373",
      "last_user_version": 10377395,
      "last_backfill": "MAX",
      "purged_snaps": "[1~7,10~4]",
      "history": { "epoch_created": 386,
          "last_epoch_started": 20323,
          "last_epoch_clean": 20161,
          "last_epoch_split": 0,
          "same_up_since": 20322,
          "same_interval_since": 20322,
          "same_primary_since": 20311,
          "last_scrub": "20118'10315975",
          "last_scrub_stamp": "2014-04-29 11:54:57.358096",
          "last_deep_scrub": "20050'10061396",
          "last_deep_scrub_stamp": "2014-04-24 11:39:40.313745",
          "last_clean_scrub_stamp": "2014-04-29 11:54:57.358096"},
      "stats": { "version": "20364'10377395",
          "reported_seq": "17957416",
          "reported_epoch": "20364",
          "state": "active+recovering+degraded+remapped",
          "last_fresh": "2014-05-01 10:00:51.210564",
          "last_change": "2014-05-01 09:03:31.708198",
          "last_active": "2014-05-01 10:00:51.210564",
          "last_clean": "2014-04-29 16:14:12.127562",
          "last_became_active": "0.000000",
          "last_unstale": "2014-05-01 10:00:51.210564",
          "mapping_epoch": 20317,
          "log_start": "20161'10325373",
          "ondisk_log_start": "20161'10325373",
          "created": 386,
          "last_epoch_clean": 20161,
          "parent": "0.0",
          "parent_split_bits": 0,
          "last_scrub": "20118'10315975",
          "last_scrub_stamp": "2014-04-29 11:54:57.358096",
          "last_deep_scrub": "20050'10061396",
          "last_deep_scrub_stamp": "2014-04-24 11:39:40.313745",
          "last_clean_scrub_stamp": "2014-04-29 11:54:57.358096",
          "log_size": 52022,
          "ondisk_log_size": 52022,
          "stats_invalid": "0",
          "stat_sum": { "num_bytes": 9078859264,
              "num_objects": 2598,
              "num_object_clones": 360,
              "num_object_copies": 0,
              "num_objects_missing_on_primary": 0,
              "num_objects_degraded": 0,
              "num_objects_unfound": 0,
              "num_read": 703887,
              "num_read_kb": 164523202,
              "num_write": 8785487,
              "num_write_kb": 69327327,
              "num_scrub_errors": 0,
              "num_shallow_scrub_errors": 0,
              "num_deep_scrub_errors": 0,
              "num_objects_recovered": 24428,
              "num_bytes_recovered": 93261249024,
              "num_keys_recovered": 0},
          "stat_cat_sum": {},
          "up": [
                2,
                0],
          "acting": [
                1,
                2]},
      "empty": 0,
      "dne": 0,
      "incomplete": 0,
      "last_epoch_started": 20323},
  "recovery_state": [
        { "name": "Started\/Primary\/Active",
          "enter_time": "2014-05-01 09:03:30.557244",
          "might_have_unfound": [
                { "osd": 0,
                  "status": "already probed"},
                { "osd": 2,
                  "status": "already probed"},
                { "osd": 6,
                  "status": "osd is down"},
                { "osd": 8,
                  "status": "querying"}],
          "recovery_progress": { "backfill_target": 2,
              "waiting_on_backfill": 0,
              "last_backfill_started": "0\/\/0\/\/-1",
              "backfill_info": { "begin": "0\/\/0\/\/-1",
                  "end": "0\/\/0\/\/-1",
                  "objects": []},
              "peer_backfill_info": { "begin": "0\/\/0\/\/-1",
                  "end": "0\/\/0\/\/-1",
                  "objects": []},
              "backfills_in_flight": [],
              "recovering": [],
              "pg_backend": { "pull_from_peer": [],
                  "pushing": []}},
          "scrub": { "scrubber.epoch_start": "0",
              "scrubber.active": 0,
              "scrubber.block_writes": 0,
              "scrubber.finalizing": 0,
              "scrubber.waiting_on": 0,
              "scrubber.waiting_on_whom": []}},
        { "name": "Started",
          "enter_time": "2014-05-01 09:03:29.347540"}]}

Thanks.
Kevin

