Hi, I got a functionnal and operationnal ceph cluster (in version 0.94.5), with 3 nodes (acting for MON and OSD), everything was fine.
I added a 4th osd node (same configuration than 3 others) and now cluster status is health warn (active+remapped). cluster e821c68f-995c-41a9-9c46-dbbd0a28b8c7 health HEALTH_WARN 256 pgs stuck unclean recovery 279/1245 objects degraded (22.410%) recovery 415/1245 objects misplaced (33.333%) pool rbd pg_num 128 > pgp_num 64 pool data pg_num 128 > pgp_num 100 monmap e1: 3 mons at {ceph-osd-1= 10.200.1.11:6789/0,ceph-osd-2=10.200.1.12:6789/0,ceph-osd-3=10.200.1.13:6789/0 } election epoch 4, quorum 0,1,2 ceph-osd-1,ceph-osd-2,ceph-osd-3 osdmap e57: 8 osds: 8 up, 8 in; 256 remapped pgs pgmap v948: 256 pgs, 2 pools, 1566 MB data, 415 objects 14929 MB used, 38237 MB / 55717 MB avail 279/1245 objects degraded (22.410%) 415/1245 objects misplaced (33.333%) 256 active+remapped OSD Tree root@ceph-osd-1:~# ceph osd tree ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY -8 4.00000 root default -7 4.00000 region eu-west-1 -5 1.00000 datacenter eu-west-1a -2 1.00000 host ceph-osd-1 0 1.00000 osd.0 up 1.00000 1.00000 1 1.00000 osd.1 up 1.00000 1.00000 -4 1.00000 host ceph-osd-3 4 1.00000 osd.4 up 1.00000 1.00000 5 1.00000 osd.5 up 1.00000 1.00000 -6 1.00000 datacenter eu-west-1b -3 1.00000 host ceph-osd-2 2 1.00000 osd.2 up 1.00000 1.00000 3 1.00000 osd.3 up 1.00000 1.00000 -9 1.00000 host ceph-osd-4 6 1.00000 osd.6 up 1.00000 1.00000 7 1.00000 osd.7 up 1.00000 1.00000 root@ceph-osd-1:~# I'm using this crush map : { "devices": [ { "id": 0, "name": "osd.0" }, { "id": 1, "name": "osd.1" }, { "id": 2, "name": "osd.2" }, { "id": 3, "name": "osd.3" }, { "id": 4, "name": "osd.4" }, { "id": 5, "name": "osd.5" }, { "id": 6, "name": "osd.6" }, { "id": 7, "name": "osd.7" } ], "types": [ { "type_id": 0, "name": "osd" }, { "type_id": 1, "name": "host" }, { "type_id": 2, "name": "chassis" }, { "type_id": 3, "name": "rack" }, { "type_id": 4, "name": "row" }, { "type_id": 5, "name": "pdu" }, { "type_id": 6, "name": "pod" }, { "type_id": 7, "name": "room" }, { "type_id": 8, "name": "datacenter" }, { "type_id": 9, "name": "region" }, { "type_id": 10, "name": "root" } ], "buckets": [ { "id": -2, "name": "ceph-osd-1", "type_id": 1, "type_name": "host", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 0, "weight": 65536, "pos": 0 }, { "id": 1, "weight": 65536, "pos": 1 } ] }, { "id": -3, "name": "ceph-osd-2", "type_id": 1, "type_name": "host", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 2, "weight": 65536, "pos": 0 }, { "id": 3, "weight": 65536, "pos": 1 } ] }, { "id": -4, "name": "ceph-osd-3", "type_id": 1, "type_name": "host", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 4, "weight": 65536, "pos": 0 }, { "id": 5, "weight": 65536, "pos": 1 } ] }, { "id": -5, "name": "eu-west-1a", "type_id": 8, "type_name": "datacenter", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": -2, "weight": 65536, "pos": 0 }, { "id": -4, "weight": 65536, "pos": 1 } ] }, { "id": -6, "name": "eu-west-1b", "type_id": 8, "type_name": "datacenter", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": -3, "weight": 65536, "pos": 0 }, { "id": -9, "weight": 65536, "pos": 1 } ] }, { "id": -7, "name": "eu-west-1", "type_id": 9, "type_name": "region", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": -5, "weight": 65536, "pos": 0 }, { "id": -6, "weight": 65536, "pos": 1 } ] }, { "id": -8, "name": "default", "type_id": 10, "type_name": "root", "weight": 262144, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": -7, "weight": 262144, "pos": 0 } ] }, { "id": -9, "name": "ceph-osd-4", "type_id": 1, "type_name": "host", "weight": 131072, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 6, "weight": 65536, "pos": 0 }, { "id": 7, "weight": 65536, "pos": 1 } ] } ], "rules": [ { "rule_id": 0, "rule_name": "replicated_ruleset", "ruleset": 0, "type": 1, "min_size": 1, "max_size": 10, "steps": [ { "op": "take", "item": -8, "item_name": "default" }, { "op": "choose_firstn", "num": 0, "type": "datacenter" }, { "op": "chooseleaf_firstn", "num": 1, "type": "host" }, { "op": "emit" } ] } ], "tunables": { "choose_local_tries": 0, "choose_local_fallback_tries": 0, "choose_total_tries": 50, "chooseleaf_descend_once": 1, "chooseleaf_vary_r": 1, "straw_calc_version": 1, "allowed_bucket_algs": 54, "profile": "hammer", "optimal_tunables": 0, "legacy_tunables": 0, "require_feature_tunables": 1, "require_feature_tunables2": 1, "require_feature_tunables3": 1, "has_v2_rules": 0, "has_v3_rules": 0, "has_v4_buckets": 0 } } I read a thread ( http://lists.ceph.com/pipermail/ceph-users-ceph.com/2013-November/006017.html) from this mailling list, I tried everything (tunnable to optimal, add more pg, use the same weight ), but I still got this issue. Do you have any ideas to fix this situation ?
_______________________________________________ ceph-users mailing list ceph-users@lists.ceph.com http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com