That's good!

So was the root cause is because the osd was full? What's your thought
about that?
Was there any reason to delete any files?

 Kinjo


On Sun, Jul 5, 2015 at 6:51 PM, Jacek Jarosiewicz <
jjarosiew...@supermedia.pl> wrote:

> ok, I got it working...
>
> first i manually deleted some files from the full osd, set the flag noout
> and restarted the osd daemon.
>
> then i waited a while for the cluster to backfill pgs, and after that the
> rados -p cache cache-try-flush-evict-all command went OK.
>
> I'm wondering though, because this happened to me before (hanging blocked
> request when osd's are near full), is there a better way maybe to solve
> this problem?
>
> J
>
> On 07/04/2015 03:55 PM, Jacek Jarosiewicz wrote:
>
>> Hi,
>>
>> I'm currently testing cache tier on ceph 0.94.2 - I've set up an erasure
>> coded pool with a cache pool on SSD drives, the cache mode is set to
>> writeback. I tried to fill the cache and see how it will flush objects
>> but the cache pool is full and I can't flush-evict any objects.. and
>> there are some requests blocked.
>>
>> I tried to restart osd daemon that is blocked but it didn't help. And
>> when a issue command: rados -p cache cache-flush-evict-all - the command
>> just hangs.
>>
>> How can I fix this?
>>
>> Here's my setup:
>>
>> root@cf01:~# ceph -s
>>      cluster 999ed979-3837-4919-bd41-9929f9d44548
>>       health HEALTH_ERR
>>              4 requests are blocked > 32 sec
>>              1 full osd(s)
>>              2 near full osd(s)
>>       monmap e1: 1 mons at {cf01=10.4.10.211:6789/0}
>>              election epoch 1, quorum 0 cf01
>>       osdmap e951: 14 osds: 14 up, 14 in
>>              flags full
>>        pgmap v32829: 768 pgs, 3 pools, 612 GB data, 153 kobjects
>>              1054 GB used, 55615 GB / 56670 GB avail
>>                   768 active+clean
>>
>> root@cf01:~# ceph osd df
>> ID WEIGHT  REWEIGHT SIZE   USE    AVAIL  %USE  VAR
>> 10 1.00000  1.00000   204G   179G 25917M 87.64 47.09
>> 11 1.00000  1.00000   204G   182G 22827M 89.12 47.88
>> 12 1.00000  1.00000   204G   147G 58599M 72.06 38.72
>> 13 1.00000  1.00000   204G   195G  9784M 95.34 51.22
>>   0 1.00000  1.00000  5585G 33104M  5552G  0.58  0.31
>>   1 1.00000  1.00000  5585G 25250M  5560G  0.44  0.24
>>   2 1.00000  1.00000  5585G 31215M  5554G  0.55  0.29
>>   3 1.00000  1.00000  5585G 43265M  5542G  0.76  0.41
>>   4 1.00000  1.00000  5585G 46252M  5539G  0.81  0.43
>>   5 1.00000  1.00000  5585G 30239M  5555G  0.53  0.28
>>   6 1.00000  1.00000  5585G 32174M  5553G  0.56  0.30
>>   7 1.00000  1.00000  5585G 27162M  5558G  0.47  0.26
>>   9 1.00000  1.00000  5585G 47511M  5538G  0.83  0.45
>>   8 1.00000  1.00000  5585G 42006M  5544G  0.73  0.39
>>                TOTAL 56670G  1054G 55615G  1.86
>> MIN/MAX VAR: 0.24/51.22  STDDEV: 45.24
>>
>>
>> root@cf02:~# ceph osd pool ls detail
>> pool 4 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
>> rjenkins pg_num 256 pgp_num 256 last_change 189 lfor 188 flags
>> hashpspool stripe_width 0
>> pool 7 'cache' replicated size 2 min_size 1 crush_ruleset 1 object_hash
>> rjenkins pg_num 256 pgp_num 256 last_change 873 flags
>> hashpspool,incomplete_clones tier_of 8 cache_mode writeback target_bytes
>> 440234147840 target_objects 1000000 hit_set
>> bloom{false_positive_probability: 0.05, target_size: 0, seed: 0} 3600s
>> x1 stripe_width 0
>>      removed_snaps [1~1]
>> pool 8 'ecpool' erasure size 4 min_size 3 crush_ruleset 2 object_hash
>> rjenkins pg_num 256 pgp_num 256 last_change 705 lfor 701 flags
>> hashpspool tiers 7 read_tier 7 write_tier 7 stripe_width 4128
>>      removed_snaps [1~1]
>>
>>
>> root@cf01:~# ceph daemon osd.10 dump_ops_in_flight
>> {
>>      "ops": [
>>          {
>>              "description": "osd_op(client.32158.0:2
>> rb.0.7c12.238e1f29.00000001591a [cache-flush] 7.cc760000
>> ack+read+ignore_cache+ignore_overlay+known_if_redirected e932)",
>>              "initiated_at": "2015-07-04 15:42:09.361984",
>>              "age": 574.793637,
>>              "duration": 0.000235,
>>              "type_data": [
>>                  "delayed",
>>                  {
>>                      "client": "client.32158",
>>                      "tid": 2
>>                  },
>>                  [
>>                      {
>>                          "time": "2015-07-04 15:42:09.361984",
>>                          "event": "initiated"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:42:09.362197",
>>                          "event": "reached_pg"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:42:09.362219",
>>                          "event": "waiting for blocked object"
>>                      }
>>                  ]
>>              ]
>>          },
>>          {
>>              "description": "osd_op(client.32169.0:2
>> rb.0.7c12.238e1f29.00000001591a [cache-flush] 7.cc760000
>> ack+read+ignore_cache+ignore_overlay+known_if_redirected e951)",
>>              "initiated_at": "2015-07-04 15:49:28.302955",
>>              "age": 135.852667,
>>              "duration": 0.000304,
>>              "type_data": [
>>                  "delayed",
>>                  {
>>                      "client": "client.32169",
>>                      "tid": 2
>>                  },
>>                  [
>>                      {
>>                          "time": "2015-07-04 15:49:28.302955",
>>                          "event": "initiated"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:49:28.303211",
>>                          "event": "reached_pg"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:49:28.303258",
>>                          "event": "waiting for blocked object"
>>                      }
>>                  ]
>>              ]
>>          },
>>          {
>>              "description": "osd_op(client.32151.0:1
>> rb.0.7c12.238e1f29.00000001591a [cache-flush] 7.cc760000 RETRY=1
>> ack+retry+read+ignore_cache+ignore_overlay+known_if_redirected e923)",
>>              "initiated_at": "2015-07-04 15:41:24.216643",
>>              "age": 619.938979,
>>              "duration": 0.441889,
>>              "type_data": [
>>                  "started",
>>                  {
>>                      "client": "client.32151",
>>                      "tid": 1
>>                  },
>>                  [
>>                      {
>>                          "time": "2015-07-04 15:41:24.216643",
>>                          "event": "initiated"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:41:24.269935",
>>                          "event": "reached_pg"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:41:24.334561",
>>                          "event": "reached_pg"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:41:24.604050",
>>                          "event": "reached_pg"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:41:24.658221",
>>                          "event": "reached_pg"
>>                      },
>>                      {
>>                          "time": "2015-07-04 15:41:24.658532",
>>                          "event": "started"
>>                      }
>>                  ]
>>              ]
>>          }
>>      ],
>>      "num_ops": 3
>> }
>>
>>
>>
>> # begin crush map
>> tunable choose_local_tries 0
>> tunable choose_local_fallback_tries 0
>> tunable choose_total_tries 50
>> tunable chooseleaf_descend_once 1
>>
>> # devices
>> device 0 osd.0
>> device 1 osd.1
>> device 2 osd.2
>> device 3 osd.3
>> device 4 osd.4
>> device 5 osd.5
>> device 6 osd.6
>> device 7 osd.7
>> device 8 osd.8
>> device 9 osd.9
>> device 10 osd.10
>> device 11 osd.11
>> device 12 osd.12
>> device 13 osd.13
>>
>> # types
>> type 0 osd
>> type 1 host
>> type 2 chassis
>> type 3 rack
>> type 4 row
>> type 5 pdu
>> type 6 pod
>> type 7 room
>> type 8 datacenter
>> type 9 region
>> type 10 root
>>
>> # buckets
>> host cf01 {
>>      id -2        # do not change unnecessarily
>>      # weight 3.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.0 weight 1.000
>>      item osd.1 weight 1.000
>>      item osd.2 weight 1.000
>> }
>> host cf02 {
>>      id -3        # do not change unnecessarily
>>      # weight 2.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.3 weight 1.000
>>      item osd.4 weight 1.000
>> }
>> host cf03 {
>>      id -4        # do not change unnecessarily
>>      # weight 3.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.5 weight 1.000
>>      item osd.6 weight 1.000
>>      item osd.7 weight 1.000
>> }
>> host cf04 {
>>      id -5        # do not change unnecessarily
>>      # weight 2.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.9 weight 1.000
>>      item osd.8 weight 1.000
>> }
>> root default {
>>      id -1        # do not change unnecessarily
>>      # weight 10.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item cf01 weight 3.000
>>      item cf02 weight 2.000
>>      item cf03 weight 3.000
>>      item cf04 weight 2.000
>> }
>> host ssd01 {
>>      id -6        # do not change unnecessarily
>>      # weight 1.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.10 weight 1.000
>> }
>> host ssd02 {
>>      id -7        # do not change unnecessarily
>>      # weight 1.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.11 weight 1.000
>> }
>> host ssd03 {
>>      id -8        # do not change unnecessarily
>>      # weight 1.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.12 weight 1.000
>> }
>> host ssd04 {
>>      id -9        # do not change unnecessarily
>>      # weight 1.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item osd.13 weight 1.000
>> }
>> root ssd {
>>      id -10        # do not change unnecessarily
>>      # weight 8.000
>>      alg straw
>>      hash 0    # rjenkins1
>>      item ssd01 weight 2.000
>>      item ssd02 weight 2.000
>>      item ssd03 weight 2.000
>>      item ssd04 weight 2.000
>> }
>>
>> # rules
>> rule default {
>>      ruleset 0
>>      type replicated
>>      min_size 1
>>      max_size 10
>>      step take default
>>      step chooseleaf firstn 0 type host
>>      step emit
>> }
>> rule cache {
>>      ruleset 1
>>      type replicated
>>      min_size 1
>>      max_size 10
>>      step take ssd
>>      step chooseleaf firstn 0 type host
>>      step emit
>> }
>> rule ecpool {
>>      ruleset 2
>>      type erasure
>>      min_size 3
>>      max_size 4
>>      step set_chooseleaf_tries 5
>>      step set_choose_tries 100
>>      step take default
>>      step chooseleaf indep 0 type host
>>      step emit
>> }
>>
>> # end crush map
>>
>>
>
> --
> Jacek Jarosiewicz
> Administrator Systemów Informatycznych
>
>
> ----------------------------------------------------------------------------------------
> SUPERMEDIA Sp. z o.o. z siedzibą w Warszawie
> ul. Senatorska 13/15, 00-075 Warszawa
> Sąd Rejonowy dla m.st.Warszawy, XII Wydział Gospodarczy Krajowego Rejestru
> Sądowego,
> nr KRS 0000029537; kapitał zakładowy 42.756.000 zł
> NIP: 957-05-49-503
> Adres korespondencyjny: ul. Jubilerska 10, 04-190 Warszawa
>
>
> ----------------------------------------------------------------------------------------
> SUPERMEDIA ->   http://www.supermedia.pl
> dostep do internetu - hosting - kolokacja - lacza - telefonia
> _______________________________________________
> ceph-users mailing list
> ceph-users@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>



-- 
Life w/ Linux <http://i-shinobu.hatenablog.com/>
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com

Reply via email to