Dear Ceph experts,
I found something strange about the performance of my Ceph cluster:
reads are much slower than writes.
I have 3 machines running OSDs; each hosts 8 OSDs, one per RAID0 device
(each RAID0 made up of 2 HDDs). The OSD journal and data are on the same
device. All machines in the cluster have a 10Gb network.
I tested both Ceph RBD and CephFS, with the client on a machine outside
the cluster as well as on one of the OSD nodes (to rule out possible
network issues), and so on. All of these ended up with similar results:
writes can almost reach the network limit, about 1200 MB/s, while reads
only reach 350~450 MB/s.
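For what it's worth, the same asymmetry should also be reproducible below
the filesystem layer with rados bench against a pool. This is only a sketch
of the idea, not what I actually ran; the pool name and runtime are just
examples:

# write 4 MB objects into the pool for 30 s and keep them for the read pass
rados bench -p rbd 30 write --no-cleanup
# read the same objects back sequentially
rados bench -p rbd 30 seq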
To figure out why, I did an extra test using CephFS:
Version and Config:
[root@dl-disk1 ~]# ceph --version
ceph version 0.94.3 (95cefea9fd9ab740263bf8bb4796fd864d9afe2b)
[root@dl-disk1 ~]# cat /etc/ceph/ceph.conf
[global]
fsid = (hidden)
mon_initial_members = dl-disk1, dl-disk2, dl-disk3
mon_host = (hidden)
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
filestore_xattr_use_omap = true
OSD tree:
# ceph osd tree
ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 258.88000 root default
-2 87.28000 host dl-disk1
0 10.90999 osd.0 up 1.00000 1.00000
1 10.90999 osd.1 up 1.00000 1.00000
2 10.90999 osd.2 up 1.00000 1.00000
3 10.90999 osd.3 up 1.00000 1.00000
4 10.90999 osd.4 up 1.00000 1.00000
5 10.90999 osd.5 up 1.00000 1.00000
6 10.90999 osd.6 up 1.00000 1.00000
7 10.90999 osd.7 up 1.00000 1.00000
-3 87.28000 host dl-disk2
8 10.90999 osd.8 up 1.00000 1.00000
9 10.90999 osd.9 up 1.00000 1.00000
10 10.90999 osd.10 up 1.00000 1.00000
11 10.90999 osd.11 up 1.00000 1.00000
12 10.90999 osd.12 up 1.00000 1.00000
13 10.90999 osd.13 up 1.00000 1.00000
14 10.90999 osd.14 up 1.00000 1.00000
15 10.90999 osd.15 up 1.00000 1.00000
-4 84.31999 host dl-disk3
16 10.53999 osd.16 up 1.00000 1.00000
17 10.53999 osd.17 up 1.00000 1.00000
18 10.53999 osd.18 up 1.00000 1.00000
19 10.53999 osd.19 up 1.00000 1.00000
20 10.53999 osd.20 up 1.00000 1.00000
21 10.53999 osd.21 up 1.00000 1.00000
22 10.53999 osd.22 up 1.00000 1.00000
23 10.53999 osd.23 up 1.00000 1.00000
Pools and PGs (each pool has 128 PGs):
# ceph osd lspools
0 rbd,2 fs_meta,3 fs_data0,4 fs_data1,
# ceph pg dump pools
dumped pools in format plain
pg_stat objects mip degr misp unf bytes log disklog
pool 0 0 0 0 0 0 0 0 0
pool 2 20 0 0 0 0 356958 264 264
pool 3 3264 0 0 0 0 16106127360 14657 14657
pool 4 0 0 0 0 0 0 0 0
To simplify the problem, I made a new CRUSH rule so that the CephFS data
pool uses only OSDs on a single machine (dl-disk1 here), with size = 1.
# ceph osd crush rule dump osd_in_dl-disk1__ruleset
{
    "rule_id": 1,
    "rule_name": "osd_in_dl-disk1__ruleset",
    "ruleset": 1,
    "type": 1,
    "min_size": 1,
    "max_size": 10,
    "steps": [
        {
            "op": "take",
            "item": -2,
            "item_name": "dl-disk1"
        },
        {
            "op": "chooseleaf_firstn",
            "num": 0,
            "type": "osd"
        },
        {
            "op": "emit"
        }
    ]
}
# ceph osd pool get fs_data0 crush_ruleset
crush_ruleset: 1
# ceph osd pool get fs_data0 size
size: 1
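For completeness, a host-confined rule like this can be set up with commands
along these lines (a sketch of the idea, not necessarily the exact
invocations I used):

# create a rule that starts at the dl-disk1 host bucket and picks individual OSDs
ceph osd crush rule create-simple osd_in_dl-disk1__ruleset dl-disk1 osd
# point the CephFS data pool at the new rule (ruleset id 1) and drop replication to 1
ceph osd pool set fs_data0 crush_ruleset 1
ceph osd pool set fs_data0 size 1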
Here is the test.
On a client machine, I used dd to write a 4 GB file to CephFS, and
watched dstat on the OSD node dl-disk1:
[root@client ~]# dd of=/mnt/cephfs/4Gfile if=/dev/zero bs=4096k count=1024
1024+0 records in
1024+0 records out
4294967296 bytes (4.3 GB) copied, 3.69993 s, 1.2 GB/s
[root@dl-disk1 ~]# dstat ...
----total-cpu-usage---- ------memory-usage----- -net/total- --dsk/sdb---dsk/sdc---dsk/sdd---dsk/sde---dsk/sdf---dsk/sdg---dsk/sdh---dsk/sdi--
usr sys idl wai hiq siq| used buff cach free| recv send| read writ: read writ: read writ: read writ: read writ: read writ: read writ: read writ
 0  0 100  0  0  0|3461M 67.2M 15.1G 44.3G|  19k   20k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3461M 67.2M 15.1G 44.3G|  32k   32k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 8 18  74  0  0  0|3364M 67.2M 11.1G 48.4G| 391k  391k| 0 2712k: 0 1096k: 0 556k: 0 1084k: 0 1200k: 0 1196k: 0 688k: 0 1252k
 0  0 100  0  0  0|3364M 67.2M 11.1G 48.4G|  82k  127k| 0 0 : 0 0 : 0 0 : 0 928k: 0 540k: 0 0 : 0 0 : 0 0
 8 16  72  3  0  1|3375M 67.2M 11.8G 47.7G| 718M 2068k| 0 120M: 0 172M: 0 76M: 0 220M: 0 188M: 16k 289M: 0 53M: 0 36M
 6 13  77  4  0  1|3391M 67.2M 12.3G 47.1G| 553M 1517k| 0 160M: 0 176M: 0 88M: 0 208M: 0 225M: 0 213M: 0 8208k: 0 49M
 6 13  77  3  0  1|3408M 67.2M 12.9G 46.6G| 544M 1272k| 0 212M: 0 8212k: 0 36M: 0 0 : 0 37M: 0 3852k: 0 497M: 0 337M
 0  0  99  0  0  0|3407M 67.3M 12.9G 46.6G|  53k  114k| 0 36M: 0 37M: 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3407M 67.3M 12.9G 46.6G|  68k  110k| 0 0 : 0 0 : 0 0 : 0 36M: 0 0 : 0 0 : 0 0 : 0 0
 0  0  99  0  0  0|3407M 67.3M 12.9G 46.6G|  38k  328k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 36M: 0 0
 0  1  99  0  0  0|3406M 67.3M 12.9G 46.6G|  11M  132k| 0 0 : 0 0 : 0 8224k: 0 0 : 0 0 : 0 32M: 0 0 : 0 36M
14 24  52  8  0  2|3436M 67.3M 13.8G 45.6G|1026M 2897k| 0 100M: 0 409M: 0 164M: 0 313M: 0 253M: 0 321M: 0 84M: 0 76M
14 24  34 27  0  1|3461M 67.3M 14.7G 44.7G| 990M 2565k| 0 354M: 0 72M: 0 0 : 0 164M: 0 313M: 0 188M: 0 308M: 0 333M
 4  9  70 16  0  0|3474M 67.3M 15.1G 44.3G| 269M  646k| 0 324M: 0 0 : 0 0 : 0 36M: 0 0 : 0 0 : 0 349M: 0 172M
 0  0  99  0  0  0|3474M 67.3M 15.1G 44.3G|  24k  315k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 37M: 0 0
 0  0  99  0  0  0|3474M 67.4M 15.1G 44.3G|  38k  102k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 36M: 0 0 : 0 36M
 0  0  99  0  0  0|3473M 67.4M 15.1G 44.3G|  22k   23k| 0 0 : 0 0 : 0 36M: 0 0 : 0 36M: 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3473M 67.4M 15.1G 44.3G|  39k   40k| 0 304k: 0 16k: 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3472M 67.4M 15.1G 44.3G|  28k   64k| 0 64M: 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3471M 67.4M 15.1G 44.3G|  31k   94k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3472M 67.4M 15.1G 44.3G|  38k   39k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
The throughput is 1.2 GB/s, which reaches the 10Gb network limit
(10 Gb/s ≈ 1.25 GB/s).
Then, on the client machine, I used dd to read that file back from
CephFS, redirecting the output to /dev/zero (or /dev/null) to rule out
the local HDD's I/O:
[root@client ~]# dd if=/mnt/cephfs/4Gfile of=/dev/zero bs=4096k count=1024
1024+0 records in
1024+0 records out
4294967296 bytes (4.3 GB) copied, 8.85246 s, 485 MB/s
[root@dl-disk1 ~]# dstat ...
----total-cpu-usage---- ------memory-usage----- -net/total- --dsk/sdb---dsk/sdc---dsk/sdd---dsk/sde---dsk/sdf---dsk/sdg---dsk/sdh---dsk/sdi--
usr sys idl wai hiq siq| used buff cach free| recv send| read writ: read writ: read writ: read writ: read writ: read writ: read writ: read writ
 0  0 100  0  0  0|3462M 67.4M 15.1G 44.3G|  36k   36k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3462M 67.4M 15.1G 44.3G|  22k   22k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3463M 67.4M 15.1G 44.3G|  49k   49k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  1  99  0  0  0|3464M 67.4M 15.1G 44.3G| 282k  111M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  5  93  0  0  0|3466M 67.4M 15.1G 44.3G|1171k  535M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  5  93  0  0  0|3467M 67.4M 15.1G 44.3G|1124k  535M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3467M 67.4M 15.1G 44.3G|1124k  535M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3467M 67.4M 15.1G 44.3G|1109k  527M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  93  0  0  0|3471M 67.4M 15.1G 44.3G|1044k  504M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3470M 67.4M 15.1G 44.3G|1031k  504M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  5  93  0  0  0|3470M 67.4M 15.1G 44.3G|1103k  527M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  93  0  0  0|3471M 67.5M 15.1G 44.3G|1084k  504M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3470M 67.5M 15.1G 44.3G|  25k   24k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3470M 67.5M 15.1G 44.3G|  43k   44k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3470M 67.5M 15.1G 44.3G|  22k   23k| 0 48k: 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  35k   38k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  23k   85k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  44k   44k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  24k   25k| 0 12k: 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  45k   43k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3468M 67.5M 15.1G 44.3G|  17k   18k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
The throughput here was only 400~500 MB/s.
I noticed that there was NO disk I/O during the read, which means all
the objects of the file were already cached in memory on the OSD node.
Thus, the HDDs do NOT seem to be the cause of the lower throughput.
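One way to double-check that assumption would be to flush the page cache on
the OSD node and repeat the read, so the objects have to come off the RAID0
devices; a sketch, to be run on dl-disk1:

# flush dirty data, then drop the page cache, dentries and inodes
sync
echo 3 > /proc/sys/vm/drop_caches

After that, re-running the dd read on the client should show whether the
HDDs become the bottleneck.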
I also tried reading with cat (in case dd does not trigger filesystem
read-ahead), and ended up with a similar result:
[root@client ~]# time cat /mnt/cephfs/4Gfile > /dev/zero
real 0m9.352s
user 0m0.002s
sys 0m4.147s
[root@dl-disk1 ~]# dstat ...
----total-cpu-usage---- ------memory-usage----- -net/total- --dsk/sdb---dsk/sdc---dsk/sdd---dsk/sde---dsk/sdf---dsk/sdg---dsk/sdh---dsk/sdi--
usr sys idl wai hiq siq| used buff cach free| recv send| read writ: read writ: read writ: read writ: read writ: read writ: read writ: read writ
 0  0 100  0  0  0|3465M 67.5M 15.1G 44.3G|  23k   22k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3465M 67.5M 15.1G 44.3G|  17k   18k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3465M 67.5M 15.1G 44.3G|  37k   37k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 1  2  97  0  0  0|3466M 67.5M 15.1G 44.3G| 633k  280M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3467M 67.5M 15.1G 44.3G|1057k  498M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3470M 67.5M 15.1G 44.3G|1078k  498M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3470M 67.5M 15.1G 44.3G| 996k  486M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3469M 67.5M 15.1G 44.3G| 988k  489M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3469M 67.5M 15.1G 44.3G|1012k  489M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3470M 67.5M 15.1G 44.3G|1017k  497M| 0 0 : 0 8192B: 0 28k: 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3469M 67.5M 15.1G 44.3G|1032k  498M| 0 0 : 0 0 : 0 0 : 0 8192B: 0 104k: 0 0 : 0 0 : 0 0
 2  4  94  0  0  0|3469M 67.5M 15.1G 44.3G|1025k  496M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 40k: 0 80k: 0 0
 0  1  99  0  0  0|3469M 67.5M 15.1G 44.3G| 127k   52M| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 120k
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  21k   21k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  66k   66k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
 0  0 100  0  0  0|3469M 67.5M 15.1G 44.3G|  35k   38k| 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
The average throughput is about 4 GiB / 9.35 s ≈ 438 MiB/s. Again, this
is unlikely to be an HDD issue.
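One client-side thing I still want to rule out is the kernel CephFS
read-ahead window, which (as far as I understand) can be enlarged at mount
time with the rasize option; a sketch, with a placeholder monitor address
and an example value only:

# rasize is the read-ahead size in bytes; 64 MB here is just an example value
mount -t ceph <mon-addr>:6789:/ /mnt/cephfs -o name=admin,secretfile=/etc/ceph/admin.secret,rasize=67108864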
I'm sure the network can reach 10Gb in both directions (verified with
iperf and other tests), and there is no other user process consuming
bandwidth.
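The iperf check was roughly along these lines (a sketch, not the exact
commands and output):

# on dl-disk1
iperf -s
# on the client, 4 parallel streams for 30 s; repeat with the roles swapped
# to test the other direction
iperf -c dl-disk1 -P 4 -t 30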
Could you please help me find the main reason for this issue? Thank you.
Best Regards,
FaHui
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com