--- Original message ---
Asunto: Re: [ceph-users] write speed issue on RBD image
De: Russell E. Glaue <rgl...@cait.org>
Para: German Anders <gand...@despegar.com>
Fecha: Wednesday, 02/04/2014 19:20
the Switches that you are using: Dell PowerConnect 8132, 10GBaseT
version; We're using 2x10GbE LAGs for each host.
firmware of the HBA on the hosts: PERC 6/i RAID Card, latest firmware
are they Blades or "traditional" servers?: Traditional, DELL PER710 -
drives are 2TB Seagate
did you use any special options when formatting the XFS filesystem?
and/or mount options?: No
What hypervisor are you using?: KVM/libvirt/QEMU on CentOS 6.5
The dd test yields 1.6GB/s on the hard drives mounted Xfs volume
managed by the OSDs.
Using "oflag=direct", the tests on both the slow-write and fast-write
VMs report about 50MB/s.
Putting the dd in the background and running dstat in the foreground
reports about the same results on all tested hosts (see output below).
Now, without the "oflag=direct", and running dd in the background with
dstat in the foreground shows a different story (see output below).
For the fast-disk-write VMs, the data is written out in the first 7
iterations of the 'dstat --all' output, and the writing is at a higher
throughput, greater than 100MB/s.
But for the slow-disk-write VMs, 16 iterations of 'dstat --all'
pass before significant writes are performed, and the writing is at a
lower throughput, less than 50MB/s.
So this was a good test. For some reason, the VM OS is not writing
right away.
Any suggestions on how to address this? Rather than reinstalling the
OS - because I'd like to know how to prevent this from occurring
again.
results follow:
--- fast-disk-write VMs without oflag=direct ---
[root@fast1host tmp]# /bin/rm disk-test; dd if=/dev/zero of=disk-test
bs=1048576 count=512 &
[1] 4184
[root@fast1host tmp]# dstat --all
----total-cpu-usage---- -dsk/total- -net/total- ---paging--
---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
1 1 96 2 0 0| 79k 730k| 0 0 | 0 0 |1013
56 512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.362167 seconds, 1.5 GB/s
1 14 0 82 3 0| 0 117M|1062B 728B| 0 0 |1118
78
0 5 0 92 2 1| 0 128M| 366B 322B| 0 0 |1124
66
0 6 0 91 3 0| 0 145M| 426B 322B| 0 0 |1138
58
0 5 0 93 2 0| 0 128M| 486B 322B| 0 0 |1148
54
0 2 54 43 1 0| 0 70M| 426B 322B| 0 0 |1082
36
0 1 98 1 0 0| 0 440k| 606B 322B| 0 0 |1014
30
0 0 100 0 0 0| 0 0 | 246B 322B| 0 0 |1007
40
0 0 100 0 0 0| 0 0 | 426B 322B| 0 0 |1007
20
0 0 100 0 0 0| 0 0 | 306B 322B| 0 0 |1006
22
[1]+ Done dd if=/dev/zero of=disk-test bs=1048576
count=512
--- slow-disk-write VMs without oflag=direct ---
[root@slow1host tmp]# /bin/rm disk-test; dd if=/dev/zero of=disk-test
bs=1048576 count=512 &
[1] 25192
[root@slow1host tmp]# dstat --all
----total-cpu-usage---- -dsk/total- -net/total- ---paging--
---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
0 0 99 1 0 0| 384B 9.9k| 0 0 | 0 0 |1007
22
0 100 0 0 0 0| 0 0 | 678B 642B| 0 0 |1014
22
0 100 0 0 0 0| 0 0 | 408B 322B| 0 0 |1010
27
0 100 0 0 0 0| 0 0 | 546B 322B| 0 0 |1012
27
0 99 0 1 0 0| 0 440k| 486B 322B| 0 0 |1014
35
1 99 0 0 0 0| 0 0 | 246B 322B| 0 0 |1006
21
0 100 0 0 0 0| 0 0 | 552B 420B| 0 0 |1013
25
0 100 0 0 0 0| 0 0 | 426B 322B| 0 0 |1010
23
0 100 0 0 0 0| 0 0 | 546B 322B| 0 0 |1013
27
0 100 0 0 0 0| 0 48k| 306B 322B| 0 0 |1012
60
0 100 0 0 0 0| 0 0 | 576B 322B| 0 0 |1012
23
0 100 0 0 0 0| 0 0 | 366B 322B| 0 0 |1005
23
0 100 0 0 0 0| 0 0 | 336B 322B| 0 0 |1008
25
0 100 0 0 0 0| 0 0 | 606B 322B| 0 0 |1012
25
0 99 0 1 0 0| 0 24k| 336B 322B| 0 0 |1009
31
0 100 0 0 0 0| 0 0 | 366B 322B| 0 0 |1009
21
0 100 0 0 0 0| 0 35M| 486B 322B| 0 0 |1055
41
1 97 0 0 2 0| 0 44M| 306B 322B| 0 0 |1068
43
0 100 0 0 0 0| 0 34M| 426B 322B| 0 0 |1052
45
0 100 0 0 0 0| 0 32M| 606B 322B| 0 0 |1059
48
0 99 0 0 1 0| 0 40M| 426B 322B| 0 0 |1068
39
0 99 0 0 1 0| 0 40M| 732B 420B| 0 0 |1078
43
0 99 0 0 1 0| 0 40M| 306B 322B| 0 0 |1074
43
0 100 0 0 0 0| 0 32M| 426B 322B| 0 0 |1069
47
0 98 0 1 1 0| 0 40M| 606B 322B| 0 0 |1086
54
0 99 0 0 1 0| 0 32M| 426B 322B| 0 0 |1070
37 512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 26.9738 seconds, 19.9 MB/s
0 49 51 0 0 0| 0 24M| 624B 744B| 0 0 |1058
45
0 0 100 0 0 0| 0 0 | 516B 322B| 0 0 |1011
23
0 0 100 0 0 0| 0 0 | 426B 322B| 0 0 |1010
29
1 0 99 0 0 0| 0 0 | 696B 322B| 0 0 |1013
23
0 1 98 1 0 0| 0 48k| 306B 322B| 0 0 |1010
33
[1]+ Done dd if=/dev/zero of=disk-test bs=1048576
count=512
--- results on fast-disk-write VMs with oflag=direct ---
[root@fast1host tmp]# /bin/rm disk-test; dd if=/dev/zero of=disk-test
bs=1048576 count=512 oflag=direct &
[1] 4191
[root@fast1host tmp]# dstat --all
----total-cpu-usage---- -dsk/total- -net/total- ---paging--
---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
1 1 96 2 0 0| 78k 844k| 0 0 | 0 0 |1013
55
0 1 0 99 0 0|8192B 105M| 684B 306B| 0 0 |1206
142
0 2 0 97 1 0| 0 122M| 546B 322B| 0 0 |1246
140
0 2 0 98 0 0| 0 119M| 606B 322B| 0 0 |1238
144
0 1 0 98 1 0| 0 117M| 366B 322B| 0 0 |1232
140
0 1 0 99 0 0| 0 88M| 426B 322B| 0 0 |1179
118
0 1 0 99 0 0| 0 119M| 486B 322B| 0 0 |1236
140
0 2 0 97 1 0| 0 106M| 366B 322B| 0 0 |1210
128
0 1 0 99 0 0| 0 99M| 666B 322B| 0 0 |1210
120
0 1 0 99 0 0| 0 112M| 246B 322B| 0 0 |1235
136 512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 9.29427 seconds, 57.8 MB/s
1 0 94 5 0 0| 0 6144k| 564B 744B| 0 0 |1024
31
0 0 99 1 0 0| 0 80k| 606B 322B| 0 0 |1015
29
0 0 100 0 0 0| 0 0 | 246B 322B| 0 0 |1007
20
0 0 100 0 0 0| 0 0 | 426B 322B| 0 0 |1010
24
0 0 100 0 0 0| 0 0 | 486B 322B| 0 0 |1011
24
0 0 100 0 0 0| 0 0 | 486B 322B| 0 0 |1010
22
[1]+ Done dd if=/dev/zero of=disk-test bs=1048576
count=512 oflag=direct
--- results on slow-disk-write VMs with oflag=direct ---
[root@slow1host tmp]# /bin/rm disk-test; dd if=/dev/zero of=disk-test
bs=1048576 count=512 oflag=direct &
[1] 25264
[root@slow1host tmp]# dstat --all
----total-cpu-usage---- -dsk/total- -net/total- ---paging--
---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
0 0 99 1 0 0| 384B 11k| 0 0 | 0 0 |1007
22
0 2 0 98 0 0| 0 119M| 744B 306B| 0 0 |1227
145
0 1 0 98 1 0| 0 80M| 606B 322B| 0 0 |1172
109
0 1 0 99 0 0| 0 99M| 246B 322B| 0 0 |1216
125
1 1 0 98 0 0| 0 92M| 426B 322B| 0 0 |1206
115
0 1 0 98 1 0| 0 68M| 366B 322B| 0 0 |1149
95
0 2 0 98 0 0| 0 90M| 366B 322B| 0 0 |1187
123
0 8 0 91 1 0| 0 102M| 666B 322B| 0 0 |1205
141
0 1 0 99 0 0| 0 88M| 246B 322B| 0 0 |1178
115
0 1 0 99 0 0| 0 79M| 486B 322B| 0 0 |1150
105
0 2 0 98 0 0| 0 103M| 546B 322B| 0 0 |1206
126 512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 10.9108 seconds, 49.2 MB/s
0 1 24 74 1 0| 0 93M| 504B 744B| 0 0 |1187
133
0 0 100 0 0 0| 0 0 | 546B 322B| 0 0 |1010
25
0 0 100 0 0 0| 0 0 | 306B 322B| 0 0 |1008
25
0 0 100 0 0 0| 0 0 | 186B 322B| 0 0 |1007
25
0 0 100 0 0 0| 0 0 | 486B 322B| 0 0 |1009
25
0 0 100 0 0 0| 0 56k| 306B 322B| 0 0 |1010
33
0 0 100 0 0 0| 0 0 | 786B 644B| 0 0 |1016
21
0 0 100 0 0 0| 0 0 | 486B 322B| 0 0 |1010
27
0 0 100 0 0 0| 0 0 | 576B 322B| 0 0 |1011
23
0 0 99 0 0 1| 0 0 | 906B 322B| 0 0 |1018
27
1 0 99 0 0 0| 0 0 | 306B 322B| 0 0 |1007
25
[1]+ Done dd if=/dev/zero of=disk-test bs=1048576
count=512 oflag=direct
----- Original Message -----
From: "German Anders" <gand...@despegar.com>
To: "Russell E. Glaue" <rgl...@cait.org>
Cc: ceph-users@lists.ceph.com
Sent: Wednesday, April 2, 2014 3:50:26 PM
Subject: Re: [ceph-users] write speed issue on RBD image
Did you try those DD statements with the oflag=direct ? like:
dd if=/dev/zero of=disk-test bs=1048576 count=512 oflag=direct; dd
if=disk-test of=/dev/null bs=1048576 oflag=direct; /bin/rm disk-test
That way you are bypassing the host cache, and each write goes straight
to the disk and waits for the ACK.
Then see whether the performance numbers change or not, and also whether
the slow ones are any different. You could also run those commands
with an & at the end to run them in the background and then immediately
run $ dstat --all to see how much data is sent over the network
in/out and how much data is written to disk locally.
Hope this helps. It would also be great if you could share a little
bit more about the switches that you are using, the firmware of the HBA
on the hosts, are they Blades or "traditional" servers?, did you use any
special options when formatting the XFS filesystem? and/or mount
options? What hypervisor are you using?
Best regards,
German Anders
Field Storage Support Engineer
Despegar.com - IT Team
--- Original message ---
Asunto: [ceph-users] write speed issue on RBD image
De: Russell E. Glaue <rgl...@cait.org>
Para: <ceph-users@lists.ceph.com>
Fecha: Wednesday, 02/04/2014 15:12
Can someone recommend some testing I can do to further investigate why
this issue with slow-disk-write in the VM OS is occurring?
It seems the issue, detailed below, is perhaps related to the VM OS
running on the RADOS images in Ceph.
Issue:
I have a handful (like 10) of VMs running that, when tested, report
slow disk write speeds of 8MB/s-30MB/s. All of the remaining VMs (like
40) are reporting fast disk write speeds averaging 800MB/s-1.0GB/s.
There are no VMs reporting any disk write speeds in-between these
numbers. Restarting the OS on any of the VMs does not resolve the
issue.
After these tests, I took one of the VMs (image02host) with slow disk
write speed and reinstalled the basic OS, including repartitioning the
disk. I used the same RADOS image. After this, I retested this VM
(image02host) and all the other VMs with slow disk write speed. The
VM (image02host) that I reinstalled the OS on no longer has slow disk
write speeds. And, surprisingly, one of the other VMs
(another-host) with slow disk write speed started having fast write
speeds. All other VMs with slow disk write speed continued the same.
So, I do not necessarily believe the slow disk issue is directly
related to any kind of bug or outstanding issue with Ceph/RADOS. I
only have a couple guesses at this point:
1. Perhaps my OS install (or possibly configuration) is somehow
having an issue. I don't see how this is possible, however. For all the
VMs I have tested, they have all been kick-started with the same disk
and OS configuration. So they are virtually identical, but are having
either fast or slow disk write speed among them.
2. Perhaps I have some bad sectors or hard drive error at the hardware
level that is causing the issue. Perhaps the RADOS images of this
handful (like 10) of VMs are being written across a bad part of a hard
drive. This seems more likely to me. However, all drives across all
Ceph hosts are reporting good health.
So, now, I have come to the ceph-user list to ask for help. What are
some things I can do to test if there is some, possibly, bad sector or
hardware error on one of the hard drives, or some issue with Ceph
writing to part of one of the hard drives? Or are there any other
tests I can run to help determine possible issues?
And, secondly, if I wanted to move a RADOS image to new OSD blocks, is
there a way to do that without exporting and importing the image?
Perhaps, by resplattering the image and testing again to see if the
issue is resolved, this can help determine if the existing slow disk
write speed issue is how the image is splattered across OSDs -
indicating a bad OSD hard drive, or bad parts of an OSD hard drive.
Ceph Configuration:
* Ceph Version 0.72.2
* Three Ceph hosts, CentOS 6.5 OS, using Xfs
* All connected via 10GbE network
* KVM/QEMU Virtualization, with Ceph support
* Virtual Machines are all RHEL 5.9 32bit
* Our Ceph setup is very basic. One pool for all VM disks, all drives
on all Ceph hosts are in that pool.
* Ceph Caching is on:
rbd cache = true
rbd cache size = 128
rbd cache max dirty = 64
rbd cache target dirty = 64
rbd cache max dirty age = 10.0
Test:
Here I provide the test results of two VMs that are running on the
same Ceph host, using disk images from the same ceph pool, and were
cloned from the same RADOS snapshot. They both have the same exact KVM
configuration. However, they report dramatically different write speeds.
When I tested them both, they were running on the same Ceph host. In
fact, for the VM reporting slow disk write speed, I even had it run on
a different Ceph host to test, and it still gave the same disk write
speed results.
[root@linux]# rbd -p images info osimage01
rbd image 'osimage01':
size 28672 MB in 7168 objects
order 22 (4096 kB objects)
block_name_prefix: rbd_data.2bfb74b0dc51
format: 2
features: layering
[root@linux]# rbd -p images info osimage02
rbd image 'osimage02':
size 28672 MB in 7168 objects
order 22 (4096 kB objects)
block_name_prefix: rbd_data.2c1a2ae8944a
format: 2
features: layering
None of the images used are cloned.
[root@linux]# ssh image01host
image01host [65]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.760446 seconds, 706 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.214783 seconds, 2.5 GB/s
image01host [66]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.514886 seconds, 1.0 GB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.198433 seconds, 2.7 GB/s
image01host [67]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.562401 seconds, 955 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.223297 seconds, 2.4 GB/s
[root@linux]# ssh image02host
image02host [66]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 18.8284 seconds, 28.5 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.158142 seconds, 3.4 GB/s
image02host [67]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 29.1494 seconds, 18.4 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.244414 seconds, 2.2 GB/s
image02host [68]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 26.5817 seconds, 20.2 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.17213 seconds, 3.1 GB/s
((After reinstalling the OS on VM image02host using RADOS image
osimage02))
[root@image02host tmp]# dd if=/dev/zero of=disk-test bs=1048576
count=512; dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.453372 seconds, 1.2 GB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.145874 seconds, 3.7 GB/s
[root@image02host tmp]# dd if=/dev/zero of=disk-test bs=1048576
count=512; dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.591697 seconds, 907 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.175544 seconds, 3.1 GB/s
[root@image02host tmp]# dd if=/dev/zero of=disk-test bs=1048576
count=512; dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.599345 seconds, 896 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.164405 seconds, 3.3 GB/s
((As mentioned, surprisingly, this other host started having fast disk
write speeds only after image02host was reinstalled. But I am not
understanding why this would be related.))
another-host [65]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 7.88853 seconds, 68.1 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.273677 seconds, 2.0 GB/s
# image02host was reinstalled before the next command was issued #
another-host [66]% dd if=/dev/zero of=disk-test bs=1048576 count=512;
dd if=disk-test of=/dev/null bs=1048576; /bin/rm disk-test
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.533444 seconds, 1.0 GB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB) copied, 0.198121 seconds, 2.7 GB/s
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com