I've noticed the 3.0.0pre versions sometimes hanging while doing a local copy (through dirvish). This time a had a binary with debugging symbols, so I could do gdb backtraces. This is the result:
# ps -fe | grep rsync root 3712 3710 0 02:04 ? 00:00:03 rsync -vrltH --delete -pgo --stats -D --numeric-ids -x --exclude-from=/backup/0/oudeserver/laatste/exclude --link-dest=/backup/0/oudeserver/20071225/tree /var/lib/vservers/oudeserver/ /backup/0/oudeserver/laatste/tree root 3713 3712 0 02:04 ? 00:00:04 rsync -vrltH --delete -pgo --stats -D --numeric-ids -x --exclude-from=/backup/0/oudeserver/laatste/exclude --link-dest=/backup/0/oudeserver/20071225/tree /var/lib/vservers/oudeserver/ /backup/0/oudeserver/laatste/tree root 3714 3713 0 02:04 ? 00:00:03 rsync -vrltH --delete -pgo --stats -D --numeric-ids -x --exclude-from=/backup/0/oudeserver/laatste/exclude --link-dest=/backup/0/oudeserver/20071225/tree /var/lib/vservers/oudeserver/ /backup/0/oudeserver/laatste/tree # gdb -p 3712 (gdb) bt #0 0x00002ba78dcf3b63 in select () from /lib/libc.so.6 #1 0x0000000000423ee1 in read_timeout (fd=5, buf=0x7fff1d3ac5c0 "\001", len=4) at io.c:653 #2 0x00000000004243e2 in read_loop (fd=5, buf=0x7fff1d3ac5c0 "\001", len=4) at io.c:985 #3 0x00000000004244d5 in readfd_unbuffered (fd=5, buf=0x7fff1d3adc70 "", len=1) at io.c:1022 #4 0x0000000000424a93 in readfd (fd=5, buffer=0x7fff1d3adc70 "", N=1) at io.c:1144 #5 0x0000000000425b20 in read_ndx (f=5) at io.c:1749 #6 0x000000000040a957 in read_ndx_and_attrs (f_in=5, iflag_ptr=0x7fff1d3afe38, type_ptr=0x7fff1d3afe3f "", buf=0x7fff1d3add50 "", len_ptr=0x7fff1d3afe34) at rsync.c:218 #7 0x00000000004125ea in send_files (f_in=5, f_out=4) at sender.c:193 #8 0x00000000004194f7 in client_run (f_in=5, f_out=4, pid=3713, argc=1, argv=0x66f240) at main.c:1045 #9 0x000000000041a550 in main (argc=2, argv=0x66f240) at main.c:1304 Strace says: select(6, [5], [], NULL, {49, 420000} <unfinished ...> # gdb -p 3713 (gdb) bt #0 0x00002ba78dcf3b63 in select () from /lib/libc.so.6 #1 0x0000000000423ee1 in read_timeout (fd=3, buf=0x670be0 "\004", len=8184) at io.c:653 #2 0x0000000000424794 in readfd_unbuffered (fd=4, buf=0x7fff1d3ad520 "¡µ\001", len=4) at io.c:1008 #3 0x0000000000424a93 in readfd (fd=3, buffer=0x7fff1d3ad520 "¡µ\001", N=4) at io.c:1144 #4 0x0000000000424dcf in read_msg_fd () at io.c:342 #5 0x000000000040fd1f in generate_files (f_out=1, local_name=<value optimized out>) at generator.c:2089 #6 0x00000000004190b4 in do_recv (f_in=0, f_out=1, local_name=0x0) at main.c:848 #7 0x00000000004198a3 in start_server (f_in=0, f_out=1, argc=1, argv=<value optimized out>) at main.c:958 #8 0x000000000041af15 in child_main (argc=490388736, argv=0xffffffffffffffff) at main.c:965 #9 0x0000000000430b2e in local_child (argc=2, argv=0x7fff1d3aff00, f_in=0x7fff1d3b2f2c, f_out=0x7fff1d3b2f28, child_main=0x41af00 <child_main>) at pipe.c:150 #10 0x000000000041a4f9 in main (argc=2, argv=0x66f240) at main.c:464 Strace says: select(4, [3], [], NULL, {33, 0} <unfinished ...> # gdb -p 3714 (gdb) bt #0 0x00002ba78dcf3b63 in select () from /lib/libc.so.6 #1 0x0000000000423ee1 in read_timeout (fd=0, buf=0x7fff1d3a9500 "\001", len=4) at io.c:653 #2 0x00000000004243e2 in read_loop (fd=0, buf=0x7fff1d3a9500 "\001", len=4) at io.c:985 #3 0x00000000004244d5 in readfd_unbuffered (fd=0, buf=0x7fff1d3aabb0 "", len=1) at io.c:1022 #4 0x0000000000424a93 in readfd (fd=0, buffer=0x7fff1d3aabb0 "", N=1) at io.c:1144 #5 0x0000000000425b20 in read_ndx (f=0) at io.c:1749 #6 0x000000000040a957 in read_ndx_and_attrs (f_in=0, iflag_ptr=0x7fff1d3aed78, type_ptr=0x7fff1d3aed7f "", buf=0x7fff1d3acc90 "", len_ptr=0x7fff1d3aed74) at rsync.c:218 #7 0x0000000000410fce in recv_files (f_in=0, local_name=0x0) at receiver.c:408 #8 0x0000000000418fb2 in do_recv (f_in=0, f_out=1, local_name=0x0) at main.c:792 #9 0x00000000004198a3 in start_server (f_in=0, f_out=1, argc=1, argv=<value optimized out>) at main.c:958 #10 0x000000000041af15 in child_main (argc=490378112, argv=0xffffffffffffffff) at main.c:965 #11 0x0000000000430b2e in local_child (argc=2, argv=0x7fff1d3aff00, f_in=0x7fff1d3b2f2c, f_out=0x7fff1d3b2f28, child_main=0x41af00 <child_main>) at pipe.c:150 #12 0x000000000041a4f9 in main (argc=2, argv=0x66f240) at main.c:464 Strace says: select(1, [0], [], NULL, {51, 310000} <unfinished ...> All the fds involved are sockets (pipes to one another). As far as I can see, all processes are waiting for something to appear on the pipes, but no one is prepared to break the silence :-) The curious thing is that it always seems to happen at the same point in the filesystem, last filename output is var/lib/dovecot/ssl-parameters.dat in the cases it hangs. (Other times that file is copied and it continues.) This is a pretty fast system, core 2 quad with 8GB memory; perhaps there's some race condition... Paul Slootman -- To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html