Eureka! Nadav Har'El wrote:
>On Mon, Mar 18, 2002, Malcolm Kavalsky wrote about "Re: pthreads question":
>
>>I asked one of the top Unix hackers that I know, and he said:
>>
>>"I would guess that if you do large af_unix transfers that are page
>>aligned then the system doesn't have to actually copy the data rather it
>>can share the page and do a copy on write. This preserves the socket
>>semantics and can be faster than memcpy. This was done many years ago in
>>Solaris."
>>
>>I wonder if digging deep enough in the kernel sources, will reveal this ...
>>
>
>You can try to check if this is the case, by following each send or memcpy
>by a memset() of the buffer. If the memcpy method suddenly becomes quicker,
>this explanation might be true.
>Strange though - how come malloc() returns page-aligned buffers? Does the
>Linux code really check for this rare and rather esoteric case (if you
>write to the buffer after sending it, and the kernel can't know you're
>writing whole pages, it will have to do a copy-on-write and do the copy
>anyway).
>

This is exactly what happened! I added a memset after the memcpy, and also
after sending the buffer. The results are:

Memcpy'ed and memsetted 1000 blocks of size 1048576 in 18 seconds => 55 Mbytes/second

Started receiving at Mon Mar 18 13:41:13 2002
Received 1048576000 bytes in 17 seconds over unix socket => 59 Mbytes/second

Started sending at Mon Mar 18 13:41:13 2002
Sent and memsetted 1000 blocks of size 1048576 in 17 seconds over unix socket => 58 Mbytes/second

(You will notice that I also added printing of the exact time that send and
receive started, to ensure there is no delay between the two.)

I also attach the source file for reference.
/*
 * Throughput benchmark: memcpy() between two heap buffers versus write()/read()
 * over a Unix-domain stream socket.
 *
 * Each side touches the buffer with memset() after every transfer, so that a
 * transfer which merely shares pages (instead of copying them) still has to
 * pay for the copy.
 *
 * All timings use time(), i.e. one-second resolution.
 */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define BUFSIZE   0x100000      /* 1 Megabyte */
#define NBLOCKS   1000
#define PORT_NAME "/tmp/foo"
#define MEGABYTE  0x100000      /* divisor used when reporting Mbytes/second */

/*
 * Convert a byte count and a duration into megabytes per second.
 *
 * time() only has one-second resolution, so a run may legitimately measure
 * zero seconds.  In that case the duration is clamped to one second, which
 * yields a lower bound for the rate instead of dividing by zero.
 */
static double mbytes_per_second(double bytes, time_t seconds)
{
    if (seconds < 1) {
        seconds = 1;
    }
    return bytes / ((double)MEGABYTE * (double)seconds);
}

/*
 * Fill *addr with the AF_UNIX address of PORT_NAME.  The structure is zeroed
 * first so that server and client can both pass sizeof(*addr) as the address
 * length and agree on the resulting name.
 */
static void set_address(struct sockaddr_un *addr)
{
    memset(addr, 0, sizeof *addr);
    addr->sun_family = AF_UNIX;
    snprintf(addr->sun_path, sizeof addr->sun_path, "%s", PORT_NAME);
}

/*
 * Receiving side: listen on PORT_NAME, accept one connection, read until the
 * peer closes it, then print the start time, byte count and throughput.
 *
 * On any setup failure a short message is written to stderr and the function
 * returns without printing statistics.
 */
void server(void)
{
    struct sockaddr_un addr, from;
    socklen_t from_len = sizeof from;
    int listen_fd = -1;
    int conn_fd = -1;
    int bound = 0;
    char *buf;
    double nbytes = 0.0;    /* double: a float cannot count ~1e9 bytes exactly */
    ssize_t n;
    time_t start_time, elapsed_time;

    buf = malloc(BUFSIZE);
    if (buf == NULL) {
        fprintf(stderr, "Bad malloc\n");
        return;
    }

    /* Create an unbound socket */
    listen_fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (listen_fd < 0) {
        fprintf(stderr, "Bad socket\n");
        goto cleanup;
    }

    /* A socket file left behind by an earlier run would make bind() fail. */
    unlink(PORT_NAME);

    set_address(&addr);
    if (bind(listen_fd, (struct sockaddr *)&addr, sizeof addr) < 0) {
        fprintf(stderr, "Bad bind\n");
        goto cleanup;
    }
    bound = 1;

    if (listen(listen_fd, 5) < 0) {
        fprintf(stderr, "Bad listen\n");
        goto cleanup;
    }

    conn_fd = accept(listen_fd, (struct sockaddr *)&from, &from_len);
    if (conn_fd < 0) {
        fprintf(stderr, "Bad accept\n");
        goto cleanup;
    }

    /*
     * The first read blocks until the client starts sending; the clock is
     * started only after it returns, so connection setup is not timed.
     * Its bytes are still counted, as before.
     */
    n = read(conn_fd, buf, BUFSIZE);
    start_time = time(NULL);
    while (n > 0) {
        nbytes += (double)n;
        n = read(conn_fd, buf, BUFSIZE);
    }
    elapsed_time = time(NULL) - start_time;

    close(conn_fd);
    conn_fd = -1;
    close(listen_fd);
    listen_fd = -1;
    unlink(PORT_NAME);
    bound = 0;

    printf("\nStarted receiving at %s", ctime(&start_time));
    printf("Received %10.0f bytes in %d seconds over unix socket =>",
           nbytes, (int)elapsed_time);
    printf(" %4.0f Mbytes/second \n", mbytes_per_second(nbytes, elapsed_time));

cleanup:
    if (conn_fd >= 0) {
        close(conn_fd);
    }
    if (listen_fd >= 0) {
        close(listen_fd);
    }
    if (bound) {
        unlink(PORT_NAME);
    }
    free(buf);
}

/*
 * Sending side: connect to PORT_NAME and write up to NBLOCKS buffers of
 * BUFSIZE bytes, overwriting the buffer after every write.  The loop stops
 * early on the first failed or short write.  Prints the start time, the
 * number of blocks actually sent and the throughput derived from that number.
 */
void client(void)
{
    struct sockaddr_un addr;
    int fd;
    int i;
    char *buf;
    time_t start_time, elapsed_time;

    buf = malloc(BUFSIZE);
    if (buf == NULL) {
        fprintf(stderr, "Bad malloc\n");
        return;
    }
    /* Give the first block defined contents before it is sent. */
    memset(buf, 0, BUFSIZE);

    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0) {
        fprintf(stderr, "Bad socket\n");
        free(buf);
        return;
    }

    set_address(&addr);
    if (connect(fd, (struct sockaddr *)&addr, sizeof addr) < 0) {
        fprintf(stderr, "Bad connect\n");
        close(fd);
        free(buf);
        return;
    }

    start_time = time(NULL);
    for (i = 0; i < NBLOCKS && write(fd, buf, BUFSIZE) == BUFSIZE; i++) {
        memset(buf, 'A', BUFSIZE);
    }
    elapsed_time = time(NULL) - start_time;
    close(fd);

    printf("\nStarted sending at %s", ctime(&start_time));
    printf("Sent and memsetted %d blocks of size %d in %d seconds over unix socket =>",
           i, BUFSIZE, (int)elapsed_time);
    /* Rate is based on the blocks really sent (i), not on NBLOCKS. */
    printf(" %d Mbytes/second \n",
           (int)mbytes_per_second((double)i * BUFSIZE, elapsed_time));

    free(buf);
}

/*
 * Baseline: copy a BUFSIZE buffer NBLOCKS times with memcpy(), overwriting
 * the destination with memset() after every copy, and print the throughput.
 */
void memcpy_benchmark(void)
{
    /*
     * Call through volatile function pointers: the destination is fully
     * overwritten right after each copy and never read, so an optimising
     * compiler may otherwise drop the memcpy() as a dead store.
     */
    void *(*volatile copy_fn)(void *, const void *, size_t) = memcpy;
    void *(*volatile fill_fn)(void *, int, size_t) = memset;
    char *src, *dst;
    time_t start_time, elapsed_time;
    int i;

    src = malloc(BUFSIZE);
    dst = malloc(BUFSIZE);
    if (src == NULL || dst == NULL) {
        fprintf(stderr, "Bad malloc\n");
        free(src);
        free(dst);
        return;
    }
    /* Give the source defined contents before it is copied. */
    memset(src, 0, BUFSIZE);

    start_time = time(NULL);
    for (i = 0; i < NBLOCKS; i++) {
        copy_fn(dst, src, BUFSIZE);
        fill_fn(dst, 'A', BUFSIZE);
    }
    elapsed_time = time(NULL) - start_time;

    printf("Memcpy'ed and memsetted %d blocks of size %d in %d seconds =>",
           NBLOCKS, BUFSIZE, (int)elapsed_time);
    printf(" %d Mbytes/second\n",
           (int)mbytes_per_second((double)NBLOCKS * BUFSIZE, elapsed_time));

    free(src);
    free(dst);
}

/*
 * Run server() in a forked child and client() in the parent, then wait for
 * the child to finish.
 */
void socket_benchmark(void)
{
    int status;
    pid_t pid;

    /*
     * Flush before forking: anything still sitting in the stdio buffer would
     * be duplicated into the child and printed twice.
     */
    fflush(stdout);

    pid = fork();
    if (pid < 0) {
        fprintf(stderr, "Bad fork\n");
        return;
    }

    if (pid == 0) {
        server();
        /* Leave here so the child does not run the parent's code path. */
        fflush(stdout);
        _exit(0);
    }

    sleep(1);   /* Dirty, but ensures client runs after server is ready */
    client();
    waitpid(pid, &status, 0);
}

/* Run the memcpy baseline first, then the socket benchmark. */
int main(void)
{
    memcpy_benchmark();
    socket_benchmark();
    return 0;
}