Hi everyone, Currently, apr_file_write() can cause an excessive amount of syscalls for buffered files on Windows in some situations. This happens because for buffered files, writing is implemented with a loop that keeps copying the data to the internal 4KB buffer and writing the contents of this buffer to disk with WriteFile(). In other words, performing a 100 KB write currently results in 25 consecutive WriteFile(4096) syscalls.
This patch series reduces the amount of syscalls in such situations by performing a single WriteFile() call without any buffering, if possible. If some data is already buffered, then the buffer is first filled, flushed and the remaining part of the data is written with a single WriteFile(). My measurements indicate that writing in larger chunks and avoiding such syscalls can save a certain part of the CPU time. For example, "svn import" does both small and large buffered writes with the following pattern: write: nbytes = 9 write: nbytes = 5 write: nbytes = 86267 write: nbytes = 9 write: nbytes = 5 write: nbytes = 86413 write: nbytes = 9 write: nbytes = 5 write: nbytes = 86415 write: nbytes = 9 write: nbytes = 5 write: nbytes = 67680 ... When I test "svn import" for a large file before and after this patch, I see that - the amount of syscalls decreases from 199,403 to 17,061 and - the overall CPU time decreases from 7.28 s to 6.68 s. (I think that's a lot, considering that "svn import" does a many other CPU-intensive operations.) The implementation is split into three dependent patches. The first two patches lay the necessary groundwork by factoring out a couple of helper functions. The third patch is the core change that implements the described optimization. The log messages are included in the beginning of each patch file. Thanks, Evgeny Kotkov
Factor out a helper function that adapts WriteFile() to apr_size_t instead of DWORD. * file_io/win32/readwrite.c (write_helper): New helper function, factored out from ... (apr_file_flush): ...here. Patch by: Evgeny Kotkov <evgeny.kotkov {at} visualsvn.com> Index: file_io/win32/readwrite.c =================================================================== --- file_io/win32/readwrite.c (revision 1805607) +++ file_io/win32/readwrite.c (working copy) @@ -257,6 +257,38 @@ APR_DECLARE(apr_status_t) apr_file_rotating_manual return APR_ENOTIMPL; } +/* Helper function that adapts WriteFile() to apr_size_t instead + * of DWORD. */ +static apr_status_t write_helper(HANDLE filehand, const char *buf, + apr_size_t len, apr_size_t *pwritten) +{ + apr_size_t remaining = len; + + *pwritten = 0; + do { + DWORD to_write; + DWORD written; + + if (remaining > APR_DWORD_MAX) { + to_write = APR_DWORD_MAX; + } + else { + to_write = (DWORD)remaining; + } + + if (!WriteFile(filehand, buf, to_write, &written, NULL)) { + *pwritten += written; + return apr_get_os_error(); + } + + *pwritten += written; + remaining -= written; + buf += written; + } while (remaining); + + return APR_SUCCESS; +} + APR_DECLARE(apr_status_t) apr_file_write(apr_file_t *thefile, const void *buf, apr_size_t *nbytes) { apr_status_t rv; @@ -576,35 +608,15 @@ APR_DECLARE(apr_status_t) apr_file_gets(char *str, APR_DECLARE(apr_status_t) apr_file_flush(apr_file_t *thefile) { if (thefile->buffered) { - DWORD numbytes, written = 0; apr_status_t rc = 0; - char *buffer; - apr_size_t bytesleft; if (thefile->direction == 1 && thefile->bufpos) { - buffer = thefile->buffer; - bytesleft = thefile->bufpos; + apr_size_t written; - do { - if (bytesleft > APR_DWORD_MAX) { - numbytes = APR_DWORD_MAX; - } - else { - numbytes = (DWORD)bytesleft; - } + rc = write_helper(thefile->filehand, thefile->buffer, + thefile->bufpos, &written); + thefile->filePtr += written; - if (!WriteFile(thefile->filehand, buffer, numbytes, &written, NULL)) { - rc = apr_get_os_error(); - thefile->filePtr += written; - break; - } - - thefile->filePtr += written; - bytesleft -= written; - buffer += written; - - } while (bytesleft > 0); - if (rc == 0) thefile->bufpos = 0; }
Factor out a helper function that contains the implementation of writing to buffered files. * file_io/win32/readwrite.c (write_buffered): New helper function, factored out from ... (apr_file_write): ...here. Patch by: Evgeny Kotkov <evgeny.kotkov {at} visualsvn.com> Index: file_io/win32/readwrite.c =================================================================== --- file_io/win32/readwrite.c (revision 1805607) +++ file_io/win32/readwrite.c (working copy) @@ -289,6 +289,39 @@ static apr_status_t write_helper(HANDLE filehand, return APR_SUCCESS; } +static apr_status_t write_buffered(apr_file_t *thefile, const char *buf, + apr_size_t len) +{ + apr_size_t blocksize; + apr_status_t rv; + + if (thefile->direction == 0) { + /* Position file pointer for writing at the offset we are logically reading from */ + apr_off_t offset = thefile->filePtr - thefile->dataRead + thefile->bufpos; + DWORD offlo = (DWORD)offset; + LONG offhi = (LONG)(offset >> 32); + if (offset != thefile->filePtr) + SetFilePointer(thefile->filehand, offlo, &offhi, FILE_BEGIN); + thefile->bufpos = thefile->dataRead = 0; + thefile->direction = 1; + } + + rv = 0; + while (rv == 0 && len > 0) { + if (thefile->bufpos == thefile->bufsize) /* write buffer is full */ + rv = apr_file_flush(thefile); + + blocksize = len > thefile->bufsize - thefile->bufpos ? + thefile->bufsize - thefile->bufpos : len; + memcpy(thefile->buffer + thefile->bufpos, buf, blocksize); + thefile->bufpos += blocksize; + buf += blocksize; + len -= blocksize; + } + + return rv; +} + APR_DECLARE(apr_status_t) apr_file_write(apr_file_t *thefile, const void *buf, apr_size_t *nbytes) { apr_status_t rv; @@ -309,38 +342,10 @@ APR_DECLARE(apr_status_t) apr_file_write(apr_file_ } if (thefile->buffered) { - char *pos = (char *)buf; - apr_size_t blocksize; - apr_size_t size = *nbytes; - if (thefile->flags & APR_FOPEN_XTHREAD) { apr_thread_mutex_lock(thefile->mutex); } - - if (thefile->direction == 0) { - /* Position file pointer for writing at the offset we are logically reading from */ - apr_off_t offset = thefile->filePtr - thefile->dataRead + thefile->bufpos; - DWORD offlo = (DWORD)offset; - LONG offhi = (LONG)(offset >> 32); - if (offset != thefile->filePtr) - SetFilePointer(thefile->filehand, offlo, &offhi, FILE_BEGIN); - thefile->bufpos = thefile->dataRead = 0; - thefile->direction = 1; - } - - rv = 0; - while (rv == 0 && size > 0) { - if (thefile->bufpos == thefile->bufsize) /* write buffer is full */ - rv = apr_file_flush(thefile); - - blocksize = size > thefile->bufsize - thefile->bufpos ? - thefile->bufsize - thefile->bufpos : size; - memcpy(thefile->buffer + thefile->bufpos, pos, blocksize); - thefile->bufpos += blocksize; - pos += blocksize; - size -= blocksize; - } - + rv = write_buffered(thefile, buf, *nbytes, nbytes); if (thefile->flags & APR_FOPEN_XTHREAD) { apr_thread_mutex_unlock(thefile->mutex); }
Win32: Improve apr_file_write() performance on buffered files by minimizing the amount of WriteFile() syscalls when possible. Previously, writing has been implemented with a loop that keeps copying the data to the internal 4KB buffer and writing this buffer to disk by calling WriteFile(4096). This patch reduces the amount of syscalls in such situations by performing a single WriteFile() call without any buffering, if possible. If some data is already buffered, then the buffer is first filled, flushed and the remaining part of the data is written with a single WriteFile(). * file_io/win32/readwrite.c (write_buffered): Perform all the writing in one or two calls to WriteFile(), depending on whether we have something in the buffer, or not. Return an appropriate number of written bytes to satisfy the apr_file_write() function contract. (apr_file_write): Adjust call to write_buffered(). * test/testfile.c (test_large_write_buffered, test_two_large_writes_buffered, test_write_buffered_spanning_over_bufsize): New tests. (testfile): Run the new tests. Patch by: Evgeny Kotkov <evgeny.kotkov {at} visualsvn.com> Index: file_io/win32/readwrite.c =================================================================== --- file_io/win32/readwrite.c (revision 1805607) +++ file_io/win32/readwrite.c (working copy) @@ -290,10 +290,10 @@ static apr_status_t write_helper(HANDLE filehand, } static apr_status_t write_buffered(apr_file_t *thefile, const char *buf, - apr_size_t len) + apr_size_t len, apr_size_t *pwritten) { - apr_size_t blocksize; - apr_status_t rv; + apr_size_t remaining = len; + apr_size_t buf_free; if (thefile->direction == 0) { /* Position file pointer for writing at the offset we are logically reading from */ @@ -306,20 +306,47 @@ static apr_status_t write_buffered(apr_file_t *the thefile->direction = 1; } - rv = 0; - while (rv == 0 && len > 0) { - if (thefile->bufpos == thefile->bufsize) /* write buffer is full */ - rv = apr_file_flush(thefile); + buf_free = thefile->bufsize - thefile->bufpos; + /* Buffer the incoming data if we already have something in the + * buffer or if this data fully fits into it. */ + if (thefile->bufpos > 0 || remaining <= buf_free) { + apr_size_t blocksize; - blocksize = len > thefile->bufsize - thefile->bufpos ? - thefile->bufsize - thefile->bufpos : len; + if (remaining < buf_free) { + blocksize = remaining; + } + else { + blocksize = buf_free; + } memcpy(thefile->buffer + thefile->bufpos, buf, blocksize); thefile->bufpos += blocksize; buf += blocksize; - len -= blocksize; + remaining -= blocksize; } - return rv; + if (remaining) { + apr_status_t rv; + apr_size_t written; + + /* We still have data to write, flush the internal buffer if needed + * and write the remaining block with a single syscall. */ + if (thefile->bufpos > 0) { + rv = apr_file_flush(thefile); + if (rv) { + *pwritten = len - remaining; + return rv; + } + } + rv = write_helper(thefile->filehand, buf, remaining, &written); + thefile->filePtr += written; + if (rv) { + *pwritten = len - remaining + written; + return rv; + } + } + + *pwritten = len; + return APR_SUCCESS; } APR_DECLARE(apr_status_t) apr_file_write(apr_file_t *thefile, const void *buf, apr_size_t *nbytes) Index: test/testfile.c =================================================================== --- test/testfile.c (revision 1805607) +++ test/testfile.c (working copy) @@ -1446,6 +1446,141 @@ static void test_append(abts_case *tc, void *data) apr_file_close(f2); } +static void test_large_write_buffered(abts_case *tc, void *data) +{ + apr_status_t rv; + apr_file_t *f; + const char *fname = "data/testlarge_write_buffered.dat"; + apr_size_t len; + apr_size_t bytes_written; + apr_size_t bytes_read; + char *buf; + char *buf2; + + apr_file_remove(fname, p); + + rv = apr_file_open(&f, fname, + APR_FOPEN_CREATE | APR_FOPEN_WRITE | APR_FOPEN_BUFFERED, + APR_FPROT_OS_DEFAULT, p); + APR_ASSERT_SUCCESS(tc, "open test file for writing", rv); + + /* Test a single large write. */ + len = 80000; + buf = apr_palloc(p, len); + memset(buf, 'a', len); + rv = apr_file_write_full(f, buf, len, &bytes_written); + APR_ASSERT_SUCCESS(tc, "write to file", rv); + ABTS_INT_EQUAL(tc, (int)len, (int)bytes_written); + apr_file_close(f); + + rv = apr_file_open(&f, fname, APR_FOPEN_READ, + APR_FPROT_OS_DEFAULT, p); + APR_ASSERT_SUCCESS(tc, "open test file for reading", rv); + + buf2 = apr_palloc(p, len + 1); + rv = apr_file_read_full(f, buf2, len + 1, &bytes_read); + ABTS_INT_EQUAL(tc, APR_EOF, rv); + ABTS_INT_EQUAL(tc, (int)len, (int)bytes_read); + ABTS_TRUE(tc, memcmp(buf, buf2, len) == 0); + apr_file_close(f); + + apr_file_remove(fname, p); +} + +static void test_two_large_writes_buffered(abts_case *tc, void *data) +{ + apr_status_t rv; + apr_file_t *f; + const char *fname = "data/testtwo_large_writes_buffered.dat"; + apr_size_t len; + apr_size_t bytes_written; + apr_size_t bytes_read; + char *buf; + char *buf2; + + apr_file_remove(fname, p); + + rv = apr_file_open(&f, fname, + APR_FOPEN_CREATE | APR_FOPEN_WRITE | APR_FOPEN_BUFFERED, + APR_FPROT_OS_DEFAULT, p); + APR_ASSERT_SUCCESS(tc, "open test file for writing", rv); + + /* Test two consecutive large writes. */ + len = 80000; + buf = apr_palloc(p, len); + memset(buf, 'a', len); + + rv = apr_file_write_full(f, buf, len / 2, &bytes_written); + APR_ASSERT_SUCCESS(tc, "write to file", rv); + ABTS_INT_EQUAL(tc, (int)(len / 2), (int)bytes_written); + + rv = apr_file_write_full(f, buf, len / 2, &bytes_written); + APR_ASSERT_SUCCESS(tc, "write to file", rv); + ABTS_INT_EQUAL(tc, (int) (len / 2), (int)bytes_written); + + apr_file_close(f); + + rv = apr_file_open(&f, fname, APR_FOPEN_READ, + APR_FPROT_OS_DEFAULT, p); + APR_ASSERT_SUCCESS(tc, "open test file for reading", rv); + + buf2 = apr_palloc(p, len + 1); + rv = apr_file_read_full(f, buf2, len + 1, &bytes_read); + ABTS_INT_EQUAL(tc, APR_EOF, rv); + ABTS_INT_EQUAL(tc, (int) len, (int)bytes_read); + ABTS_TRUE(tc, memcmp(buf, buf2, len) == 0); + apr_file_close(f); + + apr_file_remove(fname, p); +} + +static void test_write_buffered_spanning_over_bufsize(abts_case *tc, void *data) +{ + apr_status_t rv; + apr_file_t *f; + const char *fname = "data/testwrite_buffered_spanning_over_bufsize.dat"; + apr_size_t len; + apr_size_t bytes_written; + apr_size_t bytes_read; + char *buf; + char *buf2; + + apr_file_remove(fname, p); + + rv = apr_file_open(&f, fname, + APR_FOPEN_CREATE | APR_FOPEN_WRITE | APR_FOPEN_BUFFERED, + APR_FPROT_OS_DEFAULT, p); + APR_ASSERT_SUCCESS(tc, "open test file for writing", rv); + + /* Test two writes than span over the default buffer size. */ + len = APR_BUFFERSIZE + 1; + buf = apr_palloc(p, len); + memset(buf, 'a', len); + + rv = apr_file_write_full(f, buf, APR_BUFFERSIZE - 1, &bytes_written); + APR_ASSERT_SUCCESS(tc, "write to file", rv); + ABTS_INT_EQUAL(tc, APR_BUFFERSIZE - 1, (int)bytes_written); + + rv = apr_file_write_full(f, buf, 2, &bytes_written); + APR_ASSERT_SUCCESS(tc, "write to file", rv); + ABTS_INT_EQUAL(tc, 2, (int)bytes_written); + + apr_file_close(f); + + rv = apr_file_open(&f, fname, APR_FOPEN_READ, + APR_FPROT_OS_DEFAULT, p); + APR_ASSERT_SUCCESS(tc, "open test file for reading", rv); + + buf2 = apr_palloc(p, len + 1); + rv = apr_file_read_full(f, buf2, len + 1, &bytes_read); + ABTS_INT_EQUAL(tc, APR_EOF, rv); + ABTS_INT_EQUAL(tc, (int)len, (int)bytes_read); + ABTS_TRUE(tc, memcmp(buf, buf2, len) == 0); + apr_file_close(f); + + apr_file_remove(fname, p); +} + abts_suite *testfile(abts_suite *suite) { suite = ADD_SUITE(suite) @@ -1495,6 +1630,9 @@ abts_suite *testfile(abts_suite *suite) abts_run_test(suite, test_buffer_set_get, NULL); abts_run_test(suite, test_xthread, NULL); abts_run_test(suite, test_append, NULL); + abts_run_test(suite, test_large_write_buffered, NULL); + abts_run_test(suite, test_two_large_writes_buffered, NULL); + abts_run_test(suite, test_write_buffered_spanning_over_bufsize, NULL); return suite; }