This is an automated email from the ASF dual-hosted git repository. ptaylor pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 8537420 ARROW-5100: [JS] Remove swap while collapsing contiguous buffers 8537420 is described below commit 853742021ce1dc3123c614c1f4b12e8050b11a3c Author: ptaylor <paul.e.tay...@me.com> AuthorDate: Fri Apr 12 11:20:29 2019 -0700 ARROW-5100: [JS] Remove swap while collapsing contiguous buffers Fixes https://issues.apache.org/jira/browse/ARROW-5100 Seems like this is a fairly rare edge case, but really perplexing when it happens. I saw it when manually creating Arrow Columns from data sliced out of an ArrayBuffer pool, and just so happened to get an overlapping + out-of-order set of buffers. I added a test case that demonstrates the issue too. Description from the issue: > We collapse contiguous Uint8Arrays that share the same underlying ArrayBuffer and have overlapping byte ranges. This was done to maintain true zero-copy behavior when using certain node core streams that use a buffer pool internally, and could write chunks of the same logical Arrow Message at out-of-order byte offsets in the pool. > Unfortunately this can also lead to a bug where, in rare cases, buffers are swapped while writing Arrow Messages too. We could have a flag to indicate whether we think collapsing out-of-order same-buffer chunks is safe, but I'm not sure if we can always know that, so I'd prefer to take it out and incur the copy cost. Author: ptaylor <paul.e.tay...@me.com> Closes #4102 from trxcllnt/js/no-buffer-swap and squashes the following commits: 0aaf813d <ptaylor> ensure built asset names match the js version 2a5c5d51 <ptaylor> remove swap behavior from contiguous buffer collapse for safety --- js/gulp/arrow-task.js | 12 ++++++------ js/src/util/buffer.ts | 8 ++------ js/test/unit/table/serialize-tests.ts | 13 ++++++++++++- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js index 48e717e..0b95440 100644 --- a/js/gulp/arrow-task.js +++ b/js/gulp/arrow-task.js @@ -40,14 +40,14 @@ const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target) { const esnextUmdSourceMapsGlob = `${targetDir(`esnext`, `umd`)}/*.map`; return Observable.forkJoin( observableFromStreams(gulp.src(dtsGlob), gulp.dest(out)), // copy d.ts files - observableFromStreams(gulp.src(cjsGlob), gulp.dest(out)), // copy es2015 cjs files - observableFromStreams(gulp.src(cjsSourceMapsGlob), gulp.dest(out)), // copy es2015 cjs sourcemaps - observableFromStreams(gulp.src(esmSourceMapsGlob), gulp.dest(out)), // copy es2015 esm sourcemaps + observableFromStreams(gulp.src(cjsGlob), gulp.dest(out)), // copy esnext cjs files + observableFromStreams(gulp.src(cjsSourceMapsGlob), gulp.dest(out)), // copy esnext cjs sourcemaps + observableFromStreams(gulp.src(esmSourceMapsGlob), gulp.dest(out)), // copy esnext esm sourcemaps observableFromStreams(gulp.src(es5UmdSourceMapsGlob), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename - observableFromStreams(gulp.src(esnextUmdSourceMapsGlob), gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename - observableFromStreams(gulp.src(esmGlob), gulpRename((p) => { p.extname = '.mjs'; }), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs` + observableFromStreams(gulp.src(esnextUmdSourceMapsGlob), gulp.dest(out)), // copy esnext umd sourcemap files, but don't rename + observableFromStreams(gulp.src(esmGlob), gulpRename((p) => { p.extname = '.mjs'; }), gulp.dest(out)), // copy esnext esm files and rename to `.mjs` observableFromStreams(gulp.src(es5UmdGlob), gulpRename((p) => { p.basename += `.es5.min`; }), gulp.dest(out)), // copy es5 umd files and add `.min` - observableFromStreams(gulp.src(esnextUmdGlob), gulpRename((p) => { p.basename += `.es2015.min`; }), gulp.dest(out)), // copy es2015 umd files and add `.es2015.min` + observableFromStreams(gulp.src(esnextUmdGlob), gulpRename((p) => { p.basename += `.esnext.min`; }), gulp.dest(out)), // copy esnext umd files and add `.esnext.min` ).publish(new ReplaySubject()).refCount(); }))({}); diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index e0fb0fd..2c56b1c 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -32,15 +32,11 @@ function collapseContiguousByteRanges(chunks: Uint8Array[]) { for (let x, y, i = 0, j = 0, n = chunks.length; ++i < n;) { x = result[j]; y = chunks[i]; - // continue x and y don't share the same underlying ArrayBuffer - if (!x || !y || x.buffer !== y.buffer) { + // continue if x and y don't share the same underlying ArrayBuffer, or if x isn't before y + if (!x || !y || x.buffer !== y.buffer || y.byteOffset < x.byteOffset) { y && (result[++j] = y); continue; } - // swap if y starts before x - if (y.byteOffset < x.byteOffset) { - x = chunks[i]; y = result[j]; - } ({ byteOffset: xOffset, byteLength: xLen } = x); ({ byteOffset: yOffset, byteLength: yLen } = y); // continue if the byte ranges of x and y aren't contiguous diff --git a/js/test/unit/table/serialize-tests.ts b/js/test/unit/table/serialize-tests.ts index f337913..9dce2f5 100644 --- a/js/test/unit/table/serialize-tests.ts +++ b/js/test/unit/table/serialize-tests.ts @@ -18,7 +18,7 @@ import '../../jest-extensions'; import * as generate from '../../generate-test-data'; import { - Table, Schema, Field, DataType, Dictionary, Int32, Float32, Utf8, Null + Table, Schema, Field, DataType, Dictionary, Int32, Float32, Utf8, Null, Int32Vector } from '../../Arrow'; const toSchema = (...xs: [string, DataType][]) => new Schema(xs.map((x) => new Field(...x))); @@ -34,6 +34,17 @@ function createTable<T extends { [key: string]: DataType } = any>(schema: Schema describe('Table#serialize()', () => { + test(`doesn't swap the order of buffers that share the same underlying ArrayBuffer but are in a different order`, () => { + const values = new Int32Array([0, 1, 2, 3, 4, 5, 6, 7]); + const expected = values.slice(); + const x = Int32Vector.from(values.subarray(4, 8)); // back + const y = Int32Vector.from(values.subarray(0, 4)); // front + const source = Table.new([x, y], ['x', 'y']); + const table = Table.from(source.serialize()); + expect(table.getColumn('x').toArray()).toEqual(expected.subarray(4, 8)); + expect(table.getColumn('y').toArray()).toEqual(expected.subarray(0, 4)); + }); + test(`Table#empty round-trips through serialization`, () => { const source = Table.empty(); source.schema.metadata.set('foo', 'bar');