This is an automated email from the ASF dual-hosted git repository.

ptaylor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8537420  ARROW-5100: [JS] Remove swap while collapsing contiguous 
buffers
8537420 is described below

commit 853742021ce1dc3123c614c1f4b12e8050b11a3c
Author: ptaylor <paul.e.tay...@me.com>
AuthorDate: Fri Apr 12 11:20:29 2019 -0700

    ARROW-5100: [JS] Remove swap while collapsing contiguous buffers
    
    Fixes https://issues.apache.org/jira/browse/ARROW-5100
    
    Seems like this is a fairly rare edge case, but really perplexing when it 
happens. I saw it when manually creating Arrow Columns from data sliced out of 
an ArrayBuffer pool, and just so happened to get an overlapping + out-of-order 
set of buffers. I added a test case that demonstrates the issue too.
    
    Description from the issue:
    > We collapse contiguous Uint8Arrays that share the same underlying 
ArrayBuffer and have overlapping byte ranges. This was done to maintain true 
zero-copy behavior when using certain node core streams that use a buffer pool 
internally, and could write chunks of the same logical Arrow Message at 
out-of-order byte offsets in the pool.
    
    > Unfortunately this can also lead to a bug where, in rare cases, buffers 
are swapped while writing Arrow Messages too. We could have a flag to indicate 
whether we think collapsing out-of-order same-buffer chunks is safe, but I'm 
not sure if we can always know that, so I'd prefer to take it out and incur the 
copy cost.
    
    Author: ptaylor <paul.e.tay...@me.com>
    
    Closes #4102 from trxcllnt/js/no-buffer-swap and squashes the following 
commits:
    
    0aaf813d <ptaylor> ensure built asset names match the js version
    2a5c5d51 <ptaylor> remove swap behavior from contiguous buffer collapse for 
safety
---
 js/gulp/arrow-task.js                 | 12 ++++++------
 js/src/util/buffer.ts                 |  8 ++------
 js/test/unit/table/serialize-tests.ts | 13 ++++++++++++-
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js
index 48e717e..0b95440 100644
--- a/js/gulp/arrow-task.js
+++ b/js/gulp/arrow-task.js
@@ -40,14 +40,14 @@ const arrowTask = ((cache) => memoizeTask(cache, function 
copyMain(target) {
     const esnextUmdSourceMapsGlob = `${targetDir(`esnext`, `umd`)}/*.map`;
     return Observable.forkJoin(
         observableFromStreams(gulp.src(dtsGlob),                 
gulp.dest(out)), // copy d.ts files
-        observableFromStreams(gulp.src(cjsGlob),                 
gulp.dest(out)), // copy es2015 cjs files
-        observableFromStreams(gulp.src(cjsSourceMapsGlob),       
gulp.dest(out)), // copy es2015 cjs sourcemaps
-        observableFromStreams(gulp.src(esmSourceMapsGlob),       
gulp.dest(out)), // copy es2015 esm sourcemaps
+        observableFromStreams(gulp.src(cjsGlob),                 
gulp.dest(out)), // copy esnext cjs files
+        observableFromStreams(gulp.src(cjsSourceMapsGlob),       
gulp.dest(out)), // copy esnext cjs sourcemaps
+        observableFromStreams(gulp.src(esmSourceMapsGlob),       
gulp.dest(out)), // copy esnext esm sourcemaps
         observableFromStreams(gulp.src(es5UmdSourceMapsGlob),    
gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename
-        observableFromStreams(gulp.src(esnextUmdSourceMapsGlob), 
gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename
-        observableFromStreams(gulp.src(esmGlob),       gulpRename((p) => { 
p.extname = '.mjs'; }),          gulp.dest(out)), // copy es2015 esm files and 
rename to `.mjs`
+        observableFromStreams(gulp.src(esnextUmdSourceMapsGlob), 
gulp.dest(out)), // copy esnext umd sourcemap files, but don't rename
+        observableFromStreams(gulp.src(esmGlob),       gulpRename((p) => { 
p.extname = '.mjs'; }),          gulp.dest(out)), // copy esnext esm files and 
rename to `.mjs`
         observableFromStreams(gulp.src(es5UmdGlob),    gulpRename((p) => { 
p.basename += `.es5.min`; }),    gulp.dest(out)), // copy es5 umd files and add 
`.min`
-        observableFromStreams(gulp.src(esnextUmdGlob), gulpRename((p) => { 
p.basename += `.es2015.min`; }), gulp.dest(out)), // copy es2015 umd files and 
add `.es2015.min`
+        observableFromStreams(gulp.src(esnextUmdGlob), gulpRename((p) => { 
p.basename += `.esnext.min`; }), gulp.dest(out)), // copy esnext umd files and 
add `.esnext.min`
     ).publish(new ReplaySubject()).refCount();
 }))({});
 
diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts
index e0fb0fd..2c56b1c 100644
--- a/js/src/util/buffer.ts
+++ b/js/src/util/buffer.ts
@@ -32,15 +32,11 @@ function collapseContiguousByteRanges(chunks: Uint8Array[]) 
{
     for (let x, y, i = 0, j = 0, n = chunks.length; ++i < n;) {
         x = result[j];
         y = chunks[i];
-        // continue x and y don't share the same underlying ArrayBuffer
-        if (!x || !y || x.buffer !== y.buffer) {
+        // continue if x and y don't share the same underlying ArrayBuffer, or 
if x isn't before y
+        if (!x || !y || x.buffer !== y.buffer || y.byteOffset < x.byteOffset) {
             y && (result[++j] = y);
             continue;
         }
-        // swap if y starts before x
-        if (y.byteOffset < x.byteOffset) {
-            x = chunks[i]; y = result[j];
-        }
         ({ byteOffset: xOffset, byteLength: xLen } = x);
         ({ byteOffset: yOffset, byteLength: yLen } = y);
         // continue if the byte ranges of x and y aren't contiguous
diff --git a/js/test/unit/table/serialize-tests.ts 
b/js/test/unit/table/serialize-tests.ts
index f337913..9dce2f5 100644
--- a/js/test/unit/table/serialize-tests.ts
+++ b/js/test/unit/table/serialize-tests.ts
@@ -18,7 +18,7 @@
 import '../../jest-extensions';
 import * as generate from '../../generate-test-data';
 import {
-    Table, Schema, Field, DataType, Dictionary, Int32, Float32, Utf8, Null
+    Table, Schema, Field, DataType, Dictionary, Int32, Float32, Utf8, Null, 
Int32Vector
 } from '../../Arrow';
 
 const toSchema = (...xs: [string, DataType][]) => new Schema(xs.map((x) => new 
Field(...x)));
@@ -34,6 +34,17 @@ function createTable<T extends { [key: string]: DataType } = 
any>(schema: Schema
 
 describe('Table#serialize()', () => {
 
+    test(`doesn't swap the order of buffers that share the same underlying 
ArrayBuffer but are in a different order`, () => {
+        const values = new Int32Array([0, 1, 2, 3, 4, 5, 6, 7]);
+        const expected = values.slice();
+        const x = Int32Vector.from(values.subarray(4, 8)); // back
+        const y = Int32Vector.from(values.subarray(0, 4)); // front
+        const source = Table.new([x, y], ['x', 'y']);
+        const table = Table.from(source.serialize());
+        expect(table.getColumn('x').toArray()).toEqual(expected.subarray(4, 
8));
+        expect(table.getColumn('y').toArray()).toEqual(expected.subarray(0, 
4));
+    });
+
     test(`Table#empty round-trips through serialization`, () => {
         const source = Table.empty();
         source.schema.metadata.set('foo', 'bar');

Reply via email to