This is an automated email from the ASF dual-hosted git repository. ptaylor pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new b796b57 ARROW-3336: [JS] Fix IPC writer serializing sliced arrays b796b57 is described below commit b796b579b8aba472c6690a49f95e8e174d01a4bb Author: ptaylor <paul.e.tay...@me.com> AuthorDate: Thu Sep 27 11:08:39 2018 -0700 ARROW-3336: [JS] Fix IPC writer serializing sliced arrays Fixes https://issues.apache.org/jira/browse/ARROW-3336. Realized too late I branched from the https://github.com/apache/arrow/pull/2616, so that's why there's some extra commits at the front. Check out the relevant tests here: https://github.com/trxcllnt/arrow/blob/860f61046fca0081dbe0ce986a97408ed5934e22/js/test/unit/writer-tests.ts#L26. This test covers the primitive, nested, and dictionary vectors, but it'd be worth adding more (for example, Unions). Author: ptaylor <paul.e.tay...@me.com> Closes #2638 from trxcllnt/js-fix-writing-sliced-arrays and squashes the following commits: bcd58caa <ptaylor> fix writer-tests.ts import fa4d8f54 <ptaylor> only use vector offset to serialize bitmaps and data buffers of variable-width vectors, as all other buffers have already been adjusted as part of the original call to Vector#slice() 5ece0806 <ptaylor> reset internal children Array in case the childData for each has been sliced e093f7c0 <ptaylor> add test validating writer serializes sliced arrays correctly f9dd91b0 <ptaylor> move vector comparison extension into separate module 380d6555 <ptaylor> fix table constructor args --- js/src/ipc/writer/binary.ts | 12 +++--- js/src/table.ts | 21 ++++++---- js/src/vector/nested.ts | 2 +- js/test/integration/validate-tests.ts | 58 +-------------------------- js/test/jest-extensions.ts | 74 +++++++++++++++++++++++++++++++++++ js/test/unit/table-tests.ts | 2 +- js/test/unit/writer-tests.ts | 62 +++++++++++++++++++++++++++++ 7 files changed, 159 insertions(+), 72 deletions(-) diff --git a/js/src/ipc/writer/binary.ts b/js/src/ipc/writer/binary.ts index 4be3dc7..015e747 100644 --- a/js/src/ipc/writer/binary.ts +++ b/js/src/ipc/writer/binary.ts @@ -215,7 +215,7 @@ export class RecordBatchSerializer extends VectorVisitor { // Set all to -1 to indicate that we haven't observed a first occurrence of a particular child yet const childOffsets = new Int32Array(maxChildTypeId + 1).fill(-1); const shiftedOffsets = new Int32Array(length); - const unshiftedOffsets = this.getZeroBasedValueOffsets(sliceOffset, length, valueOffsets); + const unshiftedOffsets = this.getZeroBasedValueOffsets(0, length, valueOffsets); for (let typeId, shift, index = -1; ++index < length;) { typeId = typeIds[index]; // ~(-1) used to be faster than x === -1, so maybe worth benchmarking the difference of these two impls for large dense unions: @@ -257,9 +257,9 @@ export class RecordBatchSerializer extends VectorVisitor { } protected visitFlatVector<T extends FlatType>(vector: Vector<T>) { const { view, data } = vector; - const { offset, length, values } = data; + const { length, values } = data; const scaledLength = length * ((view as any).size || 1); - return this.addBuffer(values.subarray(offset, scaledLength)); + return this.addBuffer(values.subarray(0, scaledLength)); } protected visitFlatListVector<T extends FlatListType>(vector: Vector<T>) { const { data, length } = vector; @@ -269,17 +269,17 @@ export class RecordBatchSerializer extends VectorVisitor { const byteLength = Math.min(lastOffset - firstOffset, values.byteLength - firstOffset); // Push in the order FlatList types read their buffers // valueOffsets buffer first - this.addBuffer(this.getZeroBasedValueOffsets(offset, length, valueOffsets)); + this.addBuffer(this.getZeroBasedValueOffsets(0, length, valueOffsets)); // sliced values buffer second this.addBuffer(values.subarray(firstOffset + offset, firstOffset + offset + byteLength)); return this; } protected visitListVector<T extends SingleNestedType>(vector: Vector<T>) { const { data, length } = vector; - const { offset, valueOffsets } = <any> data; + const { valueOffsets } = <any> data; // If we have valueOffsets (ListVector), push that buffer first if (valueOffsets) { - this.addBuffer(this.getZeroBasedValueOffsets(offset, length, valueOffsets)); + this.addBuffer(this.getZeroBasedValueOffsets(0, length, valueOffsets)); } // Then insert the List's values child return this.visit((vector as any as ListVector<T>).getChildAt(0)!); diff --git a/js/src/table.ts b/js/src/table.ts index e3be9bb..e09e206 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -93,16 +93,21 @@ export class Table implements DataFrame { constructor(schema: Schema, batches: RecordBatch[]); constructor(schema: Schema, ...batches: RecordBatch[]); constructor(...args: any[]) { - let schema: Schema; - let batches: RecordBatch[]; + + let schema: Schema = null!; + if (args[0] instanceof Schema) { - schema = args[0]; - batches = Array.isArray(args[1][0]) ? args[1][0] : args[1]; - } else if (args[0] instanceof RecordBatch) { - schema = (batches = args)[0].schema; - } else { - schema = (batches = args[0])[0].schema; + schema = args.shift(); + } + + let batches = args.reduce(function flatten(xs: any[], x: any): any[] { + return Array.isArray(x) ? x.reduce(flatten, xs) : [...xs, x]; + }, []).filter((x: any): x is RecordBatch => x instanceof RecordBatch); + + if (!schema && !(schema = batches[0] && batches[0].schema)) { + throw new TypeError('Table must be initialized with a Schema or at least one RecordBatch with a Schema'); } + this.schema = schema; this.batches = batches; this.batchesUnion = batches.length == 0 ? diff --git a/js/src/vector/nested.ts b/js/src/vector/nested.ts index 6d1bd4a..31bbee4 100644 --- a/js/src/vector/nested.ts +++ b/js/src/vector/nested.ts @@ -33,7 +33,7 @@ export abstract class NestedView<T extends NestedType> implements View<T> { this._children = children || new Array(this.numChildren); } public clone(data: Data<T>): this { - return new (<any> this.constructor)(data, this._children) as this; + return new (<any> this.constructor)(data, new Array(this.numChildren)) as this; } public isValid(): boolean { return true; diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts index 5d0d3ff..0f1ebcc 100644 --- a/js/test/integration/validate-tests.ts +++ b/js/test/integration/validate-tests.ts @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +import '../jest-extensions'; + import * as fs from 'fs'; import * as path from 'path'; @@ -59,62 +61,6 @@ const jsonAndArrowPaths = toArray(zip( )) .filter(([p1, p2]) => p1 !== undefined && p2 !== undefined) as [string, string][]; -expect.extend({ - toEqualVector([v1, format1, columnName]: [any, string, string], [v2, format2]: [any, string]) { - - const format = (x: any, y: any, msg= ' ') => `${ - this.utils.printExpected(x)}${ - msg}${ - this.utils.printReceived(y) - }`; - - let getFailures = new Array<string>(); - let propsFailures = new Array<string>(); - let iteratorFailures = new Array<string>(); - let allFailures = [ - { title: 'get', failures: getFailures }, - { title: 'props', failures: propsFailures }, - { title: 'iterator', failures: iteratorFailures } - ]; - - let props = [ - // 'name', 'nullable', 'metadata', - 'type', 'length', 'nullCount' - ]; - - for (let i = -1, n = props.length; ++i < n;) { - const prop = props[i]; - if (`${v1[prop]}` !== `${v2[prop]}`) { - propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`); - } - } - - for (let i = -1, n = v1.length; ++i < n;) { - let x1 = v1.get(i), x2 = v2.get(i); - if (this.utils.stringify(x1) !== this.utils.stringify(x2)) { - getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`); - } - } - - let i = -1; - for (let [x1, x2] of zip(v1, v2)) { - ++i; - if (this.utils.stringify(x1) !== this.utils.stringify(x2)) { - iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`); - } - } - - return { - pass: allFailures.every(({ failures }) => failures.length === 0), - message: () => [ - `${columnName}: (${format(format1, format2, ' !== ')})\n`, - ...allFailures.map(({ failures, title }) => - !failures.length ? `` : [`${title}:`, ...failures].join(`\n`)) - ].join('\n') - }; - } -}); - describe(`Integration`, () => { for (const [jsonFilePath, arrowFilePath] of jsonAndArrowPaths) { let { name, dir } = path.parse(arrowFilePath); diff --git a/js/test/jest-extensions.ts b/js/test/jest-extensions.ts new file mode 100644 index 0000000..f45b70c --- /dev/null +++ b/js/test/jest-extensions.ts @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { zip } from 'ix/iterable/zip'; + +expect.extend({ + toEqualVector([v1, format1, columnName]: [any, string, string], [v2, format2]: [any, string]) { + + const format = (x: any, y: any, msg= ' ') => `${ + this.utils.printExpected(x)}${ + msg}${ + this.utils.printReceived(y) + }`; + + let getFailures = new Array<string>(); + let propsFailures = new Array<string>(); + let iteratorFailures = new Array<string>(); + let allFailures = [ + { title: 'get', failures: getFailures }, + { title: 'props', failures: propsFailures }, + { title: 'iterator', failures: iteratorFailures } + ]; + + let props = [ + // 'name', 'nullable', 'metadata', + 'type', 'length', 'nullCount' + ]; + + for (let i = -1, n = props.length; ++i < n;) { + const prop = props[i]; + if (`${v1[prop]}` !== `${v2[prop]}`) { + propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`); + } + } + + for (let i = -1, n = v1.length; ++i < n;) { + let x1 = v1.get(i), x2 = v2.get(i); + if (this.utils.stringify(x1) !== this.utils.stringify(x2)) { + getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`); + } + } + + let i = -1; + for (let [x1, x2] of zip(v1, v2)) { + ++i; + if (this.utils.stringify(x1) !== this.utils.stringify(x2)) { + iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`); + } + } + + return { + pass: allFailures.every(({ failures }) => failures.length === 0), + message: () => [ + `${columnName}: (${format(format1, format2, ' !== ')})\n`, + ...allFailures.map(({ failures, title }) => + !failures.length ? `` : [`${title}:`, ...failures].join(`\n`)) + ].join('\n') + }; + } +}); diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 98e3eb2..6ec5b74 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -321,7 +321,7 @@ function leftPad(str: string, fill: string, n: number) { return (new Array(n + 1).join(fill) + str).slice(-1 * n); } -function getSingleRecordBatchTable() { +export function getSingleRecordBatchTable() { return Table.from({ 'schema': { 'fields': [ diff --git a/js/test/unit/writer-tests.ts b/js/test/unit/writer-tests.ts new file mode 100644 index 0000000..7bd63fc --- /dev/null +++ b/js/test/unit/writer-tests.ts @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import '../jest-extensions'; + +import Arrow from '../Arrow'; +import { getSingleRecordBatchTable } from './table-tests'; +const { Table, RecordBatch } = Arrow; + +describe('Table.serialize()', () => { + test(`Serializes sliced RecordBatches`, () => { + + const table = getSingleRecordBatchTable(); + const batch = table.batches[0], half = batch.length / 2 | 0; + + // First compare what happens when slicing from the batch level + let [batch1, batch2] = [batch.slice(0, half), batch.slice(half)]; + + compareBatchAndTable(table, 0, batch1, Table.from(new Table(batch1).serialize())); + compareBatchAndTable(table, half, batch2, Table.from(new Table(batch2).serialize())); + + // Then compare what happens when creating a RecordBatch by slicing each child individually + batch1 = new RecordBatch(batch1.schema, batch1.length, batch1.schema.fields.map((_, i) => { + return batch.getChildAt(i)!.slice(0, half); + })); + + batch2 = new RecordBatch(batch2.schema, batch2.length, batch2.schema.fields.map((_, i) => { + return batch.getChildAt(i)!.slice(half); + })); + + compareBatchAndTable(table, 0, batch1, Table.from(new Table(batch1).serialize())); + compareBatchAndTable(table, half, batch2, Table.from(new Table(batch2).serialize())); + }); +}); + +function compareBatchAndTable(source: Table, offset: number, batch: RecordBatch, table: Table) { + expect(batch.length).toEqual(table.length); + expect(table.numCols).toEqual(source.numCols); + expect(batch.numCols).toEqual(source.numCols); + for (let i = -1, n = source.numCols; ++i < n;) { + const v0 = source.getColumnAt(i)!.slice(offset, offset + batch.length); + const v1 = batch.getChildAt(i); + const v2 = table.getColumnAt(i); + const name = source.schema.fields[i].name; + (expect([v1, `batch`, name]) as any).toEqualVector([v0, `source`]); + (expect([v2, `table`, name]) as any).toEqualVector([v0, `source`]); + } +}