This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c1f5c5  PARQUET-1274: Prevent segfault that was occurring when 
writing a nanosecond timestamp with arrow writer properties set to coerce 
timestamps and support deprecated int96 timestamps.
0c1f5c5 is described below

commit 0c1f5c51f0af08509dbc229339310c2915b1df18
Author: Joshua Storck <[email protected]>
AuthorDate: Wed Apr 18 12:55:23 2018 +0200

    PARQUET-1274: Prevent segfault that was occurring when writing a nanosecond 
timestamp with arrow writer properties set to coerce timestamps and support 
deprecated int96 timestamps.
    
    The bug was a due to the fact that the physical type was int64 but the 
WriteTimestamps function was taking a path that assumed the physical type was 
int96. This caused memory corruption because it was writing past the end of the 
array. The bug was fixed by checking that coerce timestamps is disabled when 
writing int96.
    
    A unit test was added for the regression.
    
    Author: Joshua Storck <[email protected]>
    
    Closes #456 from joshuastorck/ARROW_2082 and squashes the following commits:
    
    5fa0a94 [Joshua Storck] Removing 'using ::arrow' in favor of using 
::arrow::SomeType
    9725ecc [Joshua Storck] Bug fix for ARROW-2082, in which a segfault was 
being encountered when writing a nanosecond timestamp column with arrow writer 
properties set to coerce timestamps and support deprecated int96 timestamps. 
The bug was a segfault due to the fact that the physical type was int64 but the 
WriteTimestamps function was taking a path that assumed the physical type was 
int96. The bug was fixed by checking that coerce timestamps is disabled when 
writing int96. A unit test [...]
---
 src/parquet/arrow/arrow-reader-writer-test.cc | 58 +++++++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   |  9 ++++-
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc 
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 93ecd3c..cb38b8f 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1436,6 +1436,64 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) {
   AssertTablesEqual(*ex_table, *result);
 }
 
+// Regression for ARROW-2802
+TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) {
+  using ::arrow::Column;
+  using ::arrow::Field;
+  using ::arrow::Schema;
+  using ::arrow::Table;
+  using ::arrow::TimeUnit;
+  using ::arrow::TimestampType;
+  using ::arrow::TimestampBuilder;
+  using ::arrow::default_memory_pool;
+
+  auto timestamp_type = std::make_shared<TimestampType>(TimeUnit::NANO);
+
+  TimestampBuilder builder(timestamp_type, default_memory_pool());
+  for (std::int64_t ii = 0; ii < 10; ++ii) {
+    ASSERT_OK(builder.Append(1000000000L * ii));
+  }
+  std::shared_ptr<Array> values;
+  ASSERT_OK(builder.Finish(&values));
+
+  std::vector<std::shared_ptr<Field>> fields;
+  auto field = std::make_shared<Field>("nanos", timestamp_type);
+  fields.emplace_back(field);
+
+  auto schema = std::make_shared<Schema>(fields);
+
+  std::vector<std::shared_ptr<Column>> columns;
+  auto column = std::make_shared<Column>("nanos", values);
+  columns.emplace_back(column);
+
+  auto table = Table::Make(schema, columns);
+
+  auto arrow_writer_properties = ArrowWriterProperties::Builder()
+                                     .coerce_timestamps(TimeUnit::MICRO)
+                                     ->enable_deprecated_int96_timestamps()
+                                     ->build();
+
+  std::shared_ptr<Table> result;
+  DoSimpleRoundtrip(table, 1, table->num_rows(), {}, &result, 
arrow_writer_properties);
+
+  ASSERT_EQ(table->num_columns(), result->num_columns());
+  ASSERT_EQ(table->num_rows(), result->num_rows());
+
+  auto actual_column = result->column(0);
+  auto data = actual_column->data();
+  auto expected_values =
+      
static_cast<::arrow::NumericArray<TimestampType>*>(values.get())->raw_values();
+  for (int ii = 0; ii < data->num_chunks(); ++ii) {
+    auto chunk =
+        
static_cast<::arrow::NumericArray<TimestampType>*>(data->chunk(ii).get());
+    auto values = chunk->raw_values();
+    for (int64_t jj = 0; jj < chunk->length(); ++jj, ++expected_values) {
+      // Check that the nanos have been converted to micros
+      ASSERT_EQ(*expected_values / 1000, values[jj]);
+    }
+  }
+}
+
 void MakeDoubleTable(int num_columns, int num_rows, int nchunks,
                      std::shared_ptr<Table>* out) {
   std::shared_ptr<::arrow::Column> column;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 14e9c6e..50b4649 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -602,7 +602,14 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& 
values, int64_t num_level
 
   const bool is_nanosecond = type.unit() == TimeUnit::NANO;
 
-  if (is_nanosecond && 
ctx_->properties->support_deprecated_int96_timestamps()) {
+  // In the case where support_deprecated_int96_timestamps was specified
+  // and coerce_timestamps_enabled was specified, a nanosecond column
+  // will have a physical type of int64. In that case, we fall through
+  // to the else if below.
+  //
+  // See https://issues.apache.org/jira/browse/ARROW-2082
+  if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps() 
&&
+      !ctx_->properties->coerce_timestamps_enabled()) {
     return TypedWriteBatch<Int96Type, ::arrow::TimestampType>(values, 
num_levels,
                                                               def_levels, 
rep_levels);
   } else if (is_nanosecond ||

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to