This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 144699c4210ac0b75d16395a583671b2874acc9e
Author: Wes McKinney <wes.mckin...@twosigma.com>
AuthorDate: Wed Aug 30 09:49:14 2017 +0200

    PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing

    See ARROW-1377

    Author: Wes McKinney <wes.mckin...@twosigma.com>

    Closes #385 from wesm/PARQUET-1083 and squashes the following commits:

    359cd09 [Wes McKinney] Factor main logic in parquet-scan.cc into a library function, so that library users can use for performance testing

    Change-Id: Ia50d136c380c4d42d6c62577e02a9533df6fa6fe
---
 cpp/tools/parquet/parquet-scan.cc | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index 5bf2b18..fdc73d7 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -57,50 +57,16 @@ int main(int argc, char** argv) {
     }
   }
 
-  std::vector<int16_t> rep_levels(batch_size);
-  std::vector<int16_t> def_levels(batch_size);
   try {
     double total_time;
     std::clock_t start_time = std::clock();
     std::unique_ptr<parquet::ParquetFileReader> reader =
         parquet::ParquetFileReader::OpenFile(filename);
 
-    // columns are not specified explicitly. Add all columns
-    if (num_columns == 0) {
-      num_columns = reader->metadata()->num_columns();
-      columns.resize(num_columns);
-      for (int i = 0; i < num_columns; i++) {
-        columns[i] = i;
-      }
-    }
-
-    std::vector<int64_t> total_rows(num_columns);
-
-    for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
-      auto group_reader = reader->RowGroup(r);
-      int col = 0;
-      for (auto i : columns) {
-        total_rows[col] = 0;
-        std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
-        size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
-        std::vector<uint8_t> values(batch_size * value_byte_size);
-        int64_t values_read = 0;
-        while (col_reader->HasNext()) {
-          total_rows[col] +=
-              ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
-                            values.data(), &values_read, col_reader.get());
-        }
-        col++;
-      }
-    }
+    int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
 
     total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
-    for (int ct = 1; ct < num_columns; ++ct) {
-      if (total_rows[0] != total_rows[ct]) {
-        std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
-      }
-    }
-    std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+    std::cout << total_rows << " rows scanned in " << total_time << " seconds."
               << std::endl;
   } catch (const std::exception& e) {
     std::cerr << "Parquet error: " << e.what() << std::endl;