Thanks for the quick response. When I use pyspark to read a parquet file 
written by arrow, I can't see even file-level metadata. Is that also a known 
issue? (Note: I searched the JIRA issues and couldn't find any info.)


Sent with ProtonMail Secure Email.

‐‐‐‐‐‐‐ Original Message ‐‐‐‐‐‐‐
On Thursday, October 10, 2019 12:44 PM, Wes McKinney <wesmck...@gmail.com> 
wrote:

> We haven't implemented storing field-level metadata in Parquet files
> yet. It's somewhat tricky. See
> https://issues.apache.org/jira/browse/ARROW-4359
>
> On Thu, Oct 10, 2019 at 11:51 AM Isaac Myers
> isaacmy...@protonmail.com.invalid wrote:
>
> > I can write both field- and schema-level metadata and read the values back 
> > from schema or relevant field. I write the schema and table described by 
> > the schema to a local parquet file. Upon reading the table or schema from 
> > the parquet file, only schema metadata are present and field metadata are 
> > not present. Am I doing something wrong? Please view the minimum working 
> > example below:
> > <code>
> > #include <vector>
> > #include <cstdint>
> > #include <map>
> > #include <arrow/api.h>
> > #include <arrow/io/api.h>
> > #include <parquet/arrow/reader.h>
> > #include <parquet/arrow/writer.h>
> > #include <parquet/arrow/schema.h>
> > //#include <arrow/>
> > int main(int argc, char* argv[])
> > {
> > /*********************************
> > Create Parquet File
> > *********************************/
> > arrow::Status st;
> > arrow::MemoryPool* pool = arrow::default_memory_pool();
> > // Create Schema and fields with metadata
> > std::vector<std::shared_ptr<arrow::Field>> fields;
> > std::unordered_map<std::string, std::string> a_keyval;
> > a_keyval["unit"] = "sec";
> > a_keyval["note"] = "not the standard millisecond unit";
> > arrow::KeyValueMetadata a_md(a_keyval);
> > std::shared_ptr<arrow::Field> a_field = arrow::field("a", arrow::int16(), 
> > false, a_md.Copy());
> > fields.push_back(a_field);
> > std::unordered_map<std::string, std::string> b_keyval;
> > b_keyval["unit"] = "ft";
> > arrow::KeyValueMetadata b_md(b_keyval);
> > std::shared_ptr<arrow::Field> b_field = arrow::field("b", arrow::int16(), 
> > false, b_md.Copy());
> > fields.push_back(b_field);
> > std::shared_ptr<arrow::Schema> schema = arrow::schema(fields);
> > // Add metadata to schema.
> > std::unordered_map<std::string, std::string> schema_keyval;
> > schema_keyval["classification"] = "Type 0";
> > arrow::KeyValueMetadata schema_md(schema_keyval);
> > schema = schema->AddMetadata(schema_md.Copy());
> > // Build arrays of data and add to Table.
> > const int64_t rowgroup_size = 100;
> > std::vector<int16_t> a_data(rowgroup_size, 0);
> > std::vector<int16_t> b_data(rowgroup_size, 0);
> > for (int16_t i = 0; i < rowgroup_size; i++)
> > {
> > a_data[i] = i;
> > b_data[i] = rowgroup_size - i;
> > }
> > arrow::Int16Builder a_bldr(pool);
> > arrow::Int16Builder b_bldr(pool);
> > st = a_bldr.Resize(rowgroup_size);
> > if (!st.ok()) return 1;
> > st = b_bldr.Resize(rowgroup_size);
> > if (!st.ok()) return 1;
> > st = a_bldr.AppendValues(a_data);
> > if (!st.ok()) return 1;
> > st = b_bldr.AppendValues(b_data);
> > if (!st.ok()) return 1;
> > std::shared_ptr<arrow::Array> a_arr_ptr;
> > std::shared_ptr<arrow::Array> b_arr_ptr;
> > arrow::ArrayVector arr_vec;
> > st = a_bldr.Finish(&a_arr_ptr);
> > if (!st.ok()) return 1;
> > arr_vec.push_back(a_arr_ptr);
> > st = b_bldr.Finish(&b_arr_ptr);
> > if (!st.ok()) return 1;
> > arr_vec.push_back(b_arr_ptr);
> > std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arr_vec);
> > // Test metadata
> > printf("\nMetadata from original schema:\n");
> > printf("%s\n", schema->metadata()->ToString().c_str());
> > printf("%s\n", schema->field(0)->metadata()->ToString().c_str());
> > printf("%s\n", schema->field(1)->metadata()->ToString().c_str());
> > std::shared_ptr<arrow::Schema> table_schema = table->schema();
> > printf("\nMetadata from schema retrieved from table (should be the 
> > same):\n");
> > printf("%s\n", table_schema->metadata()->ToString().c_str());
> > printf("%s\n", table_schema->field(0)->metadata()->ToString().c_str());
> > printf("%s\n", table_schema->field(1)->metadata()->ToString().c_str());
> > // Open file and write table.
> > std::string file_name = "test.parquet";
> > std::shared_ptr<arrow::io::FileOutputStream> ostream;
> > st = arrow::io::FileOutputStream::Open(file_name, &ostream);
> > if (!st.ok()) return 1;
> > std::unique_ptr<parquet::arrow::FileWriter> writer;
> > std::shared_ptr<parquet::WriterProperties> props = 
> > parquet::default_writer_properties();
> > st = parquet::arrow::FileWriter::Open(*schema, pool, ostream, props, 
> > &writer);
> > if (!st.ok()) return 1;
> > st = writer->WriteTable(*table, rowgroup_size);
> > if (!st.ok()) return 1;
> > // Close file and stream.
> > st = writer->Close();
> > if (!st.ok()) return 1;
> > st = ostream->Close();
> > if (!st.ok()) return 1;
> > /*********************************
> > Read Parquet File
> > **********************************/
> > // Create new memory pool. Not sure if this is necessary.
> > //arrow::MemoryPool* pool2 = arrow::default_memory_pool();
> > // Open file reader.
> > std::shared_ptr<arrow::io::ReadableFile> input_file;
> > st = arrow::io::ReadableFile::Open(file_name, pool, &input_file);
> > if (!st.ok()) return 1;
> > std::unique_ptr<parquet::arrow::FileReader> reader;
> > st = parquet::arrow::OpenFile(input_file, pool, &reader);
> > if (!st.ok()) return 1;
> > // Get schema and read metadata.
> > std::shared_ptr<arrow::Schema> new_schema;
> > st = reader->GetSchema(&new_schema);
> > if (!st.ok()) return 1;
> > printf("\nMetadata from schema read from file:\n");
> > printf("%s\n", new_schema->metadata()->ToString().c_str());
> > // Crashes because there are no metadata.
> > /*printf("%s\n", new_schema->field(0)->metadata()->ToString().c_str());
> > printf("%s\n", 
> > new_schema->field(1)->metadata()->ToString().c_str());*/
> > printf("field name %s metadata exists: %d\n", 
> > new_schema->field(0)->name().c_str(),
> > new_schema->field(0)->HasMetadata());
> > printf("field name %s metadata exists: %d\n", 
> > new_schema->field(1)->name().c_str(),
> > new_schema->field(1)->HasMetadata());
> > // What if I read the whole table and get the schema from it.
> > std::shared_ptr<arrow::Table> new_table;
> > st = reader->ReadTable(&new_table);
> > if (!st.ok()) return 1;
> > std::shared_ptr<arrow::Schema> schema_from_table = new_table->schema();
> > printf("\nMetadata from schema that is retrieved through table that is read 
> > from file:\n");
> > printf("%s\n", schema_from_table->metadata()->ToString().c_str());
> > // Crashes because there are no metadata.
> > /*printf("%s\n", 
> > schema_from_table->field(0)->metadata()->ToString().c_str());
> > printf("%s\n", 
> > schema_from_table->field(1)->metadata()->ToString().c_str());*/
> > printf("field name %s metadata exists: %d\n", 
> > schema_from_table->field(0)->name().c_str(),
> > schema_from_table->field(0)->HasMetadata());
> > printf("field name %s metadata exists: %d\n", 
> > schema_from_table->field(1)->name().c_str(),
> > schema_from_table->field(1)->HasMetadata());
> > st = input_file->Close();
> > if (!st.ok()) return 1;
> > return 0;
> > }
> > </code>
> > Sent with ProtonMail Secure Email.


Reply via email to