I have two 2-D arrays of variable size:
// Initialize 2-D array of x and y values
// Both containers hold `rows` inner vectors of `cols` doubles, every element starting at 0.0.
// `rows` and `cols` are defined outside this snippet.
std::vector<std::vector<double>> x(rows, std::vector<double>(cols, 0.0));
std::vector<std::vector<double>> y(rows, std::vector<double>(cols, 0.0));
}
How do I write these values to a parquet file with the following format?
| | x0 | y0 | ... | xn | yn |
|---|---|---|---|---|---|
| 0 | x[0][0] | y[0][0] | ... | x[n][0] | y[n][0] |
| ... | ... | ... | ... | ... | ... |
| N | x[0][N] | y[0][N] | ... | x[n][N] | y[n][N] |
where n = rows - 1 and N = cols - 1.
Here is my attempt so far:
// Create schema for output file
// Each iteration declares two float64 fields ("x_<i>" and "y_<i>"), so the
// schema ends up with 2 * rows fields.
arrow::FieldVector fields;
for(int i = 0; i < rows; i++) {
fields.push_back(arrow::field("x_" + std::to_string(i), arrow::float64()));
fields.push_back(arrow::field("y_" + std::to_string(i), arrow::float64()));
}
std::shared_ptr<arrow::Schema> schema = arrow::schema(fields);
// Store data into arrow::Table for output
// NOTE(review): only one array is built per i (from x; y is never appended),
// which yields `rows` arrays for a schema that declares 2 * rows fields.
arrow::ArrayVector array_vector;
for(int i = 0; i < rows; i++) {
// NOTE(review): FloatBuilder produces float32 arrays while the schema above
// declares float64 fields -- confirm which type is intended.
arrow::FloatBuilder fbuilder;
std::shared_ptr<arrow::Array> data_array;
for(int j = 0; j < cols; j++) {
// The value returned by Append is discarded here.
fbuilder.Append(x[i][j]);
}
// The value returned by Finish is discarded here as well.
fbuilder.Finish(&data_array);
array_vector.push_back(data_array);
}
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, array_vector);
// Write table to output file
std::shared_ptr<arrow::io::FileOutputStream> outfile;
// NOTE(review): &outfile is passed as a second argument to Open; verify this
// against the Open overloads of the Arrow version in use (it may return a
// Result rather than fill an out parameter).
PARQUET_THROW_NOT_OK(arrow::io::FileOutputStream::Open("test.parquet", &outfile));
// The stream is never explicitly closed in this snippet.
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, 3));
The resulting "test.parquet" contains nothing and throws the following error when trying to read it using python:
pyarrow.lib.ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet file size is 0 bytes
I get a number of errors and compiler warnings when trying to run your code. Make sure you are paying attention to those. Also, `PARQUET_THROW_NOT_OK` should be throwing an exception with what you have.
i < rows
? for(int i = 0; i < rows; i++) {
fields.push_back(arrow::field("x_" + std::to_string(i), arrow::float64()));
fields.push_back(arrow::field("y_" + std::to_string(i), arrow::float64()));
}
You are ignoring potentially bad status returns from the array builder methods `Append` and `Finish`.
You are defining `2*rows` columns (one set of `x_` and one of `y_`), but you are only creating `rows` arrays.
When building your schema you use `float64`. However, the builder type you are using is `FloatBuilder`, which is `float32`. Use `DoubleBuilder` if you want `float64`.
You are passing `&outfile` as the second parameter of `arrow::io::FileOutputStream::Open`, but `Open` returns a result and does not take an out parameter. Instead, that pointer is being implicitly converted to a `bool`.
Here is a roughly similar version that does create a parquet file successfully:
#include <arrow/io/api.h>
#include <arrow/array/builder_primitive.h>
#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/table.h>
#include <parquet/arrow/writer.h>
#include <iostream>
using arrow::Status;
namespace
{
    // Shape of the demo table: kRows columns named "x_0" .. "x_<kRows-1>",
    // each holding kCols float32 values.
    constexpr int kRows = 100;
    constexpr int kCols = 100;

    /// @brief Builds an in-memory Arrow table of float32 columns and writes it
    ///        as a Parquet file to the relative path "test.parquet".
    ///
    /// Every Arrow call that can fail is checked; the first non-OK Status is
    /// returned to the caller instead of being ignored.
    ///
    /// @return Status::OK() on success, otherwise the Status of the first
    ///         failing builder, file, or writer operation.
    Status RunMain()
    {
        // Create schema for output file: one float32 field per column.
        // The "y_<i>" columns are intentionally left out of this example;
        // adding them would require one matching array per extra field.
        arrow::FieldVector fields;
        fields.reserve(kRows);
        for (int i = 0; i < kRows; i++)
        {
            fields.push_back(arrow::field("x_" + std::to_string(i), arrow::float32()));
        }
        std::shared_ptr<arrow::Schema> schema = arrow::schema(fields);

        // Store data into arrow::Table for output: exactly one array per
        // schema field, built with a builder whose type matches the field.
        arrow::ArrayVector array_vector;
        array_vector.reserve(kRows);
        for (int i = 0; i < kRows; i++)
        {
            arrow::FloatBuilder fbuilder;
            // Allocate the column's capacity once instead of growing per Append.
            ARROW_RETURN_NOT_OK(fbuilder.Reserve(kCols));
            for (int j = 0; j < kCols; j++)
            {
                // Each column holds kCols values, so kCols is the stride that
                // keeps the sample values unique across columns.
                ARROW_RETURN_NOT_OK(fbuilder.Append(static_cast<float>(i * kCols + j)));
            }
            std::shared_ptr<arrow::Array> data_array;
            ARROW_RETURN_NOT_OK(fbuilder.Finish(&data_array));
            array_vector.push_back(data_array);
        }
        std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, array_vector);

        // Write table to output file. Open returns a Result, which
        // ARROW_ASSIGN_OR_RAISE unwraps or propagates as an error Status.
        std::shared_ptr<arrow::io::FileOutputStream> outfile;
        ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test.parquet"));
        // The last argument is the chunk size handed to the Parquet writer.
        ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, 3));
        // Close explicitly so that a failure while flushing is reported.
        return outfile->Close();
    }
} // namespace
int main()
{
Status st = RunMain();
if (!st.ok())
{
std::cerr << st << std::endl;
return 1;
}
return 0;
}