Does anyone have any pointers to a reasonably complete example, or representative source code, showing how to actually use parquet-glib (the C bindings for reading and writing Apache Parquet files)? The API reference documentation is good, but it does not really tell me how to build things with it. I'm not sure how well the docs for other languages translate to C; so far, at least, they've been unhelpful.
OK, I managed to hack this together. This is pretty much how it goes:
#include <arrow-glib/arrow-glib.h>
#include <parquet-glib/parquet-glib.h>
int main() {
GError *error = NULL;
g_autoptr(GArrowDataType) int32_data_type = (GArrowDataType *) garrow_int32_data_type_new();
g_autoptr(GArrowDataType) float_data_type = (GArrowDataType *) garrow_float_data_type_new();
g_autoptr(GArrowDataType) string_data_type = (GArrowDataType *) garrow_string_data_type_new();
// Define the schema
g_autoptr(GArrowField) int_field = garrow_field_new("int_column", int32_data_type);
g_autoptr(GArrowField) float_field = garrow_field_new("float_column", float_data_type);
g_autoptr(GArrowField) string_field = garrow_field_new("string_column", string_data_type);
g_autoptr(GList) fields = NULL;
fields = g_list_append(fields, int_field);
fields = g_list_append(fields, float_field);
fields = g_list_append(fields, string_field);
g_autoptr(GArrowSchema) schema = garrow_schema_new(fields);
// Create arrays for each column
g_autoptr(GArrowInt32ArrayBuilder) int_builder = garrow_int32_array_builder_new();
g_autoptr(GArrowFloatArrayBuilder) float_builder = garrow_float_array_builder_new();
g_autoptr(GArrowStringArrayBuilder) string_builder = garrow_string_array_builder_new();
for (int i = 0; i < 10; ++i) {
garrow_int32_array_builder_append_value(int_builder, i, &error);
garrow_float_array_builder_append_value(float_builder, (float)i * 1.1f, &error);
garrow_string_array_builder_append_string(string_builder, g_strdup_printf("string%d", i), &error);
}
g_autoptr(GArrowArray) int_array = garrow_array_builder_finish((GArrowArrayBuilder*) int_builder, &error);
g_autoptr(GArrowArray) float_array = garrow_array_builder_finish((GArrowArrayBuilder*) float_builder, &error);
g_autoptr(GArrowArray) string_array = garrow_array_builder_finish((GArrowArrayBuilder*) string_builder, &error);
// Create a table from the arrays
g_autoptr(GList) columns = NULL;
columns = g_list_append(columns, int_array);
columns = g_list_append(columns, float_array);
columns = g_list_append(columns, string_array);
g_autoptr(GArrowTable) table = garrow_table_new_values(schema, columns, &error);
// Create a file output stream
g_autoptr(GArrowFileOutputStream) output = garrow_file_output_stream_new("output.parquet", FALSE, &error);
if (error != NULL) {
g_print("Error opening Parquet file: %s\n", error->message);
return 1;
}
// Create a Parquet file writer
g_autoptr(GParquetArrowFileWriter) writer = gparquet_arrow_file_writer_new_arrow(schema, GARROW_OUTPUT_STREAM(output), NULL, &error);
if (error != NULL) {
g_print("Error creating Parquet file writer: %s\n", error->message);
return 1;
}
// Write the table to the Parquet file
if (!gparquet_arrow_file_writer_write_table(writer, table, 10000, &error)) {
g_print("Error writing table to Parquet file: %s\n", error->message);
return 1;
}
// Close the writer
if (!gparquet_arrow_file_writer_close(writer, &error)) {
g_print("Error closing Parquet file: %s\n", error->message);
return 1;
}
return 0;
}