Search code examples
cparquet

Writing Apache Parquet files using Parquet-GLib


Does anyone have pointers to a reasonably complete example, or representative source code, showing how to actually use parquet-glib (the C bindings for reading and writing Apache Parquet files)? The API reference documentation is good, but it does not really tell me how to build things with it. I'm not sure how well the docs for other languages translate to C; so far, at least, they've been unhelpful.


Solution

  • OK, I managed to hack this together; this is pretty much how it goes:

    #include <arrow-glib/arrow-glib.h>
    #include <parquet-glib/parquet-glib.h>
    
    int main() {
        GError *error = NULL;
    
        g_autoptr(GArrowDataType) int32_data_type = (GArrowDataType *) garrow_int32_data_type_new();
        g_autoptr(GArrowDataType) float_data_type = (GArrowDataType *) garrow_float_data_type_new();
        g_autoptr(GArrowDataType) string_data_type = (GArrowDataType *) garrow_string_data_type_new();
    
        // Define the schema
        g_autoptr(GArrowField) int_field = garrow_field_new("int_column", int32_data_type);
        g_autoptr(GArrowField) float_field = garrow_field_new("float_column", float_data_type);
        g_autoptr(GArrowField) string_field = garrow_field_new("string_column", string_data_type);
    
        g_autoptr(GList) fields = NULL;
        fields = g_list_append(fields, int_field);
        fields = g_list_append(fields, float_field);
        fields = g_list_append(fields, string_field);
    
        g_autoptr(GArrowSchema) schema = garrow_schema_new(fields);
    
        // Create arrays for each column
        g_autoptr(GArrowInt32ArrayBuilder) int_builder = garrow_int32_array_builder_new();
        g_autoptr(GArrowFloatArrayBuilder) float_builder = garrow_float_array_builder_new();
        g_autoptr(GArrowStringArrayBuilder) string_builder = garrow_string_array_builder_new();
    
        for (int i = 0; i < 10; ++i) {
            garrow_int32_array_builder_append_value(int_builder, i, &error);
            garrow_float_array_builder_append_value(float_builder, (float)i * 1.1f, &error);
            garrow_string_array_builder_append_string(string_builder, g_strdup_printf("string%d", i), &error);
        }
    
        g_autoptr(GArrowArray) int_array = garrow_array_builder_finish((GArrowArrayBuilder*) int_builder, &error);
        g_autoptr(GArrowArray) float_array = garrow_array_builder_finish((GArrowArrayBuilder*) float_builder, &error);
        g_autoptr(GArrowArray) string_array = garrow_array_builder_finish((GArrowArrayBuilder*) string_builder, &error);
    
        // Create a table from the arrays
        g_autoptr(GList) columns = NULL;
        columns = g_list_append(columns, int_array);
        columns = g_list_append(columns, float_array);
        columns = g_list_append(columns, string_array);
    
        g_autoptr(GArrowTable) table = garrow_table_new_values(schema, columns, &error);
    
        // Create a file output stream
        g_autoptr(GArrowFileOutputStream) output = garrow_file_output_stream_new("output.parquet", FALSE, &error);
        if (error != NULL) {
            g_print("Error opening Parquet file: %s\n", error->message);
            return 1;
        }
    
        // Create a Parquet file writer
        g_autoptr(GParquetArrowFileWriter) writer = gparquet_arrow_file_writer_new_arrow(schema, GARROW_OUTPUT_STREAM(output), NULL, &error);
        if (error != NULL) {
            g_print("Error creating Parquet file writer: %s\n", error->message);
            return 1;
        }
    
        // Write the table to the Parquet file
        if (!gparquet_arrow_file_writer_write_table(writer, table, 10000, &error)) {
            g_print("Error writing table to Parquet file: %s\n", error->message);
            return 1;
        }
    
        // Close the writer
        if (!gparquet_arrow_file_writer_close(writer, &error)) {
            g_print("Error closing Parquet file: %s\n", error->message);
            return 1;
        }
    
        return 0;
    }