Search code examples
pythonc++parquetpyarrowapache-arrow

Reading parquet file is slower in c++ than in python


I have written code to read the same Parquet file in C++ and in Python. Reading the file takes much less time in Python than in C++, even though C++ code is generally expected to run faster than Python. My C++ code is:

#include <arrow/api.h>
#include <parquet/arrow/reader.h>
#include <arrow/filesystem/localfs.h>
#include <chrono>
#include <iostream>

int main(){
   // ...
   arrow::Status st;
   arrow::MemoryPool* pool = arrow::default_memory_pool();
   arrow::fs::LocalFileSystem file_system;
   std::shared_ptr<arrow::io::RandomAccessFile> input = file_system.OpenInputFile("data.parquet").ValueOrDie();

   // Open Parquet file reader
   std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
   st = parquet::arrow::OpenFile(input, pool, &arrow_reader);
   if (!st.ok()) {
      // Handle error instantiating file reader...
   }

   // Read entire file as a single Arrow table
   std::shared_ptr<arrow::Table> table;
   auto t1 = std::chrono::high_resolution_clock::now();
   st = arrow_reader->ReadTable(&table);
   auto t2 = std::chrono::high_resolution_clock::now();
   if (!st.ok()) {
      // Handle error reading Parquet data...
   }
   else{
       auto ms_int = std::chrono::duration_cast<std::chrono::milliseconds> (t2 - t1);
       std::cout << "Time taken to read parquet file is : " << ms_int.count() << "ms\n";
   }
}

The code I used in Python is:

#!/usr/bin/env python3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import time

# Time a single read of the whole Parquet file into an Arrow table.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the read is in progress.
start_time = time.perf_counter()

table = pq.read_table('data.parquet')

end_time = time.perf_counter()

# perf_counter() returns seconds; convert the elapsed time to milliseconds.
print("Time taken to read parquet is : ",(end_time - start_time)*1000, "ms")

Running the C++ code on a file of about 87 MB prints:

Time taken to read parquet file is : 186ms

while the Python code prints:

Time taken to read parquet is : 108.66141319274902 ms

Why is there such a difference in execution time between the C++ ReadTable call and the Python read_table call?


Solution

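  • If you want a like-for-like comparison, try this C++ code, which explicitly enables pre-buffering and multi-threaded reads (neither is requested in the question's C++ code):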

    #include <cassert>
    #include <chrono>
    #include <cstdlib>
    #include <iostream>
    
    using namespace std::chrono;
    
    #include <arrow/api.h>
    #include <arrow/filesystem/api.h>
    #include <parquet/arrow/reader.h>
    
    using arrow::Result;
    using arrow::Status;
    
    namespace {
    
    /// Opens "data.parquet" on the local filesystem and builds a Parquet-to-Arrow
    /// file reader for it.
    ///
    /// The Arrow reader properties start from the library defaults with
    /// pre-buffering and thread use switched on; the Parquet reader properties
    /// are left at their defaults.
    ///
    /// @return the constructed reader, or the first error reported while opening
    ///         the file, opening the builder, or building the reader.
    Result<std::unique_ptr<parquet::arrow::FileReader>> OpenReader() {
      arrow::fs::LocalFileSystem local_fs;
      ARROW_ASSIGN_OR_RAISE(auto parquet_file, local_fs.OpenInputFile("data.parquet"));

      // Arrow-level options: defaults plus pre-buffering and threading.
      auto arrow_props = parquet::default_arrow_reader_properties();
      arrow_props.set_pre_buffer(true);
      arrow_props.set_use_threads(true);

      // Parquet-level options: library defaults, unchanged.
      const auto parquet_props = parquet::default_reader_properties();

      // Configure the builder, attach the input file, then build the reader.
      parquet::arrow::FileReaderBuilder builder;
      builder.properties(arrow_props);
      ARROW_RETURN_NOT_OK(builder.Open(std::move(parquet_file), parquet_props));

      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(builder.Build(&reader));
      return reader;
    }
    
    /// Reads the Parquet file ten times and prints, for each pass, the table's
    /// row and column counts followed by the time spent in ReadTable.
    ///
    /// A new reader is obtained from OpenReader() on every pass; opening the
    /// reader is not included in the measured time.
    ///
    /// @param argc unused
    /// @param argv unused
    /// @return Status::OK() after ten successful passes, otherwise the first
    ///         error returned by OpenReader() or ReadTable().
    Status RunMain(int argc, char **argv) {
      // Read entire file as a single Arrow table
      std::shared_ptr<arrow::Table> table;
      for (auto i = 0; i < 10; i++) {
        ARROW_ASSIGN_OR_RAISE(auto arrow_reader, OpenReader());
        // steady_clock is monotonic, so the interval cannot be distorted by
        // system clock adjustments.
        const auto t1 = std::chrono::steady_clock::now();
        ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table));
        // Stop the clock before printing: writing to and flushing stdout is
        // not part of the read and must not be counted in the reported time.
        const auto t2 = std::chrono::steady_clock::now();
        std::cout << table->num_rows() << "," << table->num_columns() << std::endl;
        const auto ms_int =
            std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
        std::cout << "Time taken to read parquet file is : " << ms_int.count()
                  << "ms\n";
      }

      return Status::OK();
    }
    
    } // namespace
    
    int main(int argc, char **argv) {
      Status st = RunMain(argc, argv);
      if (!st.ok()) {
        std::cerr << st << std::endl;
        return 1;
      }
      return 0;
    }
    

    Then compare with this python code:

    #!/usr/bin/env python3                                                                                                                                                                                     
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq
    import time
    
    # Read the Parquet file ten times, timing only the read() call; opening the
    # ParquetFile is outside the measured interval.
    for i in range(10):
        parquet_file = pq.ParquetFile('/home/pace/experiments/so4/data.parquet', pre_buffer=True)
        # perf_counter() is a monotonic, high-resolution clock intended for
        # interval timing; time.time() is wall-clock time and can jump if the
        # system clock is adjusted.
        start_time = time.perf_counter()
        table = parquet_file.read()
        end_time = time.perf_counter()
        # perf_counter() returns seconds; convert the elapsed time to milliseconds.
        print("Time taken to read parquet is : ",(end_time - start_time)*1000, "ms")
    

    On my system after 10 runs a t-test fails to distinguish the two distributions (p=0.64).