Search code examples
c++ boost io memory-mapped-files boost-iostreams

Efficient parsing of mmap file


The following code maps the file into memory using Boost.Iostreams:

//! Describe the file to map, then open it as a read-only memory-mapped source
boost::iostreams::mapped_file_params param;
param.path = "\\..\\points.pts";  //! Filepath
boost::iostreams::mapped_file_source file;
file.open(param, fileSize);
if(file.is_open())
{
  //! Hand the mapped bytes to the parser, which fills the ren point buffer.
  //! ParseData takes a non-const char*, hence the const_cast on the mapped data.
  //! NOTE(review): presumably the mapping is not NUL-terminated on its own -
  //! confirm that ParseData does not depend on a terminating '\0'.
  std::vector<RenPoint> readPoints;
  ParseData( const_cast<char*>(file.data()), readPoints);
}

The implementation of ParseData is as follows:

void ParseData ( char* pbuffer , std::vector<RenPoint>>& readPoints)    
{
  if(!pbuffer)
throw std::logic_error("no Data in memory mapped file");

stringstream strBuffer;
strBuffer << pbuffer;

//! Get the max number of points in the pts file
std::string strMaxPts;
std::getline(strBuffer,strMaxPts,'\n');
auto nSize = strMaxPts.size();
unsigned nMaxNumPts = GetValue<unsigned>(strMaxPts);
readPoints.clear();

//! Offset buffer 
pbuffer += nSize;
strBuffer << pbuffer;
std::string cur_line;
while(std::getline(strBuffer, cur_line,'\n'))
{
       //! How do I read the data from mmap file directly and populate my renpoint structure    
           int yy = 0;
}

//! Working but very slow
/*while (std::getline(strBuffer,strMaxPts,'\n'))
{
    std::vector<string> fragments;

    istringstream iss(strMaxPts);

    copy(istream_iterator<string>(iss),
        istream_iterator<string>(),
        back_inserter<vector<string>>(fragments));

    //! Logic to populate the structure after getting data back from fragments
    readPoints.push_back(pt);
}*/
}  

My data set has at least 1 million points, and I want to optimize the parsing. Any ideas?


Solution

    1. read in header information to get the number of points
    2. reserve space in a std::vector for N*num_points (N=3 assuming only X,Y,Z, 6 with normals, 9 with normals and rgb)
    3. load the remainder of the file into a string
    4. boost::spirit::qi::phrase_parse into the vector.

    //code here can parse a file with 40M points (> 1GB) in about 14s on my 2 year old macbook:

    #include <cstddef>
    #include <fstream>
    #include <iostream>
    #include <iterator>
    #include <string>
    #include <vector>
    #include <boost/spirit/include/qi.hpp>
    
    template <typename Iter>
    bool parse_into_vec(Iter p_it, Iter p_end, std::vector<float>& vf) {
        using boost::spirit::qi::phrase_parse;
        using boost::spirit::qi::float_;
        using boost::spirit::qi::ascii::space;
    
        bool ret = phrase_parse(p_it, p_end, *float_, space, vf);
        return p_it != p_end ? false : ret;
    }
    
    // Entry point: reads a point file whose first token is the number of points
    // and whose remainder is whitespace-separated floats, then parses those
    // floats into a vector.
    //
    // args[1] is the path of the file to parse.
    // Returns 0 on success, and -1 when the path argument is missing, the file
    // cannot be opened, the point count cannot be read, or parsing fails.
    int main(int argc, char **args) {
        if(argc < 2) {
            std::cerr << "need a file\n";
            return -1;
        }

        std::ifstream in(args[1]);
        if(!in) {
            std::cerr << "cannot open file: " << args[1] << '\n';
            return -1;
        }

        // Header: the number of points stored in the file.
        std::size_t numPoints = 0;
        if(!(in >> numPoints)) {
            std::cerr << "cannot read the number of points\n";
            return -1;
        }

        // Load the remainder of the file into a string for the parser.
        // The extra parentheses keep the first argument from being read as a
        // function declaration.
        const std::string strver((std::istreambuf_iterator<char>(in)),
                                 std::istreambuf_iterator<char>());

        // Three floats per point (X, Y, Z); this only pre-sizes the capacity,
        // the parser still stores however many floats it actually finds.
        std::vector<float> vf;
        vf.reserve(3 * numPoints);

        if(!parse_into_vec(strver.begin(), strver.end(), vf)) {
            std::cerr << "failed during parsing\n";
            return -1;
        }

        return 0;
    }