Search code examples
c++ifstreamofstream

Reading and Writing Huge Files C++(Memory Overload)


I have a very huge file(55 gigabytes of json data). I am using ifstream to read and another ofstream to write to another file. The program runs correctly for some time. And then crashes due to heavy memory usage.

I tried using ignore and clear to clear the input buffer. And also tried using flush to clear the output buffer.

Also the file is very huge, So, I want it to be fast.

p.s. I wrote the json parser while half asleep. So please pardon my bad parser code. And maybe the memory leak is present there. Any help will be appreciated.

Small Example

int main() 
{ 
    std::ifstream file("aggressive_dedup.json", std::ifstream::in);
    std::ofstream outFile("processed.json", std::ofstream::out);
    std::string str;
    int a; 
    long long count = 0;


    while (std::getline(file, str))
    {

        JsonParserStateMachine jsonParserStateMachine;
        for(char &c : str) jsonParserStateMachine.changeState(c);
        //std::cout<<jsonParserStateMachine.getReview();
        //This lines just gives a string to the output which is around may be 1000 characters 
        outFile << jsonParserStateMachine.getReview(); 
        if(++count % 1000 == 0) {
            std::cout<<count<<" Processed\n";
            outFile.flush();
            return 0;
        }
    }
    outFile.close();
    return 0;
}

For those who are willing to see the whole code

#include <fstream>
#include <string>
#include <iostream>

enum state {
    q0, q1, q2, q3, q4, q5, q6, h
};

class KeyValueStore{
    std::string *keys;
    std::string *values;
    int currentKeyPosition;
    int currentValuePosition;
    int maxLength;
public:
    KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
    {
        this->keys = new std::string[length];
        this->values = new std::string[length];

        for(int i=0;i<length;i++)
        {
            this->keys[i] = "";
            this->values[i] = "";
        }

    }

    void updateKeyPosition()
    {
        this->currentKeyPosition = this->currentKeyPosition++%9;
    }
    void updateValuePosition()
    {
        this->currentValuePosition = this->currentValuePosition++%9;
    }

    void putKey(char c)
    {
        this->keys[currentKeyPosition] += c;
    }
    void putValue(char c)
    {
        this->values[currentValuePosition] += c;
    }


    std::string getValue(std::string key)
    {
        for(int i=0;i<this->maxLength;i++)
        {
            if(this->keys[i] == key) return this->values[i];
        }
        return "";
    }

    void print()
    {
        std::cout<<"Keys"<<"\t"<<"Values"<<std::endl;
        for(int i=0;i<maxLength;i++)
        {
            std::cout<<this->keys[i] <<'\t'<<this->values[i]<<std::endl;
        }
    }


    std::string getReview()
    {
        return std::string("{\"" +  this->getValue("reviewText") + "\":\"" + this->getValue("overall") + "\"}"); 
    }
};



class JsonParserStateMachine{
    state currentState;
    KeyValueStore keyValueStore;
    bool inNum;
    bool inArray;
public:
    JsonParserStateMachine(): keyValueStore(9), currentState(state::q0), inNum(false),inArray(false){}

    state getState()
    {
        return this->currentState;
    }

    void print()
    {
        keyValueStore.print();
    }


    std::string getReview()
    {
        return keyValueStore.getReview();
    }

    state changeState(char c)
    {
        switch(currentState)
        {
            case state::q0:
                if(c == ' ') break;
                else if(c == '{') this->currentState = state::q1;
                else this->currentState = state::h;
                break;
            case state::q1:
                if(c == ' ') break;
                else if(c == '\"') this->currentState = state::q2;
                else this->currentState = state::h;
                break;
            case state::q2:
                if(c == '\"'){
                    this->currentState = state::q3;
                    this->keyValueStore.updateKeyPosition();
                    break;
                }
                else{
                    this->keyValueStore.putKey(c);
                    break;
                } 
            case state::q3:
                if(c == ':') this->currentState = state::q4;
                else if(c == ' ') {

                }
                else {
                    //std::cout<<"From Q3"<<std::endl;
                    this->currentState = state::h;
                }break;
            case state::q4:
                if(c == '\"' || c == '[') {
                    this->currentState = state::q5;
                    inArray = c == '[' ? true: false; 

                }else if(c == ' ') break;
                else {
                    //std::cout<<"Got Here"<<std::endl;
                    inNum = true;
                    this->currentState = state::q5;
                    this->keyValueStore.putValue(c);
                }
                break;
            case state::q5:
                if(c == '\"' || c == ']'){
                    this->currentState = state::q6;
                    this->keyValueStore.updateValuePosition();
                    inArray = c == ']'? false: true;
                    break;
                }else if(inNum && c == ',' ){
                    this->currentState = state::q1;
                    this->keyValueStore.updateValuePosition();
                    inNum = false;
                }
                else{
                    this->keyValueStore.putValue(c);
                    break;
                } 
            case state::q6:
                if(c == ','){
                    this->currentState = state::q1;
                    break;
                }else if(c == ' '){
                    break;
                }else{
                    //std::cout<<"From Q6"<<std::endl;
                    this->currentState = state::h;
                }
        }

        return this->currentState;
    }
};

class Review{

    std::string reviewText;
    int overall;
    std::string summary;
public:
    void pusReviewText(std::string reviewText)
    {
        this->reviewText = reviewText;
    }

    void putOverall(int overall)
    {
        this->overall = overall;
    }


    void putSummary(std::string summary)
    {
        this->summary = summary;
    }

    std::string getReviewText()
    {
        return this->reviewText;
    }
    int getOverall()
    {
        return this->overall;
    }
    std::string getSummary()
    {
        return this->summary;
    }
};

int main() 
{ 
    std::ifstream file("aggressive_dedup.json", std::ifstream::in);
    std::ofstream outFile("processed.json", std::ofstream::out);
    std::string str;
    int a; 
    long long count = 0;


    while (std::getline(file, str))
    {

        JsonParserStateMachine jsonParserStateMachine;
        for(char &c : str) jsonParserStateMachine.changeState(c);
        //std::cout<<jsonParserStateMachine.getReview();
        outFile << jsonParserStateMachine.getReview();
        if(++count % 1000 == 0) {
            std::cout<<count<<" Processed\n";
            outFile.flush();
            return 0;
        }
    }
    outFile.close();
    return 0;
}

Solution

  • The problem comes from your KeyValueStore class:

    KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
    {
        this->keys = new std::string[length];
        this->values = new std::string[length];
        ...
    

    Nothing ever deletes these arrays. Deleting them in a destructor is the simple fix:

    ~KeyValueStore() {
      delete[] this->keys;
      delete[] this->values;
    }
    

    However! You really should consider using std::vector<std::string> instead. Or even better, rebuild the entire thing around a std::unordered_map<std::string, std::string> instead.