Search code examples
c++segmentation-faultstring-parsing

Parsing text file into list gives segmentation fault


I'm getting a segmentation fault while trying to parse a big text file. The file contains 91 529 mRNA transcripts and details about these transcripts. I've created a RefSeqTranscript object that will take these details. When I parse the file, I create a list of these objects and start putting the details into these lists. It works fine for the first 1829 transcripts and then crashes with a segmentation fault. The method I'm running is:

// Reads the file at `filepath` line by line and appends one RefSeqTranscript
// to `transcripts` for every record found. A record is flushed (printed,
// copied into the list) when the next "LOCUS" line is seen, and once more
// after the last line of the file.
//
// transcripts  list that receives a copy of every parsed transcript
// filepath     path of the text file to read
//
// NOTE(review): the result of infile.open() is never checked; if the file
// cannot be opened the loop body never runs and a single default-constructed
// transcript is still appended at the end.
void TranscriptGBFFParser::ParseFile(list<RefSeqTranscript> &transcripts, const char* filepath)
{
    cout << "Parsing " << filepath << "..." << endl;

    ifstream infile;
    infile.open(filepath);

    // Running count of flushed transcripts; only used in the progress output.
    int num = 0;
    // Record currently being filled in. Owned by this function via raw new/delete.
    RefSeqTranscript *transcript = new RefSeqTranscript();
    for(string line; getline(infile, line); )
    {
        // `in` is not declared in this function - presumably a string stream
        // member of the class. Nothing in this function reads from it afterwards.
        in.clear();
        in.str(line);

        // "LOCUS" starts a new record: flush the one collected so far.
        if (boost::starts_with(line, "LOCUS"))
        {
            // Skip the flush when no VERSION line has set an accession yet
            // (e.g. on the very first LOCUS line).
            if((*transcript).transcriptRefSeqAcc.size() > 0)
            {           
                cout << (*transcript).transcriptRefSeqAcc << ":" << (*transcript).gi << ":" << (*transcript).gene.geneName << ":" << ++num << endl; 

                // The list stores a copy, so the heap object can be released.
                transcripts.push_back(*transcript); 
                delete transcript;

                // NOTE(review): this is a declaration, not an assignment. It
                // introduces a new `transcript` scoped to this inner block that
                // shadows the outer pointer. The object allocated here is never
                // deleted, and the outer `transcript` still holds the address
                // freed on the previous line, so every later dereference (and
                // the next `delete`) touches freed memory: undefined behaviour.
                RefSeqTranscript *transcript = new RefSeqTranscript();  

            }   
        }
        // A line starting with "     var" adds one default-constructed
        // variation to the current transcript; the line's content is not parsed.
        else if (boost::starts_with(line, "     var"))
        {
            TranscriptVariation variant;
            (*transcript).variations.push_back(variant);            
        }
        //Store the definition of the transcript in the description attribute
        else if (boost::starts_with(line, "DEFINITION"))
        {           
            // The text starts at column 12; substr(12) throws std::out_of_range
            // if the line is shorter than that.
            (*transcript).description = line.substr(12);

            // Append continuation lines until the "ACCESSION" line. The init
            // expression `line;` is a no-op. The ACCESSION line itself is
            // consumed here and not processed any further.
            for(line; getline(infile, line); )
            {
                if(boost::starts_with(line, "ACCESSION   "))
                    break;

                (*transcript).description += line.substr(12);
            }       
        }
        //The accession number and GI number are obtained from the VERSION line
        else if (boost::starts_with(line, "VERSION"))
        {
            string versions = line.substr(12);
            vector<string> strs;
            // is_any_of(" GI:") treats ' ', 'G', 'I' and ':' as individual
            // separator characters (not the substring "GI:"); adjacent
            // separators are merged by token_compress_on.
            // NOTE(review): an accession containing 'G' or 'I' would be split
            // as well - confirm against the accession format in the input.
            boost::split(strs, versions, boost::is_any_of( " GI:" ), boost::token_compress_on);
            boost::trim_left(strs[0]);

            // NOTE(review): strs[1] is read without checking that the split
            // produced a second token.
            (*transcript).transcriptRefSeqAcc = strs[0];
            (*transcript).gi = atoi(strs[1].c_str());
        }
        //Gene information is obtained from the "gene" sections of each transcript
        else if (boost::starts_with(line, "     gene"))
        {           
            // Scan forward for the first line whose text from column 21 starts
            // with "/gene=". substr(21) throws std::out_of_range for any line
            // shorter than 21 characters encountered on the way.
            for(line; getline(infile, line); )
            {
                if(boost::starts_with(line.substr(21), "/gene="))
                {
                    // Temporary heap object; it is copied into the transcript
                    // and deleted again a few lines below.
                    Gene *gene = new Gene();

                    // Column 27 is just past "/gene=". Utilities::trim is not
                    // shown here - presumably it strips the surrounding quotes.
                    string name = line.substr(27);
                    Utilities::trim(name, '\"');

                    (*gene).geneName = name;

                    (*transcript).gene = *gene;

                    delete gene;
                    break;
                }
            }
            // The gene ID is not read from the file; it is always set to 0.
            (*transcript).gene.geneID = 0;      
        }
        // CDS and ORIGIN lines only reset the corresponding fields to empty
        // strings; their content is not parsed.
        else if (boost::starts_with(line, "     CDS"))
        {
            (*transcript).proteinRefSeqAcc = "";            
        }
        else if (boost::starts_with(line, "ORIGIN"))
        {
            (*transcript).sequence = "";            
        }       
    }

    // Flush the last record. Unlike the LOCUS branch, this happens
    // unconditionally, even if no accession was ever set.
    cout << (*transcript).transcriptRefSeqAcc << ":" << (*transcript).gi << ":" << (*transcript).gene.geneName << endl;

    transcripts.push_back(*transcript); 
    delete transcript;          

    cout << "No. transcripts: " << transcripts.size() << endl;
    cout << flush;

    infile.close();

    cout << "Finished parsing " << filepath << "." << endl; 
}

I'm new to C++ and don't have a great understanding of how to work with pointers, etc., so I'm guessing I might have done something wrong there. I don't understand why it would work for almost 2000 objects before crashing, though.

The file I'm parsing is 2.1 GB and consists of about 44 000 000 lines so any tips on how to improve the efficiency would also be much appreciated.


Solution

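  • This is probably not the only issue, but you have a leak and, more importantly, a dangling pointer. The statement after `delete transcript;` declares a brand-new local variable that shadows the outer `transcript` instead of assigning to it. The new object is never freed, and the outer pointer keeps pointing at the object you just deleted, so every later use of it is undefined behaviour (which is why it can appear to work for a while before crashing):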

        if (boost::starts_with(line, "LOCUS"))
        {
            if((*transcript).transcriptRefSeqAcc.size() > 0)
            {           
                cout << (*transcript).transcriptRefSeqAcc << ":" << (*transcript).gi << ":" << (*transcript).gene.geneName << ":" << ++num << endl; 
    
                transcripts.push_back(*transcript); 
                delete transcript;
                // LEAK, and a dangling pointer: the next line declares a new
                // block-local `transcript` that shadows the outer one. The new
                // object is lost when this block ends, and the outer pointer
                // still refers to the object deleted on the line above.
                RefSeqTranscript *transcript = new RefSeqTranscript();  
    
            }   
        }
    

    You probably mean:

    // Assign to the existing outer pointer (no type in front), so that it
    // refers to the fresh object rather than the one that was just deleted.
    transcript = new RefSeqTranscript();