Search code examples
c++fread

Data lost when reading a big file in chunks using fread


I am a C++ novice. These are NYSE trade data, which I am using for quantitative analysis. A single file for one day is about 10 GB, and I have hundreds of such files. As a first step, I just want to make sure that I can read the data in chunks correctly; later I will manipulate those chunks. I was told that Python would be slow, so I am trying C++. I read one file in chunks using fread() and count the number of lines to check my code, but the count differs from the number of lines that Notepad++ reports. I hope someone can help me with this problem. Thanks.

Data:

Q,14340,EUR/NZD,1.65027,1,1.6504,1
T,14340,EUR/NZD,1,1.65034,@,70,X
Q,14340,AUD/NZD,1.03427,1,1.03437,1
T,14340,AUD/NZD,1,1.03432,@,70,X
Q,14340,CAD/CHF,0.75142,1,0.75146,1
T,14340,CAD/CHF,1,0.75144,@,70,X
Q,14340,GBP/NZD,1.90908,1,1.90927,1
T,14340,GBP/NZD,1,1.90918,@,70,X
Q,14340,GBP/CHF,1.312,1,1.31208,1
T,14340,GBP/CHF,1,1.31204,@,70,X
Q,13724,#6S,0.9928,12,0.9929,29

number of lines using fread: 279 174 248

number of lines using notepad++:279 485 508

Wrong Code:

#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
using namespace std;

int main()
{
    clock_t start = clock();
    char buffer[100000]="\0";
    int cursor = sizeof(buffer);
    FILE* fp;
    int Judge;
    int offset = 0;
    Judge = fopen_s(&fp, "E:\\feedRec\\TFD20190227", "r");
    int count = 0;
    int num = 0;
    while (1)
    {
        //read one chunk
        num=fread(buffer, sizeof(char), (sizeof(buffer) - 1), fp);
        // null terminated the buffer
        buffer[num] = '\0';
        char* ptr=buffer;
        while (*ptr!='\0') {
            if (*ptr == '\n') {
                count++;
            }
            ptr++;
        }
        //Since the lines are not formatted. The buffer will end at
        //the middle of the line. So I need to move back the pointer 
        //back to the beginning of the sliced line to let it be read in 
        //the next loop. 
        //T,14340,AUD/NZD,1,1.03432,@,70,X
        //if buffer end here|
        //pointer need to be move back to the beginning.
        //T,14340,AUD/NZD,1,1.03432,@,70,X
        //|
        //calculte the offset
        offset = 0;
        cursor = sizeof(buffer)-1;
        while (buffer[cursor] != '\n') {
            cursor--;
            offset++;
        }
        //move back the pointer 
        if (offset > 0 && (num == sizeof(buffer) - 1)) {
            fseek(fp, -offset, SEEK_CUR);
        }
        //clear the buffer
        memset(buffer, '\0', sizeof(buffer) - 1);
        // deal with last chunk: if the num less than buffer size, stop
        if (num < sizeof(buffer)-1) 
        {
            fclose(fp); break; 
        }
    }
    //count the last chunk
    char* ptr = buffer;
    while (*ptr != '\0') {
        if (*ptr == '\n') {
            count++;
        }
        ptr++;
    }
    cout << "count: " << count << endl;
    clock_t end = clock();
    cout << "time : " << ((double)end - start) / CLOCKS_PER_SEC << "s\n";
    fclose(fp);
    return 0;
}

Solution

  • Some lines contain NUL characters, so using "*ptr!='\0'" to determine whether the end of the buffer has been reached is wrong:

    T,14369,GBP/USD,1,1.33012,@,70,X LF
    T,14370,TIME$,1,9453,@,40, NUL LF
    Q,10516,#YKH9,901.125,1,911.875,2 LF

    The loop terminated after "@,40,", at the NUL byte, so the rest of that chunk was never scanned.

    clock_t start = clock();
    char buffer[300]="\0";
    int cursor = sizeof(buffer);
    FILE* fp;
    int Judge;
    int offset = 0;
    Judge = fopen_s(&fp, "E:\\feedRec\\TFD20190228", "rb");
    int count = 0;
    size_t num = 0;
    while (1)
    {
        num=fread(buffer, sizeof(char), (sizeof(buffer) - 1), fp);
    
        buffer[num] = '\0';
    
    
        char* ptr=buffer;
        char* endptr = &buffer[num];
        int ct = 0;
        if (num == sizeof(buffer) - 1) 
        {     
            while (ptr != endptr) {
                ct++;
                if (*ptr == '\n') {
                    count++;
                }
                ptr++;
            }
        }
        offset = 0;
        cursor = sizeof(buffer)-1;
        while (buffer[cursor-1] != '\n') {
            cursor--;
            offset++;
        }
    
        if (offset > 0 && (num == sizeof(buffer) - 1)) {
            fseek(fp, -offset, SEEK_CUR);
        }
        
        if (num < sizeof(buffer)-1) 
        {
            fclose(fp); break; 
        }
        memset(buffer, '\0', sizeof(buffer) - 1);
    }
    
    char* ptr = buffer;
    char* endptr = &buffer[num];
    int ct = 0;
    if (num == sizeof(buffer) - 1)
    {
        while (ptr != endptr) {
            ct++;
            if (*ptr == '\n') {
                count++;
            }
            ptr++;
        }
    }
    cout << "count: " << count << endl;
    clock_t end = clock();
    cout << "time : " << ((double)end - start) / CLOCKS_PER_SEC << "s\n";
    fclose(fp);
    return 0;