I am new to C++. These are NYSE trade data, which I use for quantitative analysis. A single file for one day is about 10 GB, and I have hundreds of such files. As a first step, I just want to make sure that I can read the data correctly in chunks; after that I will manipulate those chunks. I was told that Python would be slow, so I am trying C++. I read one file in chunks using fread(). To check my code I count the number of lines, but the result differs from the number of lines that Notepad++ reports. I hope someone can help me with this problem. Thanks.
Data:
Q,14340,EUR/NZD,1.65027,1,1.6504,1
T,14340,EUR/NZD,1,1.65034,@,70,X
Q,14340,AUD/NZD,1.03427,1,1.03437,1
T,14340,AUD/NZD,1,1.03432,@,70,X
Q,14340,CAD/CHF,0.75142,1,0.75146,1
T,14340,CAD/CHF,1,0.75144,@,70,X
Q,14340,GBP/NZD,1.90908,1,1.90927,1
T,14340,GBP/NZD,1,1.90918,@,70,X
Q,14340,GBP/CHF,1.312,1,1.31208,1
T,14340,GBP/CHF,1,1.31204,@,70,X
Q,13724,#6S,0.9928,12,0.9929,29
number of lines using fread: 279 174 248
number of lines using notepad++:279 485 508
Wrong Code:
#include <iostream>
#include <cstdio>
using namespace std;
// Reads a fixed file in chunks with fread() and prints the number of '\n'
// bytes found plus the elapsed CPU time.
//
// This listing is the faulty version; the NOTE(review) comments below mark
// the defects that are visible in the code. The code itself is unchanged.
//
// NOTE(review): clock()/clock_t and memset() are used here, but the visible
// includes are only <iostream> and <cstdio>; <ctime> and <cstring> are
// presumably pulled in transitively by the toolchain - confirm.
int main()
{
clock_t start = clock();
char buffer[100000]="\0";
int cursor = sizeof(buffer);
FILE* fp;
int Judge;
int offset = 0;
// NOTE(review): "r" requests text mode. On Windows CRTs text mode presumably
// translates CR LF and treats Ctrl-Z as end of file, which would make the
// byte arithmetic used for fseek() below unreliable - confirm against the
// CRT documentation.
// NOTE(review): the result stored in Judge is never checked, so a failed
// open goes on to call fread() with whatever fp holds.
Judge = fopen_s(&fp, "E:\\feedRec\\TFD20190227", "r");
int count = 0;
int num = 0;
while (1)
{
// Read one chunk, leaving one byte of room for the terminator written below.
num=fread(buffer, sizeof(char), (sizeof(buffer) - 1), fp);
// Null-terminate the bytes that were read.
buffer[num] = '\0';
char* ptr=buffer;
// Count the newlines in this chunk.
// NOTE(review): the scan stops at the first '\0' byte, not at buffer + num.
// If the data itself contains a NUL byte, the rest of the chunk is skipped
// and its newlines are never counted.
while (*ptr!='\0') {
if (*ptr == '\n') {
count++;
}
ptr++;
}
// The lines have no fixed length, so a chunk usually ends in the middle of
// a line, e.g.
//   T,14340,AUD/NZD,1,1.03432,@,70,X
//                 ^ chunk ends here
// The file position is therefore moved back to the start of that cut line,
//   T,14340,AUD/NZD,1,1.03432,@,70,X
//   ^
// so that the next fread() delivers the line in one piece.
// Compute how far to move back: scan from the end of the buffer to the
// last '\n'.
offset = 0;
cursor = sizeof(buffer)-1;
// NOTE(review): there is no lower bound on cursor; if the buffer holds no
// '\n' at all, the scan runs past buffer[0].
// NOTE(review): for a full chunk the scan starts on the terminator written
// above (index num), so offset ends up one larger than the number of bytes
// after the last '\n'. Assuming the file position advances by one per byte
// read, the fseek() below then lands on that '\n' itself and the next chunk
// counts it a second time.
while (buffer[cursor] != '\n') {
cursor--;
offset++;
}
// Move the file position back, but only after a full chunk.
// NOTE(review): num is an int compared against a size_t expression here and
// in the last-chunk test below (signed/unsigned comparison).
if (offset > 0 && (num == sizeof(buffer) - 1)) {
fseek(fp, -offset, SEEK_CUR);
}
// Clear the buffer.
// NOTE(review): this runs before the last-chunk test below, so the buffer is
// already zeroed when the loop is left.
memset(buffer, '\0', sizeof(buffer) - 1);
// Last chunk: fewer bytes than requested were read, so stop.
if (num < sizeof(buffer)-1)
{
fclose(fp); break;
}
}
// Intended to count the last chunk.
// NOTE(review): the last chunk was already scanned inside the loop and the
// buffer was zeroed by the memset() above, so *ptr is '\0' immediately and
// this loop adds nothing.
char* ptr = buffer;
while (*ptr != '\0') {
if (*ptr == '\n') {
count++;
}
ptr++;
}
cout << "count: " << count << endl;
clock_t end = clock();
cout << "time : " << ((double)end - start) / CLOCKS_PER_SEC << "s\n";
// NOTE(review): fp was already closed just before the break above; this is a
// second fclose() on the same stream.
fclose(fp);
return 0;
}
Some lines contain NUL characters, so using "*ptr!='\0'" to determine whether the end of the buffer has been reached is wrong:
T,14369,GBP/USD,1,1.33012,@,70,X LF
T,14370,TIME$,1,9453,@,40, NUL LF
Q,10516,#YKH9,901.125,1,911.875,2 LF
The counting loop terminated at the NUL character, right after "@,40,".
clock_t start = clock();
char buffer[300]="\0";
int cursor = sizeof(buffer);
FILE* fp;
int Judge;
int offset = 0;
Judge = fopen_s(&fp, "E:\\feedRec\\TFD20190228", "rb");
int count = 0;
size_t num = 0;
while (1)
{
num=fread(buffer, sizeof(char), (sizeof(buffer) - 1), fp);
buffer[num] = '\0';
char* ptr=buffer;
char* endptr = &buffer[num];
int ct = 0;
if (num == sizeof(buffer) - 1)
{
while (ptr != endptr) {
ct++;
if (*ptr == '\n') {
count++;
}
ptr++;
}
}
offset = 0;
cursor = sizeof(buffer)-1;
while (buffer[cursor-1] != '\n') {
cursor--;
offset++;
}
if (offset > 0 && (num == sizeof(buffer) - 1)) {
fseek(fp, -offset, SEEK_CUR);
}
if (num < sizeof(buffer)-1)
{
fclose(fp); break;
}
memset(buffer, '\0', sizeof(buffer) - 1);
}
char* ptr = buffer;
char* endptr = &buffer[num];
int ct = 0;
if (num == sizeof(buffer) - 1)
{
while (ptr != endptr) {
ct++;
if (*ptr == '\n') {
count++;
}
ptr++;
}
}
cout << "count: " << count << endl;
clock_t end = clock();
cout << "time : " << ((double)end - start) / CLOCKS_PER_SEC << "s\n";
fclose(fp);
return 0;