Search code examples
calgorithmdata-miningsamplingreservoir-sampling

Reservoir Sampling algorithm not wroking


I have a project for my data mining class, in which I have to code the reservoir sampling algorithm for files. The program takes as input a number k, the name of the input file, and the name of the output file to be created. The output file must contain k random lines from the input. I tried some stuff but the output is wrong.

This is the code I use:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

int countLines(FILE* file)
{
    char ch,lines=0;

    while ((ch=fgetc(file)) != EOF)
        if (ch=='\n')
            lines++;
    return(lines);
}

void itemSelection(FILE* fp1, FILE* fp2, int k)
{
    int i,j,n,test=0;
    char line[256];
    char** buffer;

    srand((unsigned int) time(NULL));
    buffer = (char**)malloc(sizeof(char*));
    for(i=0;i<k;i++)
        buffer[i] = (char*)malloc(256*sizeof(char));
    n = countLines(fp1);
    if(k>n)
    {
        rewind(fp1);
        while(fgets(line, 256, fp1)!=NULL)
        {
            printf("%s test\n",line); 
            fprintf(fp2,"%s",line); 
        }
    }
    else
    {
        rewind(fp1);    
        for(i=0;i<k;i++)
        {
            fgets(line, 256, fp1);
            buffer[i]=line;
            printf("first k lines:\t%s\n",buffer[i]);
        }
        for(i=k;i<n;i++)
        {
            fgets(line,256,fp1);
            printf("line is:\t%s.\n", line);
            j = rand() % (i+1);
            if(j<k)
            {
                buffer[j]=line;
                printf("later parts are:\t%s. J is:%d.\n",buffer[j],j);
            }
        }
    }
    for(i=0;i<k;i++)
        printf("buffer test:\t%s.\n", buffer[i]);
}

void printFunc(FILE* fp2,int k)
{
    char line[256];
    int i;

    rewind(fp2);
    for(i=0;i<k;i++)
    {
        fgets(line, 256, fp2);
        printf("print test is:\t%s.\n",line);
    }   
}

void main(int args, char** argv)
{
    FILE* fp1;
    FILE* fp2;
    int k;

    if(args<4)
    {
        printf("Expected more arguments!\n");
        exit(-1);
    }

    fp1 = fopen(argv[2],"r");
    if(fp1 == NULL)
    {
        printf("Could not open input file!\n");
        perror("Error: ");
        exit(-1);
    }
    fp2 = fopen(argv[3],"w");
    if(fp2 == NULL)
    {
        printf("Could not open output file!\n");
        perror("Error: ");
        exit(-1);
    }
    k = atoi(argv[1]);
    itemSelection(fp1, fp2, k);
    printFunc(fp2,k);
    fclose(fp1);
    fclose(fp2);
}

What this program tries to do is read the k first lines from the file and store it in a 2-dimensional string array of (k,256) size. Then for every next line generate a random number j, and if that number is smaller than k, replace the buffer[j] with the most recently line taken from the file.

However the output I get consists of k lines of }, which is the last character of the input. Like this(e.g k=5):

}
}
}
}
}

When I print the buffer to see its content, it shows correctly. But when I write to the file it writes the wrong output.

Any help would be highly appreciated! Thank you in advance!


Solution

  • When you pick the lines, you must copy the line's content, not the pointer (which is always line, whose contents will constantly be overwritten):

    buffer[i]=line;
    

    should be

    strcpy(buffer[i], line);
    

    and likewise for buffer[j].

    You also have an allocation error:

    buffer = (char**)malloc(sizeof(char*));
    

    should be:

    buffer = malloc(k * sizeof(char*));
    

    to make room for k strings (and to heed popular advice about casting the result of malloc in C). You should also think about what you want to do with buffer: Do you return it, so that client code can use it (and has to free it) or is it local to itemSelection, which should free it before returning. At the moment you do neither.

    And finally, in your countLines function, the variables should be int, not char: fgetc returns an int so you can distibguish between all valid (unsigned) char values and the special value EOF.