Search code examples
c++bsp

weirdest bug ever - int definition causes sigsegv. stack overflow?


I'm using the BSPlib and when adding a simple definition of "int i" on the function running on multiple threads (along with many other) I get a message like "Process 2 caught SIGNAL 11 Segmantation fault ". it's important to note that I checked alot and without it I don't get the segmentation fault and with it I get it almost all the time. how could int definition cause it? is there a stack overflow I might have caused? thanks.

int P;
int main( int argc, char* argv[] )
{
    /** sequentail - process 0 */
    P=bsp_nprocs(); /// maximum number of process avialble (must do that on sequential part ,need for bsp begin)
    bsp_begin(P);
    char* str1;
    char* str2;
    int n;
    int** table;
    int thread=bsp_pid();
    int num_threads=bsp_nprocs();
    if(thread == 0)
    {
        ifstream file1(argv[1]);
        ifstream file2(argv[2]);
        // check if the strings are the same size RDBG
        string string1((istreambuf_iterator<char>(file1)), istreambuf_iterator<char>());
        string string2((istreambuf_iterator<char>(file2)), istreambuf_iterator<char>());
        n=string1.length();
        str1= (char*)malloc(sizeof(char)*(n+1));
        str2= (char*)malloc(sizeof(char)*(n+1));
        strcpy(str1,string1.c_str());
        strcpy(str2,string2.c_str());
    }
    if (thread!=0)
    {
        str1= (char*)malloc(sizeof(char)*(n+1));
        str2= (char*)malloc(sizeof(char)*(n+1));
    }
    bsp_push_reg(&n,SZINT);
    bsp_sync();
    bsp_get(0,&n,0,&n,SZINT);
    bsp_sync();
    if (thread==0)
    {
        table=(int**)malloc(sizeof(int)*(n+1));
        for (int i=0; i<n+1; i++)
            table[i]=(int*)malloc(sizeof(int)*(n+1));
    }
    bsp_push_reg(str1,SZCHAR*(n+1));
    bsp_push_reg(str2,SZCHAR*(n+1));
    bsp_push_reg(table,n*n*SZINT);
    bsp_sync();
    if (thread==0)
    {
        for(int t=1; t<num_threads; t++)
            for (int k=0; k<=n; k++)
            {
                bsp_put(t,str1+k,str1,k*SZCHAR,SZCHAR);
                bsp_put(t,str2+k,str2,k*SZCHAR,SZCHAR);
            }
    }
    bsp_sync();
    cout << thread << "!!!" << str1 << ";" << str2 << endl;
    int i;
    bsp_sync();
    bsp_pop_reg(table);
    bsp_pop_reg(str2);
    bsp_pop_reg(str1);
    bsp_pop_reg(&n);
    bsp_sync();
    free(str1);
    free(str2);
    bsp_sync();
    bsp_end();
    return 0;
}

Solution

  • Your declaration/initialization of the table variable is incorrect. You are initializing it as an array of arrays (i.e. as n+1 distinct memory blocks), whereas you are telling bsplib that it is a a contiguous memory block of n*n ints. You either need to change your allocation or the registration.

    As a consequence, bsplib overwrites memory that isn't initialized at all.