Tags: c, openmp, collatz, xeon-phi

Segmentation fault with high values (Xeon Phi)


I am working on a Collatz Conjecture problem using Xeon Phi through Stampede. My code has been tested and works fine for values up to 100,000, but when testing values up to 1 million, I receive a segmentation fault ("SIGSEGV") almost immediately. I've been banging my head against the wall for days, but simply cannot figure out the bug. Any help is truly appreciated.

typedef unsigned long long bigInt;

// Number to test up to (starting from 1)
#define bigSize     100000

typedef struct {
    int numSteps;
    bigInt stopPoint;
} batcher;

typedef struct {
    bigInt num;
    batcher to_batch;
} to_ret;

int main () {
    //Stores values as [num][#steps to smaller val][smaller val]
    to_ret retlist[bigSize];
    //Stores values as [#steps to smaller val][smaller val], sorted by num
    batcher results[bigSize];
    ...

    #pragma offload target(mic:0) inout(retlist) shared(retlist)
    {
        #pragma omp parallel for
        for(i = 1; i < bigSize; i++){
            retlist[i].num = i + 1;
            bigInt next = retlist[i].num;
            int count = 0;

            do {
                count++;

                if (next%2 == 1)
                    next=(3*next+1)/2;
                else
                    next/=2;

            } while(next > retlist[i].num);

            retlist[i].to_batch.numSteps = count;
            retlist[i].to_batch.stopPoint = next;
        }
    }

    ///Organizes data into a sorted array
    #pragma omp parallel for
    for (i = 0; i < bigSize; i++){
        results[retlist[i].num - 1] = retlist[i].to_batch;
    }
    ...
}

I'm pretty confident that the issue is somewhere in the code segment above.


Solution

  • The full code can be found on GitHub here, and while there are still a lot of efficiency issues with it (it could use vectorization support), this is what I've currently landed on, using the suggestion by barak-manos (see the note after the listing):

    typedef unsigned long long bigInt;
    
    /// Number to test up to (starting from 1)
    #define bigSize     1000000000 //340282366920938463463374607431768211455
    
    typedef struct {
        int numSteps;
        bigInt stopPoint;
    } batcher;
    
    typedef struct {
        bigInt num;
        batcher to_batch;
    } to_ret;
    
    __attribute__((target(mic))) to_ret retlist[bigSize]; ///Stores values as [num][#steps to smaller val][smaller val]
    __attribute__((target(mic))) batcher results[bigSize]; ///Stores values as [#steps to smaller val][smaller val] & is sorted by num
    
    
    int main () {
        bigInt j;
        double start, end;
    
        retlist[0].num = 1; retlist[0].to_batch.numSteps = 0; retlist[0].to_batch.stopPoint = 1;
        start = omp_get_wtime();
    
        #pragma offload target(mic:0) out(results)
        {
            int count;
            bigInt i, next;
    
            // next and count are declared outside the loop, so they must be made
            // private here or the threads race on them (i, as the loop index, is
            // privatized automatically)
            #pragma omp parallel for private(next, count)
            for(i = 1; i < bigSize; i++){
                next = retlist[i].num = i + 1;
                count = 0;
    
                do {
                    count++;
    
                    if (next%2 == 1)
                        next=(3*next+1)/2;
                    else
                        next/=2;
    
                } while(next > retlist[i].num);
    
                retlist[i].to_batch.numSteps = count;
                retlist[i].to_batch.stopPoint = next;
            }
    
            ///Organizes data into a sorted array
            #pragma omp parallel for
            for (i = 0; i < bigSize; i++){
                results[i] = retlist[i].to_batch;
            }
        }
    ...
    
        ///Extends each entry's count (steps down to its stop point) into a full
        ///count of steps down to 1; smaller numbers are completed first
        for(j = 0; j < bigSize; j++){
            results[j].numSteps += results[results[j].stopPoint-1].numSteps;
        }
    
        return(0);
    }
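
    For anyone wondering why the original version crashed: retlist and results used to be automatic arrays inside main, so both lived on main's stack. On a typical 64-bit build a to_ret is 24 bytes and a batcher 16, so at a million entries that's roughly 40 MB of stack, far beyond the usual 8 MB limit, which is why the fault hit immediately (and why 100,000 entries, at about 4 MB, still worked). Declaring the arrays at file scope, as above, puts them in static storage instead. Here is a minimal sketch of just that difference, with no offload involved; the names are only illustrative and the 8 MB figure is the common default, not something Stampede guarantees:

        #include <stdio.h>

        typedef unsigned long long bigInt;
        typedef struct { int numSteps; bigInt stopPoint; } batcher;
        typedef struct { bigInt num;   batcher to_batch; } to_ret;

        #define N 1000000   /* one million entries, as in the failing run */

        /* File-scope arrays live in static storage (.bss), not on the stack,
           so their size is bounded by memory rather than by the stack limit. */
        static to_ret  global_retlist[N];
        static batcher global_results[N];

        int main () {
            /* to_ret retlist[N];    // ~24 MB of automatic storage: with a
               batcher results[N];   // typical 8 MB stack these fault on
                                     // first touch, long before any real work */
            global_retlist[N - 1].num = N;
            global_results[N - 1].stopPoint = 1;
            printf("%llu %llu\n", global_retlist[N - 1].num,
                   global_results[N - 1].stopPoint);
            return 0;
        }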
    

    If anyone's interested, please feel free to create a fork of my project.
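
    One more note, since the final loop in the listing (the one adding results[results[j].stopPoint-1].numSteps) can look cryptic: each entry only records the number of steps needed to fall to the first value at or below its own number, so full step counts are assembled afterwards by walking the array in increasing order and adding in the already completed total of that stop point. Below is a tiny standalone sketch of the same two-phase idea, indexed by the numbers themselves rather than by number - 1, just to keep it readable:

        #include <stdio.h>

        typedef unsigned long long bigInt;

        #define N 10   /* tiny range, just to show the idea */

        int main () {
            int    steps[N + 1];  /* phase 1: steps until the orbit drops to <= n;
                                     phase 2: total steps all the way down to 1   */
            bigInt stop[N + 1];   /* first value <= n that the orbit reaches      */

            steps[1] = 0; stop[1] = 1;

            /* Phase 1: count shortcut Collatz steps (odd n -> (3n+1)/2)
               until the value is no longer above the starting number. */
            for (bigInt n = 2; n <= N; n++) {
                bigInt next = n;
                int count = 0;
                do {
                    count++;
                    next = (next % 2 == 1) ? (3 * next + 1) / 2 : next / 2;
                } while (next > n);
                steps[n] = count;
                stop[n]  = next;
            }

            /* Phase 2: for every n >= 2 here, stop[n] < n, so in increasing
               order steps[stop[n]] is already a finished total and a single
               addition completes steps[n].  E.g. 3 -> 5 -> 8 -> 4 -> 2 is
               4 steps, plus steps[2] = 1, giving steps[3] = 5. */
            for (bigInt n = 2; n <= N; n++)
                steps[n] += steps[stop[n]];

            for (bigInt n = 1; n <= N; n++)
                printf("%llu: %d steps\n", n, steps[n]);
            return 0;
        }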