I'm having a Segmentation Fault when accessing an array inside a for loop. What I'm trying to do is to generate all subsequences of a DNA string.
It was happening when I created the array inside the for. After reading for a while, I found out that the openmp limits the stack size, so it would be safer to use the heap instead. So I change the code to use malloc, but the problem persists.
This is the full code:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#define DNA_SIZE 26
#define DNA "AGTC"
static char** powerset(int argc, char* argv)
{
unsigned int i, j, bits, i_max = 1U << argc;
if (argc >= sizeof(i) * CHAR_BIT) {
fprintf(stderr, "Error: set too large\n");
exit(1);
}
omp_set_num_threads(2);
char** subsequences = malloc(i_max*sizeof(char*));
#pragma omp parallel for shared(subsequences, argv)
for (i = 0; i < i_max ; ++i) {
//printf("{");
int characters = 0;
for (bits=i; bits ; bits>>=1)
if (bits & 1)
++characters;
//This is the line where the error is happening.
char *ss = malloc(characters+1 * sizeof(char)*16);//the *16 is just to save the cache lin
int ssindex = 0;
for (bits = i, j=0; bits; bits >>= 1, ++j) {
if (bits & 1) {
//char a = argv[j];
ss[ssindex++] = argv[j] ;
}
}
ss[ssindex] = '\0';
subsequences[i] = ss;
}
return subsequences;
}
char* getdna()
{
int i;
char *dna = (char *)malloc((DNA_SIZE+1) * sizeof(char));
for(i = 0; i < DNA_SIZE; i++)
{
int randomDNA = rand() % 4;
dna[i] = DNA[randomDNA];
}
dna[DNA_SIZE] = '\0';
return dna;
}
void printResult(char** ss, int size)
{
//PRINTING THE SUBSEQUENCES
printf("SUBSEQUENCES FOUND:\r\n");
int i;
for(i = 0; i < size; i++)
{
printf("%i.\t{ %s } \r\n",i+1 , ss[i]);
free(ss[i]);
}
free(ss);
}
int main(int argc, char* argv[])
{
srand(time(NULL));
double starttime, stoptime;
starttime = omp_get_wtime();
char* a = getdna();
printf("%s\r\n", a);
int size = pow(2, DNA_SIZE);
printf("number of subsequences: %i\r\n", size);
char** subsequences = powerset(DNA_SIZE, a);
//todo: make it optional printing to the stdout or saving to a file
//printResult(subsequences, size);
stoptime = omp_get_wtime();
printf("Tempo de execucao: %3.2f segundos\n\n", stoptime-starttime);
printf("Numero de sequencias geradas: %i\n\n", size);
free(a);
return 0;
}
I also tried to make the malloc line critical with the #pragma omp critical
which didn't help.
Also I tried to compile with -mstackrealign which also didn't work.
Appreciate all the help.
You should use a more efficient thread-safe memory management.
Applications can use either malloc()
and free()
explicitly, or implicitly in the compiler-generated code for dynamic/allocatable arrays, vectorized intrinsics, and so on.
The thread-safe malloc()
and free()
in some libc
implementations carry a high synchronization overhead caused by internal locking. Faster allocators for multi-threaded applications exist. For instance, on Solaris multithreaded applications should be linked with the "MT-hot" allocator mtmalloc
, (i.e., link with -lmtmalloc
to use mtmalloc
instead of the default libc allocator). glibc
, used on Linux and some OpenSolaris and FreeBSD distributions with GNU userlands, uses a modified ptmalloc2
allocator, which is based on Doug Lea's dlmalloc
. It uses multiple memory arenas to achieve near lock-free behavior. It can also be configured to use per-thread arenas and some distributions, notably RHEL 6 and derivates, have that feature enabled.
static char** powerset(int argc, char* argv)
{
int i, j, bits, i_max = 1U << argc;
if (argc >= sizeof(i) * CHAR_BIT) {
fprintf(stderr, "Error: set too large\n");
exit(1);
}
omp_set_num_threads(2);
char** subsequences = malloc(i_max*sizeof(char*));
int characters = 0;
for (i = 0; i < i_max ; ++i)
{
for (bits=i; bits ; bits>>=1)
if (bits & 1)
++characters;
subsequences[i] = malloc(characters+1 * sizeof(char)*16);
characters = 0;
}
#pragma omp parallel for shared(subsequences, argv) private(j,bits)
for (i = 0; i < i_max; ++i)
{
int ssindex = 0;
for (bits = i, j=0; bits; bits >>= 1, ++j) {
if (bits & 1) {
subsequences[i][ssindex++] = argv[j] ;
}
}
subsequences[i][ssindex] = '\0';
}
return subsequences;
}
I create (and allocate) the desired data before the parallel region, and then made the remaining calculations. The version above running with 12 threads in a 24 core machine takes "Tempo de execucao: 9.44 segundos".
However, when I try to parallelize the following code:
#pragma omp parallel for shared(subsequences) private(bits,characters)
for (i = 0; i < i_max ; ++i)
{
for (bits=i; bits ; bits>>=1)
if (bits & 1)
++characters;
subsequences[i] = malloc(characters+1 * sizeof(char)*16);
characters = 0;
}
it take "Tempo de execucao: 10.19 segundos"
As you can see calling malloc
in parallel leads to slower times.
Eventually, you would have had problems with the fact that each sub-malloc was trying to allocate (characters+1*DNA_SIZE*sizeof(char))
rather than ((characters+1)*DNA_SIZE*sizeof(char))
, and the multiplying by a factor for cache line size is not necessary inside the parallel section if I understand what you were trying to avoid.
There also seems to be some issue with this piece of code:
for (bits = i, j=0; bits; bits >>= 1, ++j) {
if (bits & 1) {
//char a = argv[j];
ss[ssindex++] = argv[j] ;
}
}
With this code, j
sometimes hits DNA_SIZE
or DNA_SIZE+1
, resulting in reading argv[j]
going off the end of the array. (Also, using argc
and argv
as names for arguments in this function is somewhat confusing.)