Search code examples
cbin

C: Compressing a DNA sequence into binary


Hi, I'm trying to work out how to write a function that converts a DNA sequence into binary bits. It needs to look like this: ACGTT -> XXXXXX11 11100100, filled from right to left, with A=0, C=1, G=2, T=3. I was thinking of doing it character by character and then shifting the result by 2 bits (>>2) outside the function, but I can't figure out the function itself. I tried it like this, and this way it returns NULL.

/*
 * Attempt at encoding a single nucleotide character as two bit characters.
 * An 8-character buffer is pre-filled with 'X' and its last two slots
 * (temp[6], temp[7]) are overwritten with '0'/'1':
 *   A -> "00", C -> "01", G -> "10", T -> "11".
 * Any other input leaves the buffer as all 'X'.
 * NOTE(review): the function is declared to return a single char but
 * returns the local array temp, so the caller never receives the buffer
 * contents.
 */
char CompressChar(char c){
char temp[8]="XXXXXXXX"; /* exactly 8 characters: no room for a terminating '\0' */
if (c=='A'){
 temp[7] = '0';
 temp[6] = '0';
}
if (c=='C'){
    temp[7]='1';
    temp[6]='0';
}
if(c=='G'){

    temp[7]='0';
    temp[6]='1';
}

if(c=='T'){
   temp[7]='1';
   temp[6]='1';
}
return temp; /* temp is a local array; here it is converted to a pointer and then to char */

}


Solution

  • I take it that you want each letter (they are called nucleotide iirc?) to represent 2 binary digits.

    To begin with, char temp[8]="XXXXXXXX"; doesn't make sense because strings in C are null terminated and you don't allocate room for the null terminator, should be [8+1]. And you want 16 bits, not 8.

    Your function returns a char, which is incorrect; you would need to return a whole array. That has to be done through a parameter, since you can't return arrays in C. Nor can you return a pointer to local data, so it would have been better to leave allocation to the caller. Anyway... scratch that function.

    In addition, it doesn't make much sense to convert this into a "binary string". It is better to convert it into a binary number, and then as needed, convert that number into a string.

    The conversion, including the "backwards order", could then be done like this:

    // Pack up to eight nucleotides from str into a 16-bit number, two bits each
    // (A=0, C=1, G=2, T=3). The first character ends up in bits 1..0, the second
    // in bits 3..2, and so on ("backwards order"). Any other character counts
    // as 0, characters beyond the eighth are ignored, and unused fields stay 0.
    uint16_t dna_to_bin (const char* str)
    {
      enum { BITS_PER_BASE = 2, RESULT_BITS = 16 };
      uint16_t packed = 0;

      // one step per 2-bit field; stop early when the string ends
      for(unsigned shift = 0; shift < RESULT_BITS && *str != '\0'; shift += BITS_PER_BASE, str++)
      {
        const char base = *str;
        const unsigned code = (base == 'C') ? 1u
                            : (base == 'G') ? 2u
                            : (base == 'T') ? 3u
                            : 0u;                  // 'A' and anything unrecognized

        packed |= (uint16_t)(code << shift);       // place the field at its position
      }

      return packed;
    }
    

    Full example including a function that prints binary and discards leading zeroes below. If you want to replace leading zeroes with X, it should be trivial to modify.

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <stdbool.h>
    
    // Pack up to eight nucleotides from str into a 16-bit number, two bits each
    // (A=0, C=1, G=2, T=3). The first character ends up in bits 1..0, the second
    // in bits 3..2, and so on. Any other character counts as 0, characters
    // beyond the eighth are ignored, and unused fields stay 0.
    uint16_t dna_to_bin (const char* str)
    {
      enum { BITS_PER_BASE = 2, RESULT_BITS = 16 };
      uint16_t packed = 0;

      // one step per 2-bit field; stop early when the string ends
      for(unsigned shift = 0; shift < RESULT_BITS && *str != '\0'; shift += BITS_PER_BASE, str++)
      {
        const char base = *str;
        const unsigned code = (base == 'C') ? 1u
                            : (base == 'G') ? 2u
                            : (base == 'T') ? 3u
                            : 0u;                  // 'A' and anything unrecognized

        packed |= (uint16_t)(code << shift);       // place the field at its position
      }

      return packed;
    }
    
    
    // Print bin to stdout as binary digits, most significant bit first,
    // with leading zeroes discarded. The least significant digit is always
    // printed, so a value of 0 appears as "0" instead of producing no output.
    // No newline is written.
    void print_bin (uint16_t bin)
    {
      bool remove_zeroes = true;

      for(size_t i=0; i<16; i++)
      {
        const bool bit = (bin & (1u << (16-1-i))) != 0;

        // stop discarding at the first 1 bit, or at the last digit at the latest
        if(bit || i == 16-1)
        {
          remove_zeroes = false;
        }

        if(!remove_zeroes)
        {
          putchar(bit ? '1' : '0');
        }
      }
    }
    
    
    // Demo: encode a sample sequence and print it as text, as a 4-digit
    // hex number and as binary digits.
    int main (void)
    {
      const char sample[] = "ACGTT";
      const uint16_t packed = dna_to_bin(sample);

      printf("%s\n", sample);
      printf("Hex: %.4"PRIX16 "\n", packed);
      fputs("Bin: ", stdout);
      print_bin(packed);

      return 0;
    }
    

    Output:

    ACGTT
    Hex: 03E4
    Bin: 1111100100