Search code examples
c character-encoding ascii ebcdic

Detect if execution character set letters are contiguous


In the code below, a switch is used to convert letters to contiguous values. Optimizing compilers generally won't handle it as well as the simple contiguous-digit condition right before the switch. How can I detect which execution character set is used and/or conclude that the letters are contiguous, so that the switch can be replaced with simple conditionals?

/*
 * Map a single base-36 digit character to its numeric value.
 *
 * '0'..'9' yield 0..9. A letter yields 10..35 by its position in the
 * alphabet, with lowercase and uppercase treated alike. Every other
 * character yields -1.
 *
 * The letters are matched one by one in a switch, so nothing is assumed
 * about how they are laid out in the execution character set.
 */
static long digit_value(char c)
{
    /* The decimal digits are contiguous, so a range test plus a subtraction is enough. */
    if ('0' <= c && c <= '9')
        return c - '0';

    switch (c) {
    case 'a': case 'A': return 10;
    case 'b': case 'B': return 11;
    case 'c': case 'C': return 12;
    case 'd': case 'D': return 13;
    case 'e': case 'E': return 14;
    case 'f': case 'F': return 15;
    case 'g': case 'G': return 16;
    case 'h': case 'H': return 17;
    case 'i': case 'I': return 18;
    case 'j': case 'J': return 19;
    case 'k': case 'K': return 20;
    case 'l': case 'L': return 21;
    case 'm': case 'M': return 22;
    case 'n': case 'N': return 23;
    case 'o': case 'O': return 24;
    case 'p': case 'P': return 25;
    case 'q': case 'Q': return 26;
    case 'r': case 'R': return 27;
    case 's': case 'S': return 28;
    case 't': case 'T': return 29;
    case 'u': case 'U': return 30;
    case 'v': case 'V': return 31;
    case 'w': case 'W': return 32;
    case 'x': case 'X': return 33;
    case 'y': case 'Y': return 34;
    case 'z': case 'Z': return 35;
    default:            return -1;  /* not a base-36 digit */
    }
}

Solution

  • How can I detect which execution character set is used and/or conclude that the letters are contiguous?

    At compile time, simply test every adjacent pair. (The lowercase letters 'a'-'z' are left out for brevity; they can be tested the same way.)

    /*
     * Compile-time guard: each uppercase letter must be encoded exactly one
     * above its predecessor. If any pair is not adjacent the build fails,
     * because the range arithmetic in digit_value() below would be wrong.
     */
    static_assert(
      'B' - 'A' == 1 && 'C' - 'B' == 1 && 'D' - 'C' == 1 && 'E' - 'D' == 1 &&
      'F' - 'E' == 1 && 'G' - 'F' == 1 && 'H' - 'G' == 1 && 'I' - 'H' == 1 &&
      'J' - 'I' == 1 && 'K' - 'J' == 1 && 'L' - 'K' == 1 && 'M' - 'L' == 1 &&
      'N' - 'M' == 1 && 'O' - 'N' == 1 && 'P' - 'O' == 1 && 'Q' - 'P' == 1 &&
      'R' - 'Q' == 1 && 'S' - 'R' == 1 && 'T' - 'S' == 1 && 'U' - 'T' == 1 &&
      'V' - 'U' == 1 && 'W' - 'V' == 1 && 'X' - 'W' == 1 && 'Y' - 'X' == 1 &&
      'Z' - 'Y' == 1,
      "Dinosaur: not continuous A-Z");
    
    /*
     * Convert a base-36 digit to its value: '0'..'9' give 0..9 and the
     * uppercase letters 'A'..'Z' give 10..35. Lowercase letters and all
     * other characters give -1.
     */
    static int digit_value(char c) {
      if ('0' <= c && c <= '9') {
        return c - '0';
      }
      if ('A' <= c && c <= 'Z') {
        /* Valid only because the assertion above proved A-Z are adjacent. */
        return 10 + (c - 'A');
      }
      return -1;
    }
    

    Other dinosaur tests.


    Or use the slow, but highly portable:

    /*
     * Convert a base-36 digit ('0'..'9', 'A'..'Z') to its value 0..35 by
     * finding its position in an explicit alphabet string, so nothing is
     * assumed about how the characters are encoded. Returns -1 for any
     * other character. Uses strchr() from <string.h>.
     */
    static int digit_value(char c) {
      static const char *alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
      const char *hit;
    
      /* strchr() would match the string's terminating null; that is not a digit. */
      if (c == '\0') {
        return -1;
      }
    
      hit = strchr(alphabet, (unsigned char) c);
      return hit ? (int) (hit - alphabet) : -1;
    }
    

    Or perhaps a big #if?

    /*
     * Pairwise adjacency test for 'A'..'Z'. It is a macro so that the very
     * same expression can be evaluated twice: once by the preprocessor in
     * #if, and once by the compiler proper in _Static_assert.
     */
    #define DIGIT_VALUE_AZ_CONTIGUOUS ( \
      'A' == ('B' - 1) && 'B' == ('C' - 1) && 'C' == ('D' - 1) && \
      'D' == ('E' - 1) && 'E' == ('F' - 1) && 'F' == ('G' - 1) && \
      'G' == ('H' - 1) && 'H' == ('I' - 1) && 'I' == ('J' - 1) && \
      'J' == ('K' - 1) && 'K' == ('L' - 1) && 'L' == ('M' - 1) && \
      'M' == ('N' - 1) && 'N' == ('O' - 1) && 'O' == ('P' - 1) && \
      'P' == ('Q' - 1) && 'Q' == ('R' - 1) && 'R' == ('S' - 1) && \
      'S' == ('T' - 1) && 'T' == ('U' - 1) && 'U' == ('V' - 1) && \
      'V' == ('W' - 1) && 'W' == ('X' - 1) && 'X' == ('Y' - 1) && \
      'Y' == ('Z' - 1))
    
    #if DIGIT_VALUE_AZ_CONTIGUOUS
    
    /*
     * The value a character constant has inside #if is implementation-defined
     * and need not equal its value in an ordinary expression. Re-check the
     * condition in the compiler proper so that a disagreement stops the build
     * instead of silently selecting range arithmetic that would be wrong.
     */
    _Static_assert(DIGIT_VALUE_AZ_CONTIGUOUS,
                   "A-Z contiguous in #if but not in the execution character set");
    
    /*
     * Fast path: convert a base-36 digit ('0'..'9', 'A'..'Z') to 0..35 with
     * range tests and subtraction. Returns -1 for any other character,
     * including lowercase letters.
     */
    static int digit_value(char c) {
      if (c >= '0' && c <= '9') return c - '0';
      if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
      return -1;
    }
    
    #else
    
    /*
     * Portable path: convert a base-36 digit ('0'..'9', 'A'..'Z') to 0..35 by
     * searching an explicit alphabet, with no assumption about the encoding.
     * Returns -1 for any other character. Uses strchr() from <string.h>.
     */
    static int digit_value(char c) {
      static const char *base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
      const char *p = strchr(base36, (unsigned char) c);
      /* *p is zero when strchr() matched the terminating null (c == 0). */
      if (p && *p) {
        return (int) (p - base36);
      }
      return -1;
    }
    
    #endif
    
    #undef DIGIT_VALUE_AZ_CONTIGUOUS
    

    Or, if UCHAR_MAX is not too big and speed is a concern, build a lookup table indexed by character and sidestep the contiguity question entirely.

    #include <limits.h>
    /*
     * Convert a base-36 digit to its value using a per-character lookup table.
     *
     * '0'..'9' yield 0..9; 'A'..'Z' and 'a'..'z' yield 10..35. Every other
     * character yields -1. Each character is listed explicitly, so nothing is
     * assumed about how letters are laid out in the execution character set.
     * Requires <limits.h> for UCHAR_MAX.
     */
    int digit_value(char c) {
      /*
       * The table is indexed by the character converted to unsigned char,
       * whose largest value is UCHAR_MAX, so it needs UCHAR_MAX + 1 slots.
       * Each entry stores value + 1; slots without an initializer are zero
       * and therefore decode to -1. static const: built once, never modified.
       */
      static const unsigned char val[UCHAR_MAX + 1] = {
          ['0'] = 1,  ['1'] = 2,  ['2'] = 3,  ['3'] = 4,  ['4'] = 5,
          ['5'] = 6,  ['6'] = 7,  ['7'] = 8,  ['8'] = 9,  ['9'] = 10,
          ['A'] = 11, ['B'] = 12, ['C'] = 13, ['D'] = 14, ['E'] = 15,
          ['F'] = 16, ['G'] = 17, ['H'] = 18, ['I'] = 19, ['J'] = 20,
          ['K'] = 21, ['L'] = 22, ['M'] = 23, ['N'] = 24, ['O'] = 25,
          ['P'] = 26, ['Q'] = 27, ['R'] = 28, ['S'] = 29, ['T'] = 30,
          ['U'] = 31, ['V'] = 32, ['W'] = 33, ['X'] = 34, ['Y'] = 35,
          ['Z'] = 36,
          ['a'] = 11, ['b'] = 12, ['c'] = 13, ['d'] = 14, ['e'] = 15,
          ['f'] = 16, ['g'] = 17, ['h'] = 18, ['i'] = 19, ['j'] = 20,
          ['k'] = 21, ['l'] = 22, ['m'] = 23, ['n'] = 24, ['o'] = 25,
          ['p'] = 26, ['q'] = 27, ['r'] = 28, ['s'] = 29, ['t'] = 30,
          ['u'] = 31, ['v'] = 32, ['w'] = 33, ['x'] = 34, ['y'] = 35,
          ['z'] = 36,
      };
      return val[(unsigned char) c] - 1;
    }