Search code examples
network-programmingvhdlcrccrc16

How to Calculate CRC Starting at Last Byte


I'm trying to implement a CRC-CCITT calculator in VHDL. I was able to initially do that; however, I recently found out that data is delivered starting at the least-significant byte. In my code, data is transmitted 7 bytes at a time through a frame. So let's say we have the following data: 123456789 in ASCII or 313233343536373839 in hex. The data would be transmitted as such (with the following CRC):

-- First frame of data
RxFrame.Data <= (
    1 => x"39", -- LSB
    2 => x"38",
    3 => x"37",
    4 => x"36",
    5 => x"35",
    6 => x"34",
    7 => x"33"
);

-- Second/last frame of data
RxFrame.Data <= (
    1 => x"32",
    2 => x"31", -- MSB
    3 => xx,    -- "xx" means irrelevant data, not part of CRC calculation.
    4 => xx,    -- This occurs only in the last frame, when it specified in
    5 => xx,    -- byte 0 which bytes contain data
    6 => xx,
    7 => xx
);

-- Calculated CRC should be 0x31C3

Another example with data 0x4376669A1CFC048321313233343536373839 and its correct CRC is shown below:

-- First incoming frame of data
RxFrame.Data <= (
    1 => x"39", -- LSB
    2 => x"38",
    3 => x"37",
    4 => x"36",
    5 => x"35",
    6 => x"34",
    7 => x"33"
);

-- Second incoming frame of data
RxFrame.Data <= (
    1 => x"32",
    2 => x"31",
    3 => x"21",
    4 => x"83",
    5 => x"04",
    6 => x"FC",
    7 => x"1C"
);

-- Third/last incoming frame of data
RxFrame.Data <= (
    1 => x"9A",
    2 => x"66",
    3 => x"76",
    4 => x"43", -- MSB
    5 => xx,    -- Irrelevant data, specified in byte 0
    6 => xx,
    7 => xx
);

-- Calculated CRC should be 0x2848

Is there a concept I'm missing? Is there a way to calculate the CRC with the data being received in reverse order? I am implementing this for CANopen SDO block protocols. Thanks!

CRC calculation algorithm to verify SDO block transfer from CANopen standard


Solution

  • Example code to generate a CRC16 with the bytes read in reverse (last byte first), using a function to do a carryless multiply modulo the CRC polynomial. An explanation follows.

    #include <stdio.h>
    
    typedef unsigned char       uint8_t;
    typedef unsigned short     uint16_t;
    
    #define POLY (0x1021u)
    
    /* carryless multiply modulo crc polynomial */
    uint16_t MpyModPoly(uint16_t a, uint16_t b) /* (a*b)%poly */
    {
    uint16_t pd = 0;
    uint16_t i;
        for(i = 0; i < 16; i++){
            /* assumes twos complement */
            pd = (pd<<1)^((0-(pd>>15))&POLY);
            pd ^= (0-(b>>15))&a;
            b <<= 1;
        }
        return pd;
    }
    
    /* generate crc in reverse byte order */
    uint16_t Crc16R(uint8_t * b, size_t sz)
    {
    uint8_t *e = b + sz;                    /* end of bfr ptr */
    uint16_t crc = 0u;                      /* crc */
    uint16_t pdm = 0x100u;                  /* padding multiplier */
        while(e > b){                       /* generate crc */
            pdm  = MpyModPoly(0x100, pdm);
            crc ^= MpyModPoly( *--e, pdm);
        }
        return(crc);
    }
    
    /*      msg will be processed in reverse order */
    static uint8_t msg[] = {0x43,0x76,0x66,0x9A,0x1C,0xFC,0x04,0x83,
                            0x21,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
                            0x38,0x39};
    
    int main()
    {
    uint16_t crc;
        crc = Crc16R(msg, sizeof(msg));
        printf("%04x\n", crc);
        return 0;
    }
    

    Example code using X86 xmm pclmulqdq and psrlq, to emulate a 16 bit by 16 bit hardware (VHDL) carryless multiply:

    /*      __m128i is an intrinsic for X86 128 bit xmm register */
    static __m128i poly =    {.m128i_u32[0] = 0x00011021u}; /* poly */
    static __m128i invpoly = {.m128i_u32[0] = 0x00008898u}; /* 2^31 / poly */
    
    /* carryless multiply modulo crc polynomial */
    /* using xmm pclmulqdq and psrlq */
    uint16_t MpyModPoly(uint16_t a, uint16_t b)
    {
    __m128i ma, mb, mp, mt;
        ma.m128i_u64[0] = a;
        mb.m128i_u64[0] = b;
        mp = _mm_clmulepi64_si128(ma, mb, 0x00);      /* mp = a*b */
        mt = _mm_srli_epi64(mp, 16);                  /* mt = mp>>16 */
        mt = _mm_clmulepi64_si128(mt, invpoly, 0x00); /* mt = mt*ipoly */
        mt = _mm_srli_epi64(mt, 15);                  /* mt = mt>>15 = (a*b)/poly */ 
        mt = _mm_clmulepi64_si128(mt, poly, 0x00);    /* mt = mt*poly */
        return mp.m128i_u16[0] ^ mt.m128i_u16[0];     /* ret  mp^mt */
    }
    
    /* external code to generate invpoly */
    #define POLY (0x11021u)
    static __m128i invpoly;                 /* 2^31 / poly */
    void GenMPoly(void)                     /* generate __m12i8 invpoly */
    {
    uint32_t N = 0x10000u;                  /* numerator = x^16 */
    uint32_t Q = 0;                         /* quotient = 0 */
        for(size_t i = 0; i <= 15; i++){    /* 31 - 16 = 15 */
            Q <<= 1;
            if(N&0x10000u){
                Q |= 1;
                N ^= POLY;
            }
            N <<= 1;
        }
        invpoly.m128i_u16[0] = Q;
    }
    

    Explanation: consider the data as separate strings of ever increasing length, padded with zeroes at the end. For the first few bytes of your example, the logic would calculate

    CRC  = CRC16({39})
    CRC ^= CRC16({38 00})
    CRC ^= CRC16({37 00 00})
    CRC ^= CRC16({36 00 00 00})
    ...
    

    To speed up this calculation, rather than actually pad with n zero bytes, you can do a carryless multiply of a CRC by 2^{n·8} modulo POLY, where POLY is the 17 bit polynomial used for CRC16:

    CRC  =  CRC16({39})
    CRC ^= (CRC16({38}) · (2^08 % POLY)) % POLY
    CRC ^= (CRC16({37}) · (2^10 % POLY)) % POLY
    CRC ^= (CRC16({36}) · (2^18 % POLY)) % POLY
    ...
    

    A carryless multiply modulo POLY is equivalent to what CRC16 does, so this translates into pseudo code (all values in hex, 2^8 = 100)

    CRC  =    0
    PDM  =  100                  ;padding multiplier
    
    PDM  = (100 · PDM) % POLY    ;main loop (2 lines per byte)
    CRC ^= ( 39 · PDM) % POLY
    PDM  = (100 · PDM) % POLY
    CRC ^= ( 38 · PDM) % POLY
    PDM  = (100 · PDM) % POLY
    CRC ^= ( 37 · PDM) % POLY
    PDM  = (100 · PDM) % POLY
    CRC ^= ( 36 · PDM) % POLY
    ...
    

    Implementing (A · B) % POLY is based on binary math:

    (A · B) % POLY = (A · B) ^ (((A · B) / POLY) · POLY)
    

    Where multiply is carryless (XOR instead of add) and divide is borrowless (XOR instead of subtract). Since the divide is borrowless, and most significant term of POLY is x^16, the quotient

    Q = (A · B) / POLY 
    

    only depends on the upper 16 bits of (A · B). Dividing by POLY uses multiplication by the 16 bit constant IPOLY = (2^31)/POLY followed by a right shift:

    Q = (A · B) / POLY  = (((A · B) >> 16) · IPOLY) >> 15
    

    The process uses a 16 bit by 16 bit carryless multiply, producing a 31 bit product.

    POLY  = 0x11021u                  ; CRC polynomial (17 bit)
    IPOLY = 0x08898u                  ; 2^31 / POLY
                                      ;  generated by external software
    MpyModPoly(A, B)
    {
        MP = A · B                    ; MP = A · B
        MT = MP >> 16                 ; MT = MP >> 16
        MT = MT · IPOLY               ; MT = MT · IPOLY
        MT = MT >> 15                 ; MT = (A · B) / POLY
        MT = MT · POLY                ; MT = ((A · B) / POLY) * POLY
        return MP xor MT              ;      (A·B) ^ (((A · B) / POLY) · POLY)
    }
    

    A hardware based carryless multiply would look something like this 4 bit · 4 bit example.

    p[] = [a3 a2 a1 a0] · [b3 b2 b1 b0]
    
    p[] is a 7 bit product generated with 7 parallel circuits.
    The time for multiply would be worst case propagation time for p3.
    
    p6 = a3&b3
    p5 = a3&b2 ^ a2&b3
    p4 = a3&b1 ^ a2&b2 ^ a1&b3
    p3 = a3&b0 ^ a2&b1 ^ a1&b2 ^ a0&b3 
    p2 = a2&b0 ^ a1&b1 ^ a0&b2
    p1 = a1&b0 ^ a0&b1
    p0 = a0&b0
    
    If the xor gates available only have 2 bit inputs, the logic can
    be split up. For example:
    
    p3 = (a3&b0 ^ a2&b1) ^ (a1&b2 ^ a0&b3)
    

    I don't know if your VHDL toolset includes a library for carryless multiply. For a 16 bit by 16 bit multiply resulting in a 31 bit product (p30 to p00), p15 has 16 outputs from the 16 ands (in parallel), which could be xor'ed using a tree like structure, 8 xors in parallel feeding into 4 xors in parallel feeding into 2 xor's in parallel into a single xor. So the propagation time would be 1 and and 4 xor propagation times.