CRC16 with VHDL (multiple input bytes)

The below VHDL snippet correctly gets me the 16-bit CRC for a single input byte. How would I extend that for multiple input bytes, e.g. a frame that now spans 128 bytes to be crc'd?

NB: function 'crc16' was generated using some online tool, but I derived it by myself too, so I am confident it works OK. Currently, the testbench below supplies the CRC function one byte per call.

CRC characteristics:

CRC polynomial: 0x8005
Input reflected: yes
Output reflected: yes
Seed value: 0xFFFF
XOR-out value: 0xFFFF (IIUC, negate the CRC)

Code:

library ieee; 
use ieee.std_logic_1164.all;

use ieee.numeric_std.all;

entity crc is 
port ( clk: in std_logic;
       data_in: in std_logic_vector(7 downto 0);             
       crc_out: out std_logic_vector(15 downto 0)
      );
end crc;

architecture crc_arch of crc is     

function reverse_vector(v: in std_logic_vector)
return std_logic_vector is
    variable result: std_logic_vector(v'RANGE);
    alias vr: std_logic_vector(v'REVERSE_RANGE) is v;
begin
    for i in vr'RANGE loop
        result(i) := vr(i);
    end loop;

    return result;
end;    


function crc16( data_i: in std_logic_vector(7 downto 0);             
                     crc_i: in std_logic_vector(15 downto 0))
return std_logic_vector is  
    variable crc_o: std_logic_vector(15 downto 0);
begin
    crc_o(15) := crc_i(7) xor crc_i(8) xor crc_i(9) xor crc_i(10) xor crc_i(11) xor crc_i(12) xor crc_i(13) xor crc_i(14) xor crc_i(15) xor 
                    data_i(0) xor data_i(1) xor data_i(2) xor data_i(3) xor data_i(4) xor data_i(5) xor data_i(6) xor data_i(7);          
    crc_o(14) := crc_i(6);      
    crc_o(13) := crc_i(5);
    crc_o(12) := crc_i(4);
    crc_o(11) := crc_i(3);  
    crc_o(10) := crc_i(2);  
    crc_o(9)  := crc_i(1) xor crc_i(15) xor data_i(7);
    crc_o(8)  := crc_i(0) xor crc_i(14) xor crc_i(15) xor data_i(6) xor data_i(7);
    crc_o(7)  := crc_i(13) xor crc_i(14) xor data_i(5) xor data_i(6);
    crc_o(6)  := crc_i(12) xor crc_i(13) xor data_i(4) xor data_i(5);               
    crc_o(5)  := crc_i(11) xor crc_i(12) xor data_i(3) xor data_i(4);
    crc_o(4)  := crc_i(10) xor crc_i(11) xor data_i(2) xor data_i(3);
    crc_o(3)  := crc_i(9) xor crc_i(10) xor data_i(1) xor data_i(2);
    crc_o(2)  := crc_i(8) xor crc_i(9) xor data_i(0) xor data_i(1);
    crc_o(1)  := crc_i(9) xor crc_i(10) xor crc_i(11) xor crc_i(12) xor crc_i(13) xor crc_i(14) xor crc_i(15) xor 
                    data_i(1) xor data_i(2) xor data_i(3) xor data_i(4) xor data_i(5) xor data_i(6) xor data_i(7);
    crc_o(0)  := crc_i(8) xor crc_i(9) xor crc_i(10) xor crc_i(11) xor crc_i(12) xor crc_i(13) xor crc_i(14) xor crc_i(15) xor 
                    data_i(0) xor data_i(1) xor data_i(2) xor data_i(3) xor data_i(4) xor data_i(5) xor data_i(6) xor data_i(7);

    return crc_o;
end;


begin 

    crc_out <= not reverse_vector(crc16(reverse_vector(data_in), x"FFFF"));

end architecture crc_arch;

Testbench:

LIBRARY ieee;
USE ieee.std_logic_1164.ALL;

ENTITY tb_crc IS
END tb_crc;

ARCHITECTURE behavior OF tb_crc IS 

-- Component Declaration for the Unit Under Test (UUT)

COMPONENT crc
PORT(
        clk: std_logic;
     data_in : IN  std_logic_vector(7 downto 0);
     crc_out : OUT  std_logic_vector(15 downto 0)
    );
END COMPONENT;


--Inputs
signal tb_clk : std_logic := '0';
signal tb_data_in : std_logic_vector(7 downto 0) := (others => '0');

--Outputs
signal tb_crc_out : std_logic_vector(15 downto 0);

-- Clock period definitions
constant clk_period : time := 10 ns;

BEGIN

-- Instantiate the Unit Under Test (UUT)
uut: crc PORT MAP (
         clk => tb_clk,
      data_in => tb_data_in,
      crc_out => tb_crc_out
    );

-- Clock process definitions
clk_process :process
begin
    tb_clk <= '1';
    wait for clk_period/2;
    tb_clk <= '0';
    wait for clk_period/2;
end process;

-- Stimulus process
stim_proc: process
begin       
  -- hold reset state for 100 ns.
  wait for 100 ns;  


  -- insert stimulus here

    tb_data_in <= x"01";        
    wait for clk_period;

    tb_data_in <= x"02";
    wait for clk_period;

    tb_data_in <= x"03";
    wait for clk_period;

    tb_data_in <= x"04";
    wait for clk_period;

    wait;
end process;

END;

Thanks for reading, Chris

Solution

The parallel CRC generator software commonly used on various websites is open source. I downloaded and converted the source from C++ to C (declarations for types boolean, bool and values true and false).

The license terms allow modification while retaining the copyright notice. I removed the invalid copyright claim on the output as well as fixing some comment characters in the disclaimer and altered the formatting to suit. (I always intended to also fit the output into 80 columns, lining up terms in columns would also be useful).

It generates code almost identical to yours:

crc-gen vhdl 8 16 8005

------------------------------------------------------------------------------- 
-- THIS SOURCE FILE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS
-- OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
-- WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
-------------------------------------------------------------------------------
-- CRC entity/architecture for
-- data(7:0)
-- crc(15:0) = 1+x^2+x^15+x^16;
--
library ieee;
use ieee.std_logic_1164.all;

entity crc is
    port (
        data_in:  in  std_logic_vector (7 downto 0);
        crc_en:   in  std_logic;
        rst:      in  std_logic;
        clk:      in  std_logic;
        crc_out:  out std_logic_vector (15 downto 0)
    );
end entity crc;

architecture imp_crc of crc is
    signal lfsr_q: std_logic_vector (15 downto 0);
    signal lfsr_c: std_logic_vector (15 downto 0);
begin
    crc_out <= lfsr_q;

    lfsr_c(0) <= lfsr_q(8) xor lfsr_q(9) xor lfsr_q(10) xor lfsr_q(11) xor 
                 lfsr_q(12) xor lfsr_q(13) xor lfsr_q(14) xor lfsr_q(15) xor 
                 data_in(0) xor data_in(1) xor data_in(2) xor data_in(3) xor 
                 data_in(4) xor data_in(5) xor data_in(6) xor data_in(7);
    lfsr_c(1) <= lfsr_q(9) xor lfsr_q(10) xor lfsr_q(11) xor lfsr_q(12) xor 
                 lfsr_q(13) xor lfsr_q(14) xor lfsr_q(15) xor data_in(1) xor 
                 data_in(2) xor data_in(3) xor data_in(4) xor data_in(5) xor 
                 data_in(6) xor data_in(7);
    lfsr_c(2) <= lfsr_q(8) xor lfsr_q(9) xor data_in(0) xor data_in(1);
    lfsr_c(3) <= lfsr_q(9) xor lfsr_q(10) xor data_in(1) xor data_in(2);
    lfsr_c(4) <= lfsr_q(10) xor lfsr_q(11) xor data_in(2) xor data_in(3);
    lfsr_c(5) <= lfsr_q(11) xor lfsr_q(12) xor data_in(3) xor data_in(4);
    lfsr_c(6) <= lfsr_q(12) xor lfsr_q(13) xor data_in(4) xor data_in(5);
    lfsr_c(7) <= lfsr_q(13) xor lfsr_q(14) xor data_in(5) xor data_in(6);
    lfsr_c(8) <= lfsr_q(0) xor lfsr_q(14) xor lfsr_q(15) xor data_in(6) xor 
                 data_in(7);
    lfsr_c(9) <= lfsr_q(1) xor lfsr_q(15) xor data_in(7);
    lfsr_c(10) <= lfsr_q(2);
    lfsr_c(11) <= lfsr_q(3);
    lfsr_c(12) <= lfsr_q(4);
    lfsr_c(13) <= lfsr_q(5);
    lfsr_c(14) <= lfsr_q(6);
    lfsr_c(15) <= lfsr_q(7) xor lfsr_q(8) xor lfsr_q(9) xor lfsr_q(10) xor 
                  lfsr_q(11) xor lfsr_q(12) xor lfsr_q(13) xor lfsr_q(14) xor 
                  lfsr_q(15) xor data_in(0) xor data_in(1) xor data_in(2) xor 
                  data_in(3) xor data_in(4) xor data_in(5) xor data_in(6) xor 
                  data_in(7);

REGISTERS:
    process (clk, rst)
    begin
        if rst = '1' then
            lfsr_q   <= (others => '1');
        elsif rising_edge(clk) then
            if crc_en = '1' then
                lfsr_q <= lfsr_c;
            end if;
        end if;
    end process;
end architecture imp_crc;

The interesting feature is the process, which uses a clocked register to keep a running tally of the CRC as well as to provide a seed through a reset (the (others => '1'), equivalent to x"FFFF" based on the length of lfsr_q).

You can use the reset to set the state to begin accumulating successive bytes of CRC along with crc_en to control which clocks input bytes are included in the CRC.

You could use a multiplexer instead to select between either x"FFFF" or the stored CRC so there is no 'downtime' between blocks being evaluated, that serializes the delay to include the multiplexer.

I'd imagine an enable would likely be essential in either case. You could get by with adding either one or two more signals to your port interface and testbench.

So a handcrafted testbench to use the generated CRC code:

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity crc_tb is
end entity;

architecture  foo of crc_tb is

    function reverse_vector(v: in std_logic_vector)
    return std_logic_vector is
        variable result: std_logic_vector(v'RANGE);
        alias vr: std_logic_vector(v'REVERSE_RANGE) is v;
    begin
        for i in vr'RANGE loop
            result(i) := vr(i);
        end loop;

        return result;
    end;    

    signal datain:   std_logic_vector (7 downto 0);
    signal data_in:  std_logic_vector (7 downto 0);
    signal crc_en:   std_logic := '0';
    signal rst:      std_logic;
    signal clk:      std_logic := '0';
    signal crc_out:  std_logic_vector (15 downto 0);

    signal crcout:   std_logic_vector (15 downto 0);
begin

    crcout <= not reverse_vector (crc_out);

DUT:
    entity work.crc
        port map (
            data_in => data_in,
            crc_en => crc_en,
            rst => rst,
            clk => clk,
            crc_out => crc_out
        );

CLOCK:
    process 
    begin
        wait for 5 ns; -- half the clock period
        clk <= not clk;
        if now > 160 ns then
            wait;
        end if;
    end process;
    STIMULI:
    process
    begin
        rst <= '1';
        for i in 0 to 9 loop
            wait until rising_edge(clk);
        end loop;
        rst <= '0';
        crc_en <= '1';
        for i in 1 to 4 loop
            datain <= std_logic_vector(to_unsigned (i,8));
            data_in <= reverse_vector (std_logic_vector(to_unsigned(i,8)));
            wait until rising_edge(clk);
        end loop;
        crc_en <= '0';
        wait until rising_edge(clk);
        wait;
    end process;
end architecture;

Which gives us:

From you're comment under your question this is the correct value for four successive bytes of x"01", x"02", x"03" and x"04", the value x"D45E".

So let's apply that to your code

First the changes:

library ieee; 
use ieee.std_logic_1164.all;

use ieee.numeric_std.all;

entity crc is 
port ( clk: in std_logic;
       data_in: in std_logic_vector(7 downto 0); 
       crc_en:   in  std_logic;   -- ADDED
       rst:      in  std_logic;   -- ADDED
       crc_out: out std_logic_vector(15 downto 0)
      );
end crc;

architecture crc_arch of crc is     

function reverse_vector(v: in std_logic_vector)
return std_logic_vector is
    variable result: std_logic_vector(v'RANGE);
    alias vr: std_logic_vector(v'REVERSE_RANGE) is v;
begin
    for i in vr'RANGE loop
        result(i) := vr(i);
    end loop;

    return result;
end;    


function crc16( data_i: in std_logic_vector(7 downto 0);             
                     crc_i: in std_logic_vector(15 downto 0))
return std_logic_vector is  
    variable crc_o: std_logic_vector(15 downto 0);
begin
    crc_o(15) := crc_i(7) xor crc_i(8) xor crc_i(9) xor crc_i(10) xor crc_i(11) xor crc_i(12) xor crc_i(13) xor crc_i(14) xor crc_i(15) xor 
                    data_i(0) xor data_i(1) xor data_i(2) xor data_i(3) xor data_i(4) xor data_i(5) xor data_i(6) xor data_i(7);          
    crc_o(14) := crc_i(6);      
    crc_o(13) := crc_i(5);
    crc_o(12) := crc_i(4);
    crc_o(11) := crc_i(3);  
    crc_o(10) := crc_i(2);  
    crc_o(9)  := crc_i(1) xor crc_i(15) xor data_i(7);
    crc_o(8)  := crc_i(0) xor crc_i(14) xor crc_i(15) xor data_i(6) xor data_i(7);
    crc_o(7)  := crc_i(13) xor crc_i(14) xor data_i(5) xor data_i(6);
    crc_o(6)  := crc_i(12) xor crc_i(13) xor data_i(4) xor data_i(5);               
    crc_o(5)  := crc_i(11) xor crc_i(12) xor data_i(3) xor data_i(4);
    crc_o(4)  := crc_i(10) xor crc_i(11) xor data_i(2) xor data_i(3);
    crc_o(3)  := crc_i(9) xor crc_i(10) xor data_i(1) xor data_i(2);
    crc_o(2)  := crc_i(8) xor crc_i(9) xor data_i(0) xor data_i(1);
    crc_o(1)  := crc_i(9) xor crc_i(10) xor crc_i(11) xor crc_i(12) xor crc_i(13) xor crc_i(14) xor crc_i(15) xor 
                    data_i(1) xor data_i(2) xor data_i(3) xor data_i(4) xor data_i(5) xor data_i(6) xor data_i(7);
    crc_o(0)  := crc_i(8) xor crc_i(9) xor crc_i(10) xor crc_i(11) xor crc_i(12) xor crc_i(13) xor crc_i(14) xor crc_i(15) xor 
                    data_i(0) xor data_i(1) xor data_i(2) xor data_i(3) xor data_i(4) xor data_i(5) xor data_i(6) xor data_i(7);

    return crc_o;

end;

    signal crc_o:   std_logic_vector (15 downto 0);  -- ADDED register

begin 

    -- crc_out <= not reverse_vector(crc16(reverse_vector(data_in), x"FFFF"));

    process (clk)  -- ADDED process
    begin
        if rst = '1' then
            crc_o <= x"FFFF";
        elsif rising_edge(clk) then  
            if crc_en = '1' then
                crc_o <= crc16(reverse_vector(data_in), crc_o);
            end if;
        end if;
    end process;

    crc_out <= not reverse_vector(crc_o);  -- ADDED

end architecture crc_arch;

Added controls rst and crc_en to the entity port, added a declaration for the signal holding the register CRC value and broke up the reversing and inversion so it's not in the path of the crc_i for the crc16 function call.

The input to the register is the return value of the crc16 function call. The register is reset to the the CRC seed value.

The testbench got simpler:

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity crc_tb is
end entity;

architecture  foo of crc_tb is

    signal data_in:  std_logic_vector (7 downto 0);
    signal crc_en:   std_logic := '0';
    signal rst:      std_logic;
    signal clk:      std_logic := '0';
    signal crc_out:  std_logic_vector (15 downto 0);

begin

DUT:
    entity work.crc
        port map (
            data_in => data_in,
            crc_en => crc_en,
            rst => rst,
            clk => clk,
            crc_out => crc_out
        );

CLOCK:
    process 
    begin
        wait for 5 ns; -- half the clock period
        clk <= not clk;
        if now > 160 ns then
            wait;
        end if;
    end process;
STIMULI:
    process
    begin
        rst <= '1';
        for i in 0 to 9 loop
            wait until rising_edge(clk);
        end loop;
        rst <= '0';
        crc_en <= '1';
        for i in 1 to 4 loop
            data_in <= std_logic_vector(to_unsigned (i,8));
            wait until rising_edge(clk);
        end loop;
        crc_en <= '0';
        wait until rising_edge(clk);
        wait;
    end process;
end architecture;

All the changes were subtractive.

And that gives:

The same answer as using the downloaded/generated VHDL code.

So the secret to using you crc16 function call is to not do any inversion or reversing from it's return value to the crc_i argument to the crc16 function call.