I created a Verilog module for my project and am getting the expected output, but it doesn't have a testbench; it takes the required input data by reading .txt files using $readmemh.
But a module without input or output ports can't be synthesized.
Need to synthesize it using cadence to get area, power reports (Not dumping it on FPGA). If required another top module which instantiates this module can also be used.
Need help to make this code synthesizable (Planar and DC outputs are required to be displayed -
Input : clk
Outputs : Y_Planar_output, Cb_Planar_output, Cr_Planar_output, Y_DC_output_corner, Y_DC_output_left, Y_DC_output_top.
Any extra input/output ports if required can be used to make it synthesizable)
// hevc_v6: simulation-only model of intra Planar and DC prediction for one
// 16x16 luma (Y) block and two 8x8 chroma (Cb, Cr) blocks.
//
// The module has no ports. Pixel and reference samples are loaded from three
// hex text files with $readmemh, the clock is generated internally, and the
// prediction results are held in internal registers that must be observed
// from a simulator (waveform / hierarchical reference).
//
// Reference-buffer usage, as established by the equations below:
//   [1], [2]        : the two fixed samples weighted by (col+1) and (row+1)
//   [3 .. 3+N-1]    : samples indexed by column in the vertical interpolation
//   [3+N .. 3+2N-1] : samples indexed by row in the horizontal interpolation
// where N = 16 for luma and N = 8 for chroma. Entry [0] is loaded but not read.
module hevc_v6;

  // Registers to store file contents and reference buffer for Y, Cb and Cr
  reg [7:0] Y_data        [0:291];
  reg [7:0] Y_ref_buffer  [0:34];
  reg [7:0] Cb_data       [0:82];
  reg [7:0] Cb_ref_buffer [0:18];
  reg [7:0] Cr_data       [0:82];
  reg [7:0] Cr_ref_buffer [0:18];

  // Registers for the pixel matrices
  reg [7:0] Y_luma_matrix    [0:15][0:15];
  reg [7:0] Cb_chroma_matrix [0:7][0:7];
  reg [7:0] Cr_chroma_matrix [0:7][0:7];

  // Registers for Planar and DC prediction outputs for Y, Cb and Cr
  reg [11:0] Y_Planar_output  [0:15][0:15];
  reg [11:0] Y_DC_output_corner;
  reg [11:0] Y_DC_output_left [1:15];
  reg [11:0] Y_DC_output_top  [1:15];
  reg [11:0] Cb_Planar_output [0:7][0:7];
  reg [11:0] Cr_Planar_output [0:7][0:7];

  // Intermediate horizontal / vertical interpolation terms for Planar prediction
  // Luma Y
  reg [11:0] Y_pred_h [0:15][0:15];
  reg [11:0] Y_pred_v [0:15][0:15];
  // Chroma Cb
  reg [11:0] Cb_pred_h [0:7][0:7];
  reg [11:0] Cb_pred_v [0:7][0:7];
  // Chroma Cr
  reg [11:0] Cr_pred_h [0:7][0:7];
  reg [11:0] Cr_pred_v [0:7][0:7];

  // Average of the 32 luma reference samples used by DC prediction
  reg [12:0] Y_dc_avg;

  // Clock generation (period = 10 time units)
  reg clk;
  initial clk = 0;
  always #5 clk = ~clk;

  // Loop indices. Each procedural block uses its own set so that blocks
  // triggered on the same clock edge never share a loop variable.
  integer x, y, z;           // pixel-matrix fill
  integer a, b, c;           // reference-buffer fill
  integer e, f, g, h, i, j;  // planar prediction
  integer m, n;              // DC prediction

  // Load the files and unpack them in ONE initial block. The relative
  // execution order of separate initial blocks is not defined, so keeping
  // $readmemh and the copies together guarantees the copies see loaded data.
  initial begin
    // Reading pixel values from text file
    $readmemh("Luma_Y_data_16x16.txt", Y_data);
    $readmemh("Chroma_Cb_data_8x8.txt", Cb_data);
    $readmemh("Chroma_Cr_data_8x8.txt", Cr_data);

    // Storing pixel values into matrix (row-major: row = index / N, col = index % N)
    // Luma Y
    for (x = 0; x < 256; x = x + 1)
      Y_luma_matrix[x/16][x%16] = Y_data[x];
    // Chroma Cb
    for (y = 0; y < 64; y = y + 1)
      Cb_chroma_matrix[y/8][y%8] = Cb_data[y];
    // Chroma Cr
    for (z = 0; z < 64; z = z + 1)
      Cr_chroma_matrix[z/8][z%8] = Cr_data[z];

    // Ref buffer initialization: the reference samples follow the pixel data
    // in each file.
    // Luma Y
    for (a = 0; a < 35; a = a + 1)
      //Y_ref_buffer [a] = 'h80;
      Y_ref_buffer[a] = Y_data[256+a];
    // Chroma Cb
    for (b = 0; b < 19; b = b + 1)
      //Cb_ref_buffer [b] = 'h80;
      Cb_ref_buffer[b] = Cb_data[64+b];
    // Chroma Cr
    for (c = 0; c < 19; c = c + 1)
      //Cr_ref_buffer [c] = 'h80;
      Cr_ref_buffer[c] = Cr_data[64+c];
  end

  // Planar prediction, Luma Y (16x16).
  // Blocking assignments are intentional: the pred_h / pred_v terms computed
  // in an iteration are consumed by the output expression in the same iteration.
  always @(posedge clk) begin
    for (f = 0; f < 16; f = f + 1) begin
      for (e = 0; e < 16; e = e + 1) begin
        Y_pred_h[f][e] = (16-1-e) * Y_ref_buffer[19+f] + (e+1) * Y_ref_buffer[1];
        Y_pred_v[f][e] = (16-1-f) * Y_ref_buffer[3+e] + (f+1) * Y_ref_buffer[2];
        // Rounded average of the two terms: (h + v + 16) / 32
        Y_Planar_output[f][e] = (Y_pred_h[f][e] + Y_pred_v[f][e] + 16) >> (4+1);
      end
    end
  end

  // Planar prediction, Chroma Cb (8x8)
  always @(posedge clk) begin
    for (h = 0; h < 8; h = h + 1) begin
      for (g = 0; g < 8; g = g + 1) begin
        Cb_pred_h[h][g] = (8-1-g) * Cb_ref_buffer[11+h] + (g+1) * Cb_ref_buffer[1];
        Cb_pred_v[h][g] = (8-1-h) * Cb_ref_buffer[3+g] + (h+1) * Cb_ref_buffer[2];
        // Rounded average of the two terms: (h + v + 8) / 16
        Cb_Planar_output[h][g] = (Cb_pred_h[h][g] + Cb_pred_v[h][g] + 8) >> (3+1);
      end
    end
  end

  // Planar prediction, Chroma Cr (8x8)
  always @(posedge clk) begin
    for (j = 0; j < 8; j = j + 1) begin
      for (i = 0; i < 8; i = i + 1) begin
        Cr_pred_h[j][i] = (8-1-i) * Cr_ref_buffer[11+j] + (i+1) * Cr_ref_buffer[1];
        Cr_pred_v[j][i] = (8-1-j) * Cr_ref_buffer[3+i] + (j+1) * Cr_ref_buffer[2];
        // Rounded average of the two terms: (h + v + 8) / 16
        Cr_Planar_output[j][i] = (Cr_pred_h[j][i] + Cr_pred_v[j][i] + 8) >> (3+1);
      end
    end
  end

  // DC prediction, Luma Y only.
  // Y_dc_avg is assigned with a blocking assignment so the filtered outputs
  // below use the value computed on this same clock edge.
  always @(posedge clk) begin
    // Sum of reference samples [3..34] (32 samples) divided by 32
    Y_dc_avg = (Y_ref_buffer[3]  + Y_ref_buffer[4]  + Y_ref_buffer[5]  + Y_ref_buffer[6]
              + Y_ref_buffer[7]  + Y_ref_buffer[8]  + Y_ref_buffer[9]  + Y_ref_buffer[10]
              + Y_ref_buffer[11] + Y_ref_buffer[12] + Y_ref_buffer[13] + Y_ref_buffer[14]
              + Y_ref_buffer[15] + Y_ref_buffer[16] + Y_ref_buffer[17] + Y_ref_buffer[18]
              + Y_ref_buffer[19] + Y_ref_buffer[20] + Y_ref_buffer[21] + Y_ref_buffer[22]
              + Y_ref_buffer[23] + Y_ref_buffer[24] + Y_ref_buffer[25] + Y_ref_buffer[26]
              + Y_ref_buffer[27] + Y_ref_buffer[28] + Y_ref_buffer[29] + Y_ref_buffer[30]
              + Y_ref_buffer[31] + Y_ref_buffer[32] + Y_ref_buffer[33] + Y_ref_buffer[34]) >> (4+1);

    // Corner sample: 1-2-1 weighting of the two adjacent references and the average
    Y_DC_output_corner = (Y_ref_buffer[19] + 2 * Y_dc_avg + Y_ref_buffer[3] + 2) >> 2;

    // Edge samples: 1-3 weighting of the adjacent reference and the average.
    // NOTE(review): "left" reads indices [3+m], which the planar equations use
    // as the per-column samples, and "top" reads [19+n], used there as the
    // per-row samples. Confirm the left/top naming matches the intended layout.
    for (m = 1; m < 16; m = m + 1)
      Y_DC_output_left[m] = (Y_ref_buffer[3+m] + 3 * Y_dc_avg + 2) >> 2;
    for (n = 1; n < 16; n = n + 1)
      Y_DC_output_top[n] = (Y_ref_buffer[19+n] + 3 * Y_dc_avg + 2) >> 2;
  end

endmodule
Help is appreciated!
These are the 3 text files, you can copy these data into notepad (notepad++ would be better i feel)
99 9C 99 96 9C 9A 99 9A 9B 97 99 98 98 98 96 98 9A 9B 9D 9D 9A 9C 99 9A 99 97 9B 98 98 98 9B 9B 9D 9B 9C 9B 9B 9E 97 9A 9A 99 99 9A 9A 98 99 9A 9F 9E 9A 99 9A 9C 9B 9B 9C 9C 9A 9A 99 9A 99 98 9B 9B 9A 99 9B 9E 9D 9E 9C 9B 9C 9A 9B 9A 9B 98 9F 9D 9B 9C 9C 9F 9D 9E 9B 9A 9A 9B 9E 9C 9A 99 A1 A0 9D A0 9D 9D 9F 9D 9B 9F 9A 9B 9D 9E 9B 9A 9F A0 A1 9C 9E 9E 9A 9F 9D 9F 9E 9F 9D 9C A0 9C 9F A1 A0 9C 9F 9F 9E 9D 9F A1 9F 9E 9D 9C 9F 9D A3 A1 A2 A1 9E A0 A1 9D 9F 9C A0 9E 9C 9E 9F 9F A0 9F A2 9F A3 9E 9F 9F A0 9F A0 A0 A2 A1 A0 9E A2 A2 A5 A4 A1 9F 9F A0 A1 9F 9F A1 A3 A0 9F A1 A5 A5 A2 A0 A3 A2 9F A0 A4 A2 A1 9E 9F A3 9B 9E A6 A2 A3 A1 A2 A2 A3 A1 A1 A5 A3 A2 A3 A1 9A A1 A5 A2 A2 A3 A1 A5 A3 A2 A2 A3 A2 A3 A3 A0 A0 9F A5 A7 A7 A2 A1 A5 A3 A3 A3 9F A4 A1 A2 A1 A0 9F
98 98 A4 99 96 97 9A 98 9C 9A 97 9A 99 99 9A 98 99 9A 99 9A 99 9D 9D 9B 9E 9F 9E 9F A1 A1 9F A3 A2 A5 A2
84 84 85 85 85 86 86 86 84 84 85 85 85 86 86 86 84 84 84 85 85 85 85 85 83 83 84 84 84 84 83 84 84 84 83 84 84 83 83 83 82 82 83 83 84 83 83 83 81 81 82 83 83 82 82 82 81 81 82 83 82 82 82 82
86 88 80 86 86 86 87 88 86 85 86 83 83 83 83 83 83 82 80
68 68 69 69 69 68 68 68 68 68 69 69 69 68 68 68 68 68 68 69 69 68 68 68 68 67 68 6A 69 68 68 68 6A 6A 6A 69 69 68 68 68 6A 6A 6A 6A 6A 6A 69 6A 6A 6A 6A 6A 6B 6A 6A 6A 6A 6A 6A 6A 6A 6A 6A 6A
68 66 6C 69 69 68 68 68 67 66 67 68 68 69 69 69 6A 6A 6A
Here is a list of synthesis concerns for the OP and how to address them:
The large number of bits created by adding module ports may be an issue. The number of module port bits is greater than the number of IO pins that any physical FPGA has. Synthesis targets a specific physical device that has finite physical resources. Very large FPGAs have several hundred IO pins (I have never seen one advertised with 1000 user IO pins; if one exists it would be in the 0.1 percentile of all devices on the market). If this line of code becomes a port, [11:0] Y_Planar_output [0:15][0:15], synthesis will attempt to allocate 12X16X16 = 3072 FPGA pins to connect to the port. Since no physical FPGA has that many IO pins, synthesis will fail. The design in the post would have several very large ports like this. There is no chance the number of ports needed will fit in a physical FPGA (in other words, synthesis will fail).
Vivado synthesis has a build option called out_of_context, which tells the tool that the module being synthesized is an internal module and not to infer IO pins.
I did an internet search and did not find such an option in Cadence synthesis. You will need to research this.
If Cadence does not have such an option then the code must be modified such that the memories are accessed one address at a time. The corresponding port size would be 12-bits rather than thousands of bits. The re-write could involve state machine controllers to load and unload the memories.
Change internal variables (the ones listed in the OP that need to change) to ports like this (be sure to remove the local corresponding variable):
module hevc_v6
input clk,
output reg [11:0] Y_Planar_output [0:15][0:15],
output reg [11:0] Cb_Planar_output [0:7][0:7],
// more ports
Get rid of the internal driver on the clock always #5 clk = ~clk
Change the initial blocks that contain for loops to synchronous always blocks like this:
always @(posedge clk)
for (x=0; x<256; x=x+1)
Y_luma_matrix [x/16][x%16] = Y_data[x]
There are a couple of initial blocks whose output goes nowhere. Decide what to do with these (bring them out the ports?, get rid of them, change to synchronous always blocks?). For example:
for (z=0; z<64; z=z+1)
Cr_chroma_matrix [z/8][z%8] = Cr_data[z]
In this line and similar Cr_chroma_matrix [z/8][z%8] = Cr_data[z]
The / operator and the % operators will not synthesize to the math functions as expected.
In RTL, those operators can only be relied on when they are applied to parameters and other values that are known, and do not change, at elaboration time.
For the / 8 (divide by 8) operator, shift right by 3 places.
For the % 8 (mod 8) operator, keep only the low three bits (i.e. drop all the bits above them). This trick works for mod by powers of 2.
The $readmemh()
If you are using a tool which does not support $readmemh(), then change the $readmemh() + file combination to an RTL ROM using a case statement. Search 'Verilog ROM' for more.
These blocks of code may infer more multipliers than the physical device has. I see at least 16X16X2 = 512 multipliers in the luma block alone (the 8X8 chroma block shown below has the same structure). You may need to re-write it to do one multiply (or a reasonable number relative to the number of multipliers the device has) at a time.
// Chroma Cr
always @(posedge clk)
for (j=0; j<8; j=j+1) begin
for (i=0; i<8; i=i+1) begin
// infers a lot of multiplies
Keep in mind that synthesis targets a physical device with finite resources; write the RTL to model within the limits of those resources.
If these concerns are addressed, then the design should synthesize.
Write a testbench if you want to understand how the changed design works. The timing will be different for initial blocks with for loops replaced by synchronous processes.