Need help converting a Verilog module without input and output ports into a synthesizable one, because a module without input/output ports is not synthesizable

I created a Verilog module for my project and am getting the expected output (but it doesn't have a testbench; it takes the required input data by reading a .txt file using $readmemh)

But a module without input or output ports can't be synthesized.

Need to synthesize it using cadence to get area, power reports (Not dumping it on FPGA). If required another top module which instantiates this module can also be used.

Need help to make this code synthesizable (Planar and DC outputs are required to be displayed -

Input : clk

Outputs : Y_Planar_output, Cb_Planar_output, Cr_Planar_output, Y_DC_output_corner, Y_DC_output_left, Y_DC_output_top.

Any extra input/output ports if required can be used to make it synthesizable)

Code

// hevc_v6
//
// Simulation-only model that computes planar predictions for a 16x16 luma
// block and two 8x8 chroma blocks, plus DC-prediction edge values for the
// luma block. The module has no ports: pixel and reference data are loaded
// from text files with $readmemh, and the clock is generated internally.
// Results are left in the *_Planar_output / Y_DC_output_* registers and are
// recomputed on every rising clock edge.
module hevc_v6;

//Registers to store file contents and reference buffer for Y, Cb and Cr
//Entries 0..255 of Y_data are copied into the 16x16 matrix and entries
//256..290 into the 35-entry reference buffer (see the initial block below).
reg [7:0] Y_data[0:291];
reg [7:0] Y_ref_buffer[0:34];

//Entries 0..63 of Cb_data/Cr_data are copied into the 8x8 matrices and
//entries 64..82 into the 19-entry reference buffers.
reg [7:0] Cb_data[0:82];
reg [7:0] Cb_ref_buffer[0:18];

reg [7:0] Cr_data[0:82];
reg [7:0] Cr_ref_buffer[0:18];

//Registers for Matrix
//NOTE: these matrices are written during initialization but are not read
//anywhere else inside this module.
reg [7:0] Y_luma_matrix [0:15][0:15];
reg [7:0] Cb_chroma_matrix [0:7][0:7];
reg [7:0] Cr_chroma_matrix [0:7][0:7];

//Registers for Planar and DC prediction outputs for Y, Cb and Cr
reg [11:0] Y_Planar_output [0:15][0:15];
reg [11:0] Y_DC_output_corner;
reg [11:0] Y_DC_output_left [1:15];
reg [11:0] Y_DC_output_top [1:15];

reg [11:0] Cb_Planar_output [0:7][0:7];

reg [11:0] Cr_Planar_output [0:7][0:7];

//Extra registers for Planar prediction
//Each term is a weighted sum of two 8-bit samples whose weights add up to
//the block size (16 or 8), so the largest value is 16*255 = 4080 and fits
//in 12 bits.
//Luma Y
reg [11:0] Y_pred_h [0:15][0:15];
reg [11:0] Y_pred_v [0:15][0:15];

//Chroma Cb
reg [11:0] Cb_pred_h [0:7][0:7];
reg [11:0] Cb_pred_v [0:7][0:7];

//Chroma Cr
reg [11:0] Cr_pred_h [0:7][0:7];
reg [11:0] Cr_pred_v [0:7][0:7];

//Extra registers for DC prediction
//13 bits: the sum of 32 8-bit samples is at most 32*255 = 8160.
reg [12:0] Y_dc_avg;

//Clock initialization (free-running, period 10 time units, first rising
//edge at time 5, i.e. after the time-0 initialization below has finished)
reg clk;
initial clk = 0;
always #5 clk = ~clk;

//Loop indices for initialization
integer x, y, z;
integer a, b, c;

//Reading pixel values from text file and distributing them.
//The file reads and the copies are kept in ONE initial block on purpose:
//separate initial blocks all start at time 0 in an order the language does
//not define, so a copy placed in its own initial block could run before
//$readmemh has filled the source array and capture X values.
initial
begin
    $readmemh("Luma_Y_data_16x16.txt", Y_data);
    $readmemh("Chroma_Cb_data_8x8.txt", Cb_data);
    $readmemh("Chroma_Cr_data_8x8.txt", Cr_data);

    //Storing pixel values into matrix (row-major: row = index / width,
    //column = index % width)
    //Luma Y
    for (x=0; x<256; x=x+1)
        Y_luma_matrix [x/16][x%16] = Y_data[x];

    //Chroma Cb
    for (y=0; y<64; y=y+1)
        Cb_chroma_matrix [y/8][y%8] = Cb_data[y];

    //Chroma Cr
    for (z=0; z<64; z=z+1)
        Cr_chroma_matrix [z/8][z%8] = Cr_data[z];

    //Ref buffer initialization: the reference samples follow the block
    //pixels in each data file.
    //Luma Y
    for (a=0; a<35; a=a+1)
        //Y_ref_buffer [a] = 'h80;
        Y_ref_buffer [a] = Y_data[256+a];

    //Chroma Cb
    for (b=0; b<19; b=b+1)
        //Cb_ref_buffer [b] = 'h80;
        Cb_ref_buffer [b] = Cb_data[64+b];

    //Chroma Cr
    for (c=0; c<19; c=c+1)
        //Cr_ref_buffer [c] = 'h80;
        Cr_ref_buffer [c] = Cr_data[64+c];
end

//PLANAR PREDICTION
integer e,f,g,h,i,j;

//Luma Y
//For row f and column e:
//  pred_h = (15-e)*ref[19+f] + (e+1)*ref[1]
//  pred_v = (15-f)*ref[3+e]  + (f+1)*ref[2]
//  output = (pred_h + pred_v + 16) >> 5
//Blocking assignments are intentional: the output on the third line uses
//the pred_h/pred_v values computed on the same clock edge.
always @(posedge clk)
begin
    for (f=0; f<16; f=f+1)  
    begin
        for (e=0; e<16; e=e+1) 
        begin
            Y_pred_h [f][e] = (16-1-e) * Y_ref_buffer[19+f] + (e+1) * Y_ref_buffer[1];
            Y_pred_v [f][e] = (16-1-f) * Y_ref_buffer[3+e] + (f+1) * Y_ref_buffer[2];
            Y_Planar_output [f][e] = (Y_pred_h [f][e] + Y_pred_v [f][e] + 16 ) >> (4+1);                
        end
    end
end

//Chroma Cb
//Same formula as luma with block size 8: row samples start at ref[11],
//column samples at ref[3], rounding offset 8, shift by 4.
always @(posedge clk)
begin
    for (h=0; h<8; h=h+1)   
    begin
        for (g=0; g<8; g=g+1) 
        begin
            Cb_pred_h [h][g] = (8-1-g) * Cb_ref_buffer[11+h] + (g+1) * Cb_ref_buffer[1];
            Cb_pred_v [h][g] = (8-1-h) * Cb_ref_buffer[3+g] + (h+1) * Cb_ref_buffer[2];
            Cb_Planar_output [h][g] = (Cb_pred_h [h][g] + Cb_pred_v [h][g] + 8 ) >> (3+1);              
        end
    end
end

//Chroma Cr
//Identical structure to the Cb block, operating on the Cr reference buffer.
always @(posedge clk)
begin
    for (j=0; j<8; j=j+1)   
    begin
        for (i=0; i<8; i=i+1) 
        begin
            Cr_pred_h [j][i] = (8-1-i) * Cr_ref_buffer[11+j] + (i+1) * Cr_ref_buffer[1];
            Cr_pred_v [j][i] = (8-1-j) * Cr_ref_buffer[3+i] + (j+1) * Cr_ref_buffer[2];
            Cr_Planar_output [j][i] = (Cr_pred_h [j][i] + Cr_pred_v [j][i] + 8 ) >> (3+1);              
        end
    end
end

//DC PREDICTION
integer m,n;

//For only Luma Y
//Y_dc_avg is the sum of reference samples 3..34 (32 samples) shifted right
//by 5, i.e. a truncating average.
//NOTE(review): the HEVC DC rule appears to add a rounding offset equal to
//the block size (16) before the shift; this code does not. Left unchanged
//because the existing results are reported as expected - confirm.
//NOTE(review): Y_DC_output_left reads ref[3+m], the range the planar block
//indexes by column, and Y_DC_output_top reads ref[19+n], the range the
//planar block indexes by row; the left/top names may be swapped - confirm.
always @(posedge clk)
begin
    Y_dc_avg = (Y_ref_buffer[3] + Y_ref_buffer[4] + Y_ref_buffer[5] + Y_ref_buffer[6] + Y_ref_buffer[7] + Y_ref_buffer[8] + Y_ref_buffer[9] + Y_ref_buffer[10] + Y_ref_buffer[11] + Y_ref_buffer[12] + Y_ref_buffer[13] + Y_ref_buffer[14] + Y_ref_buffer[15] + Y_ref_buffer[16] + Y_ref_buffer[17] + Y_ref_buffer[18] + Y_ref_buffer[19] + Y_ref_buffer[20] + Y_ref_buffer[21] + Y_ref_buffer[22] + Y_ref_buffer[23] + Y_ref_buffer[24] + Y_ref_buffer[25] + Y_ref_buffer[26] + Y_ref_buffer[27] + Y_ref_buffer[28] + Y_ref_buffer[29] + Y_ref_buffer[30] + Y_ref_buffer[31] + Y_ref_buffer[32] + Y_ref_buffer[33] + Y_ref_buffer[34]) >> (4+1);
    
    //Corner sample: (ref[19] + 2*avg + ref[3] + 2) >> 2
    Y_DC_output_corner = (Y_ref_buffer [19] + 2 * Y_dc_avg + Y_ref_buffer[3] + 2) >> 2;
    
    //Edge samples 1..15: (neighbouring ref + 3*avg + 2) >> 2
    for (m=1; m<16; m=m+1)
        Y_DC_output_left [m] = (Y_ref_buffer[3+m] + 3 * Y_dc_avg + 2) >> 2;
    for (n=1; n<16; n=n+1)
        Y_DC_output_top [n] = (Y_ref_buffer[19+n] + 3 * Y_dc_avg + 2) >> 2;
end     

endmodule

Help is appreciated!

These are the 3 text files, you can copy these data into notepad (notepad++ would be better i feel)

Luma_Y_data_16x16.txt

99 9C 99 96 9C 9A 99 9A 9B 97 99 98 98 98 96 98 9A 9B 9D 9D 9A 9C 99 9A 99 97 9B 98 98 98 9B 9B 9D 9B 9C 9B 9B 9E 97 9A 9A 99 99 9A 9A 98 99 9A 9F 9E 9A 99 9A 9C 9B 9B 9C 9C 9A 9A 99 9A 99 98 9B 9B 9A 99 9B 9E 9D 9E 9C 9B 9C 9A 9B 9A 9B 98 9F 9D 9B 9C 9C 9F 9D 9E 9B 9A 9A 9B 9E 9C 9A 99 A1 A0 9D A0 9D 9D 9F 9D 9B 9F 9A 9B 9D 9E 9B 9A 9F A0 A1 9C 9E 9E 9A 9F 9D 9F 9E 9F 9D 9C A0 9C 9F A1 A0 9C 9F 9F 9E 9D 9F A1 9F 9E 9D 9C 9F 9D A3 A1 A2 A1 9E A0 A1 9D 9F 9C A0 9E 9C 9E 9F 9F A0 9F A2 9F A3 9E 9F 9F A0 9F A0 A0 A2 A1 A0 9E A2 A2 A5 A4 A1 9F 9F A0 A1 9F 9F A1 A3 A0 9F A1 A5 A5 A2 A0 A3 A2 9F A0 A4 A2 A1 9E 9F A3 9B 9E A6 A2 A3 A1 A2 A2 A3 A1 A1 A5 A3 A2 A3 A1 9A A1 A5 A2 A2 A3 A1 A5 A3 A2 A2 A3 A2 A3 A3 A0 A0 9F A5 A7 A7 A2 A1 A5 A3 A3 A3 9F A4 A1 A2 A1 A0 9F

98 98 A4 99 96 97 9A 98 9C 9A 97 9A 99 99 9A 98 99 9A 99 9A 99 9D 9D 9B 9E 9F 9E 9F A1 A1 9F A3 A2 A5 A2

Chroma_Cb_data_8x8.txt

84 84 85 85 85 86 86 86 84 84 85 85 85 86 86 86 84 84 84 85 85 85 85 85 83 83 84 84 84 84 83 84 84 84 83 84 84 83 83 83 82 82 83 83 84 83 83 83 81 81 82 83 83 82 82 82 81 81 82 83 82 82 82 82

86 88 80 86 86 86 87 88 86 85 86 83 83 83 83 83 83 82 80

Chroma_Cr_data_8x8.txt

68 68 69 69 69 68 68 68 68 68 69 69 69 68 68 68 68 68 68 69 69 68 68 68 68 67 68 6A 69 68 68 68 6A 6A 6A 69 69 68 68 68 6A 6A 6A 6A 6A 6A 69 6A 6A 6A 6A 6A 6B 6A 6A 6A 6A 6A 6A 6A 6A 6A 6A 6A

68 66 6C 69 69 68 68 68 67 66 67 68 68 69 69 69 6A 6A 6A

Solution

Here is a list of synthesis concerns for the OP and how to address them:

The large number of bits created by adding module ports may be an issue. The number of port bits is greater than the number of IO pins that any physical FPGA has. Synthesis targets a specific physical device that has finite physical resources. Very large FPGAs have several hundred IO pins (I have never seen one advertised with 1000 user IO pins; if one exists, it would be in the 0.1 percentile of all devices on the market). If this line of code becomes a port, [11:0] Y_Planar_output [0:15][0:15], synthesis will attempt to allocate 12 x 16 x 16 = 3072 FPGA pins to connect to the port. Since no physical FPGA has that many IO pins, synthesis will fail. The design in the post would have several very large ports like this. There is no chance that the number of ports needed will fit in a physical FPGA (in other words, synthesis will fail).

Vivado synthesis has a build option called out_of_context, which tells the tool that the module being synthesized is an internal module and not to infer IO pins.
I did an internet search and did not find such an option in Cadence synthesis. You will need to research this.

If Cadence does not have such an option then the code must be modified such that the memories are accessed one address at a time. The corresponding port size would be 12-bits rather than thousands of bits. The re-write could involve state machine controllers to load and unload the memories.

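Change the internal variables (the ones the OP listed as required inputs and outputs) to ports like this, and be sure to remove the corresponding local declarations. Note that unpacked-array ports such as these require SystemVerilog; plain Verilog-2001 does not allow arrays as ports: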

      module hevc_v6
        (
        input clk,
        output reg [11:0] Y_Planar_output [0:15][0:15],
        output reg [11:0] Cb_Planar_output [0:7][0:7],
        // more ports
        );

Get rid of the internal driver on the clock always #5 clk = ~clk

Change the initial blocks that have for to synchronous always blocks like this:

// Synchronous replacement for the initial-block copy: on every rising clock
// edge, copy the first 256 entries of Y_data into the 16x16 matrix
// (row = x/16, column = x%16). Non-blocking assignment is used because this
// is a clocked process.
always @(posedge clk)
  for (x=0; x<256; x=x+1)
      Y_luma_matrix [x/16][x%16] <= Y_data[x];

There are a couple of initial blocks whose output goes nowhere. Decide what to do with these (bring them out the ports?, get rid of them, change to synchronous always blocks?). For example:

    // Copies the first 64 entries of Cr_data into the 8x8 matrix
    // (row = z/8, column = z%8) once at time 0.
    initial
      for (z=0; z<64; z=z+1)
        Cr_chroma_matrix [z/8][z%8] = Cr_data[z];

In this line and similar Cr_chroma_matrix [z/8][z%8] = Cr_data[z]
The / and % operators will not synthesize to the math functions as expected.
In RTL, those operators can only be used on parameters and other values that are known and do not change at elaboration time.
For the /8 (divide by 8) operator, shift right by 3 places.
For the %8 (mod 8) operator, keep only the low three bits (drop the upper bits). This trick works for mod by powers of 2.
The $readmemh() call
If you are using a tool which does not support $readmemh(), then change the $readmemh() + file combination to an RTL ROM using a case statement. Search 'Verilog ROM' for more.
This block of code may infer more multipliers than the physical device has. I see at least 16X16X2 = 512 multipliers in this block. You may need to re-write it to do one multiply (or a reasonable number relative to the number of multiplies the device has) at a time.

    // Chroma Cr
    // Outline of the 8x8 planar loop nest; every iteration of the inner
    // body contains multiplications, and the loops are fully unrolled by
    // synthesis.
    always @(posedge clk)
    begin
      for (j=0; j<8; j=j+1) begin
        for (i=0; i<8; i=i+1) begin
          // infers a lot of multiplies
        end
      end
    end

Keep in mind that synthesis targets a physical device with finite resources; write the RTL to model within the limits of those resources.

If these concerns are addressed, then the design should synthesize.

Write a testbench if you want to understand how the changed design works. The timing will be different for initial blocks with for loops replaced by synchronous processes.