📄 dct.v
字号:
/**********************************************************************
** -----------------------------------------------------------------------------**
** dct.v
**
** 8x8 discrete Cosine Transform
**
**
**
** Author: Latha Pillai
** Senior Applications Engineer
**
** Video Applications
** Advanced Products Group
** Xilinx, Inc.
**
** Copyright (c) 2001 Xilinx, Inc.
** All rights reserved
**
** Date: Feb. 10, 2002
**
** RESTRICTED RIGHTS LEGEND
**
** This software has not been published by the author, and
** has been disclosed to others for the purpose of enhancing
** and promoting design productivity in Xilinx products.
**
** Therefore use, duplication or disclosure, now and in the
** future should give consideration to the productivity
** enhancements afforded the user of this code by the author's
** efforts. Thank you for using our products !
**
** Disclaimer: THESE DESIGNS ARE PROVIDED "AS IS" WITH NO WARRANTY
** WHATSOEVER AND XILINX SPECIFICALLY DISCLAIMS ANY
** IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR
** A PARTICULAR PURPOSE, OR AGAINST INFRINGEMENT.
** Module: dct8x8 :
** A 1D-DCT is implemented on the input pixels first. The output of this,
** called the intermediate value, is stored in a RAM. The 2nd 1D-DCT operation
** is done on this stored value to give the final 2D-DCT output dct_2d. The
** inputs are 8 bits wide and the 2D-DCT outputs are 12 bits wide.
** 1st 1D section
** The input signals are taken one pixel at a time in the order x00 to x07,
** x10 to x17 and so on up to x77. These inputs are fed into an 8 bit shift
** register. The outputs of the 8 bit shift registers are registered by the
** div8clk which is the CLK signal divided by 8. This will enable us to
** register in 8 pixels (one row) at a time. The pixels are paired up in an
** adder subtractor in the order xk0,xk7:xk1,xk6:xk2,xk5:xk3,xk4. The adder
** subtractor is tied to CLK. For every clk, the adder/subtractor module
** alternately chooses addition and subtraction. This selection is done by
** the toggle flop. The output of the addsub is fed into a multiplier whose
** other input is connected to stored values in registers which act as
** memory. The outputs of the 4 multipliers are added at every CLK in the
** final adder. The output of the adder z_out is the 1D-DCT values given
** out in the order in which the inputs were read in.
** It takes 8 clks to read in the first set of inputs, 1 clk to register
** inputs, 1 clk to do add/sub, 1 clk to get absolute value,
** 1 clk for multiplication, 2 clks for the final adder. Total = 14 clks to get
** the 1st z_out value. Every subsequent clk gives out the next z_out value.
** So to get all the 64 values we need 14+63=77 clks.
** Storage / RAM section
** The outputs z_out of the adder are stored in RAMs. Two RAMs are used so
** that data write can be continuous. The 1st valid input for the RAM1 is
** available at the 15th clk. So the RAM1 enable is active after 15 clks.
** After this the write operation continues for 64 clks. At the 65th clock,
** since z_out is continuous, we get the next valid z_out_00. This 2nd set of
** valid 1D-DCT coefficients are written into RAM2 which is enabled at 15+64
** clks. So at 65th clk, RAM1 goes into read mode for the next 64 clks and
** RAM2 is in write mode. After this for every 64 clks, the read and write
** switches between the 2 RAMs.
** 2nd 1D-DCT section
** After the first 79 clks, when RAM1 is full, the 2nd 1D calculations can
** start. The second 1D implementation is the same as the 1st 1D
** implementation with the inputs now coming from either RAM1 or RAM2. Also,
** the inputs are read in one column at a time in the order z00 to z70, z01 to
** z71 and so on up to z77. The outputs from the adder in the 2nd section are
** the 2D-DCT coefficients.
***********************************************************************/
`timescale 1ns/1ps
/* dct : top level of the 8x8 2-D DCT described in the file header.
   Ports:
     CLK     - clock; every visible process is clocked on its rising edge.
     RST     - asynchronous, active-high reset (appears as "posedge RST" in the
               sensitivity lists and is tested first in each process).
     xin     - 8-bit pixel input, one pixel per clock. Bit 7 is replicated when
               the value is widened to 9 bits, i.e. it is treated as a sign bit.
     dct_2d  - 12-bit 2-D DCT output.
     rdy_out - status output; its driver is outside the portion of the file
               reviewed here (NOTE(review): presumably an output-valid flag - confirm). */
module dct ( CLK, RST, xin,dct_2d,rdy_out);
output [11:0] dct_2d;
input CLK, RST;
input[7:0] xin; /* pixel input */
output rdy_out;
wire[11:0] dct_2d;
/* constants */
/* Coefficient registers for the four first-stage multipliers. They are reloaded
   every clock from a table selected by indexi. Bit 7 is used as a sign flag and
   bits 6:0 hold the magnitude (sign-magnitude, not two's complement). */
reg[7:0] memory1a, memory2a, memory3a, memory4a;
/* 1D section */
/* The max value of a pixel after processing (to make the expected mean zero)
is 127. If all the values in a row are 127, the max value of the product terms
would be (127*8)*(23170/256) and that of z_out_int would be (127*8)*23170/65536.
This value divided by 2 raised to the power 16 is equivalent to ignoring the 16 LSBs of the value */
/* xaN_in  : 8-stage shift register fed from xin (xa0_in holds the newest pixel). */
reg[7:0] xa0_in, xa1_in, xa2_in, xa3_in, xa4_in, xa5_in, xa6_in, xa7_in;
/* xaN_reg : 9-bit sign-extended copies of xaN_in, captured when cntr8 == 8. */
reg[8:0] xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg;
reg[7:0] addsub1a_comp,addsub2a_comp,addsub3a_comp,addsub4a_comp;
/* add_subNa : 10-bit sum (toggleA == 1) or difference (toggleA == 0) of the
   mirrored pairs xa7/xa0, xa6/xa1, xa5/xa2, xa4/xa3. */
reg[9:0] add_sub1a,add_sub2a,add_sub3a,add_sub4a;
reg save_sign1a, save_sign2a, save_sign3a, save_sign4a;
reg[18:0] p1a,p2a,p3a,p4a;
wire[35:0] p1a_all,p2a_all,p3a_all,p4a_all;
/* i_wait : wait-state counter, reset to 2'b01. */
reg[1:0] i_wait;
/* toggleA : inverts on every clock; selects add vs. subtract in the first stage. */
reg toggleA;
reg[18:0] z_out_int1,z_out_int2;
reg[18:0] z_out_int;
wire[10:0] z_out_rnd;
wire[10:0] z_out;
/* indexi : selects which of the 8 coefficient sets is loaded into memoryNa. */
integer indexi;
/* clks and counters */
reg[3:0] cntr12 ;
/* cntr8 : counts 0,1..8 after reset and 1..8 thereafter; the value 8 is the
   capture enable for the xaN_reg row registers. */
reg[3:0] cntr8;
reg[6:0] cntr79;
reg[6:0] wr_cntr,rd_cntr;
reg[6:0] cntr92;
/* memory section */
reg[10:0] data_out;
wire en_ram1,en_dct2d;
reg en_ram1reg,en_dct2d_reg;
/* Two 64-entry x 11-bit memories. */
reg[10:0] ram1_mem[63:0],ram2_mem[63:0]; // add the following to infer block RAM in Synplicity
// synthesis syn_ramstyle = "block_ram" //shd be within /*..*/
/* 2D section */
/* Second-stage counterparts of the 1D-section signals, with wider data paths. */
wire[10:0] data_out_final;
reg[10:0] xb0_in, xb1_in, xb2_in, xb3_in, xb4_in, xb5_in, xb6_in, xb7_in;
reg[11:0] xb0_reg, xb1_reg, xb2_reg, xb3_reg, xb4_reg, xb5_reg, xb6_reg, xb7_reg;
reg[11:0] add_sub1b,add_sub2b,add_sub3b,add_sub4b;
reg[10:0] addsub1b_comp,addsub2b_comp,addsub3b_comp,addsub4b_comp;
reg save_sign1b, save_sign2b, save_sign3b, save_sign4b;
reg[19:0] p1b,p2b,p3b,p4b;
wire[35:0] p1b_all,p2b_all,p3b_all,p4b_all;
reg toggleB;
reg[19:0] dct2d_int1,dct2d_int2;
reg[19:0] dct_2d_int;
wire[11:0] dct_2d_rnd;
/* 1D-DCT BEGIN */
/* 1D-DCT multiplier coefficients.
   Each clock, the coefficient set selected by indexi (0..7) is loaded into
   memory1a..memory4a; any other indexi value leaves the registers unchanged.
   The registers are cleared by the asynchronous reset.
   Encoding is sign-magnitude: {1'b1, 7'dN} stands for -N, a plain 8'dN for +N.

     indexi   memory1a  memory2a  memory3a  memory4a
       0         91        91        91        91
       1        126       106        71        25
       2        118        49       -49      -118
       3        106       -25      -126       -71
       4         91       -91       -91        91
       5         71      -126        25       106
       6         49      -118       118       -49
       7         25       -71       106      -126                          */
always @ (posedge RST or posedge CLK)
begin
  if (RST)
    {memory1a, memory2a, memory3a, memory4a} <= 32'd0;
  else
    case (indexi)
      0 : begin
            memory1a <= 8'd91;
            memory2a <= 8'd91;
            memory3a <= 8'd91;
            memory4a <= 8'd91;
          end
      1 : begin
            memory1a <= 8'd126;
            memory2a <= 8'd106;
            memory3a <= 8'd71;
            memory4a <= 8'd25;
          end
      2 : begin
            memory1a <= 8'd118;
            memory2a <= 8'd49;
            memory3a <= {1'b1, 7'd49};
            memory4a <= {1'b1, 7'd118};
          end
      3 : begin
            memory1a <= 8'd106;
            memory2a <= {1'b1, 7'd25};
            memory3a <= {1'b1, 7'd126};
            memory4a <= {1'b1, 7'd71};
          end
      4 : begin
            memory1a <= 8'd91;
            memory2a <= {1'b1, 7'd91};
            memory3a <= {1'b1, 7'd91};
            memory4a <= 8'd91;
          end
      5 : begin
            memory1a <= 8'd71;
            memory2a <= {1'b1, 7'd126};
            memory3a <= 8'd25;
            memory4a <= 8'd106;
          end
      6 : begin
            memory1a <= 8'd49;
            memory2a <= {1'b1, 7'd118};
            memory3a <= 8'd118;
            memory4a <= {1'b1, 7'd49};
          end
      7 : begin
            memory1a <= 8'd25;
            memory2a <= {1'b1, 7'd71};
            memory3a <= 8'd106;
            memory4a <= {1'b1, 7'd126};
          end
    endcase
end
/* Input delay line: eight 8-bit stages. On every clock xin enters xa0_in and
   each stage moves one position towards xa7_in, so after eight clocks xa7_in
   holds the oldest pixel and xa0_in the newest. Reset clears all stages. */
always @ (posedge CLK or posedge RST)
begin
  if (RST)
    {xa7_in, xa6_in, xa5_in, xa4_in, xa3_in, xa2_in, xa1_in, xa0_in} <= 64'b0;
  else
    {xa7_in, xa6_in, xa5_in, xa4_in, xa3_in, xa2_in, xa1_in, xa0_in}
      <= {xa6_in, xa5_in, xa4_in, xa3_in, xa2_in, xa1_in, xa0_in, xin};
end
/* Row counter used to capture the shifted inputs every 8th clock.
   Starts at 0 after reset, counts up to 8, then wraps to 1 (not 0), so the
   value 8 recurs once every eight clocks. */
always @ (posedge CLK or posedge RST)
begin
  if (RST)
    cntr8 <= 4'd0;
  else if (cntr8 < 4'd8)
    cntr8 <= cntr8 + 1;
  else
    cntr8 <= 4'd1;
end
/* Row capture registers: when cntr8 reaches 8 the eight shift-register stages
   are copied into xa0_reg..xa7_reg, each widened from 8 to 9 bits by repeating
   bit 7. On all other clocks the registers hold their value. */
always @ (posedge CLK or posedge RST)
begin
  if (RST)
    {xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg} <= 72'b0;
  else if (cntr8 == 4'd8)
    begin
      xa0_reg <= {xa0_in[7], xa0_in};
      xa1_reg <= {xa1_in[7], xa1_in};
      xa2_reg <= {xa2_in[7], xa2_in};
      xa3_reg <= {xa3_in[7], xa3_in};
      xa4_reg <= {xa4_in[7], xa4_in};
      xa5_reg <= {xa5_in[7], xa5_in};
      xa6_reg <= {xa6_in[7], xa6_in};
      xa7_reg <= {xa7_in[7], xa7_in};
    end
end
/* Add/subtract select for the first stage: cleared by reset, then inverted on
   every clock so the adder/subtractor alternates between the two operations. */
always @ (posedge CLK or posedge RST)
begin
  if (RST)
    toggleA <= 1'b0;
  else
    toggleA <= ~toggleA;
end
/* First-stage adder / subtractor.
   The captured row is folded into four mirrored pairs (xa7/xa0, xa6/xa1,
   xa5/xa2, xa4/xa3). With toggleA high the pair is summed; with toggleA low
   the second operand is subtracted from the first. Results are registered as
   10-bit values and cleared by the asynchronous reset. */
always @ (posedge CLK or posedge RST)
begin
  if (RST)
    begin
      add_sub1a <= 10'b0;
      add_sub2a <= 10'b0;
      add_sub3a <= 10'b0;
      add_sub4a <= 10'b0;
    end
  else
    case (toggleA)
      1'b1 : begin
               add_sub1a <= (xa7_reg + xa0_reg);
               add_sub2a <= (xa6_reg + xa1_reg);
               add_sub3a <= (xa5_reg + xa2_reg);
               add_sub4a <= (xa4_reg + xa3_reg);
             end
      1'b0 : begin
               add_sub1a <= (xa7_reg - xa0_reg);
               add_sub2a <= (xa6_reg - xa1_reg);
               add_sub3a <= (xa5_reg - xa2_reg);
               add_sub4a <= (xa4_reg - xa3_reg);
             end
    endcase
end
/* The above if/else statement used to get the add_sub signals can also be implemented
using the ADSU8 library element as follows */
//ADSU8 adsu8_1 (.A(xa0_reg), .B(xa7_reg), .ADD(toggleA), .CI(1'b0), .S(add_sub1a_all), .OFL(add_sub1a_ofl), .CO(open));
//ADSU8 adsu8_2 (.A(xa1_reg), .B(xa6_reg), .ADD(toggleA), .CI(1'b0), .S(add_sub2a_all), .OFL(add_sub1a_ofl), .CO(open));
//ADSU8 adsu8_3 (.A(xa2_reg), .B(xa5_reg), .ADD(toggleA), .CI(1'b0), .S(add_sub3a_all), .OFL(add_sub1a_ofl), .CO(open));
//ADSU8 adsu8_4 (.A(xa3_reg), .B(xa4_reg), .ADD(toggleA), .CI(1'b0), .S(add_sub4a_all), .OFL(add_sub1a_ofl), .CO(open));
/* In addition, Coregen can be used to create adder/subtractor units specific to a particular
device. The coregen model is then instantiated in the design file. The verilog file from
coregen can be used along with the design files for simulation and implementation. */
//add_sub adsu8_1 (.A(xa0_reg), .B(xa7_reg), .ADD(toggleA), .CLK(CLK), .Q(add_sub1a));
//add_sub adsu8_2 (.A(xa1_reg), .B(xa6_reg), .ADD(toggleA), .CLK(CLK), .Q(add_sub2a));
//add_sub adsu8_3 (.A(xa2_reg), .B(xa5_reg), .ADD(toggleA), .CLK(CLK), .Q(add_sub3a));
//add_sub adsu8_4 (.A(xa3_reg), .B(xa4_reg), .ADD(toggleA), .CLK(CLK), .Q(add_sub4a));
/* multiply the outputs of the add/sub block with the 8 sets of stored coefficients */
/* The inputs are shifted through 8 registers in 8 clk cycles. The outputs of the shift
registers are registered at the 9th clk. The values are then added or subtracted at the 10th
clk. The first multiplier output is obtained at the 11th clk. Memoryx[0] should be accessed
at the 11th clk */
/*wait state counter */
always @ (posedge RST or posedge CLK)
begin
if (RST)
begin
i_wait <= 2'b01;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -