-- File: xfft_v4_1_timing_calculator_fft64.vhd
--
-- Copyright(C) 2007 by Xilinx, Inc. All rights reserved.
-- This text/file contains proprietary, confidential
-- information of Xilinx, Inc., is distributed under license
-- from Xilinx, Inc., and may be used, copied and/or
-- disclosed only pursuant to the terms of a valid license
-- agreement with Xilinx, Inc. Xilinx hereby grants you
-- a license to use this text/file solely for design, simulation,
-- implementation and creation of design files limited
-- to Xilinx devices or technologies. Use with non-Xilinx
-- devices or technologies is expressly prohibited and
-- immediately terminates your license unless covered by
-- a separate agreement.
--
-- Xilinx is providing this design, code, or information
-- "as is" solely for use in developing programs and
-- solutions for Xilinx devices. By providing this design,
-- code, or information as one possible implementation of
-- this feature, application or standard, Xilinx is making no
-- representation that this implementation is free from any
-- claims of infringement. You are responsible for
-- obtaining any rights you may require for your implementation.
-- Xilinx expressly disclaims any warranty whatsoever with
-- respect to the adequacy of the implementation, including
-- but not limited to any warranties or representations that this
-- implementation is free from claims of infringement, implied
-- warranties of merchantability or fitness for a particular
-- purpose.
--
-- Xilinx products are not intended for use in life support
-- appliances, devices, or systems. Use in such applications are
-- expressly prohibited.
--
-- This copyright and support notice must be retained as part
-- of this text at all times. (c) Copyright 1995-2007 Xilinx, Inc.
-- All rights reserved.
------------------------------------------------------------------------------
-- Timing model for FFT v4.1 (all architectures)
-- This model takes top level control inputs and models
-- the timing and latency of the FFT core to generate
-- expected top level control outputs at the expected time.
-- These outputs can be compared cycle-by-cycle with the
-- equivalent outputs of the FFT core in the testbench.
-----------------------------------------------------------------------
LIBRARY ieee;
USE ieee.std_logic_1164.ALL;
USE ieee.std_logic_unsigned.ALL; -- for conv_integer
USE ieee.std_logic_arith.ALL; -- for conv_std_logic_vector
LIBRARY work;
USE work.timing_model_pkg.ALL; -- for latency functions
-- Package of latency helper functions for the FFT v4.1 timing model.
-- All functions return a number of clock cycles as an INTEGER.
-- C_ARCH, where used, selects the architecture:
--   1 = radix-4 burst, 2 = radix-2 burst, 3 = streaming, 4 = single output.
PACKAGE timing_pkg IS
-- Number of cycles delay between UNLOAD going high and data being output.
FUNCTION get_unload_delay(C_FAMILY, C_XDEVICEFAMILY : STRING; C_ARCH, C_DATA_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH : INTEGER) RETURN INTEGER;
-- Extra rank end latency in the radix-4 burst architecture.
FUNCTION get_extra_latency_r4(C_FAMILY, C_XDEVICEFAMILY : STRING; C_TWIDDLE_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_WIDTH, C_OUTPUT_WIDTH, C_FAST_CMPY, C_FAST_BFY, C_HAS_SCALING, C_HAS_ROUNDING, C_DATA_MEM_TYPE : INTEGER) RETURN INTEGER;
-- Extra rank end latency in the radix-2 burst architecture.
FUNCTION get_extra_latency_r2(C_FAMILY, C_XDEVICEFAMILY : STRING; C_TWIDDLE_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_WIDTH, C_OUTPUT_WIDTH, C_FAST_CMPY, C_FAST_BFY, C_HAS_SCALING, C_HAS_ROUNDING, C_DATA_MEM_TYPE : INTEGER) RETURN INTEGER;
-- Extra latency summed over the pipeline stages in use in the streaming architecture
-- (stages bypassed because NFFT < C_NFFT_MAX are excluded).
FUNCTION get_extra_latency_r22(C_FAMILY, C_XDEVICEFAMILY : STRING; NFFT, C_NFFT_MAX, C_BRAM_STAGES, C_HAS_SCALING, C_INPUT_WIDTH, C_FAST_BFY, C_FAST_CMPY, C_HAS_NFFT, C_TWIDDLE_WIDTH, C_HAS_ROUNDING : INTEGER) RETURN INTEGER;
-- Latency adjustment depending on output ordering (bit-reversed or natural) in the streaming architecture.
FUNCTION get_output_order_latency_r22(C_FAMILY, C_XDEVICEFAMILY : STRING; C_HAS_NATURAL_OUTPUT, C_BRAM_STAGES, C_HAS_NFFT, C_NFFT_MAX, NFFT : INTEGER) RETURN INTEGER;
-- Extra latency due to DSP48-based butterflies with variable point size (returns 4 or 0).
FUNCTION get_fast_bfy_latency_r22(C_NFFT_MAX, C_HAS_NFFT, C_FAST_BFY, NFFT : INTEGER) RETURN INTEGER;
-- Extra rank end latency in the single output architecture.
FUNCTION get_extra_latency_so(C_FAMILY, C_XDEVICEFAMILY : STRING; C_DATA_MEM_TYPE, C_HAS_ROUNDING, C_HAS_SCALING, C_NFFT_MAX, C_OUTPUT_WIDTH, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH, C_FAST_BFY, C_FAST_CMPY, C_HAS_NFFT, C_HAS_BFP : INTEGER) RETURN INTEGER;
-- Latency adjustment depending on output ordering in the single output architecture.
FUNCTION get_output_order_latency_so(C_FAMILY, C_XDEVICEFAMILY : STRING; C_HAS_NATURAL_OUTPUT, C_DATA_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH : INTEGER) RETURN INTEGER;
-- NOTE(review): body not visible in this section; presumably the overall run latency
-- for the architecture selected by C_ARCH -- TODO confirm against the package body.
FUNCTION get_run_latency(C_FAMILY, C_XDEVICEFAMILY : STRING; NFFT, C_ARCH, C_DATA_MEM_TYPE, C_HAS_ROUNDING, C_HAS_SCALING, C_NFFT_MAX, C_OUTPUT_WIDTH, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH, C_FAST_BFY, C_FAST_CMPY, C_HAS_NFFT, C_HAS_BFP, C_BRAM_STAGES, C_INPUT_WIDTH, C_HAS_NATURAL_OUTPUT : INTEGER) RETURN INTEGER;
END timing_pkg;
PACKAGE BODY timing_pkg IS
-- purpose: get the number of cycles delay between UNLOAD->high and data being output
-- C_ARCH selects the architecture:
--   1 = radix-4 burst, 2 = radix-2 burst, 3 = streaming, 4 = single output.
-- returns: the unload delay in clock cycles. For an unknown C_ARCH an ERROR-severity
--          assertion is raised and 0 is returned, so that a simulation which continues
--          past the assertion never sees an unassigned (INTEGER'LEFT) value.
FUNCTION get_unload_delay(C_FAMILY, C_XDEVICEFAMILY : STRING; C_ARCH, C_DATA_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH : INTEGER) RETURN INTEGER IS
  -- There are an extra 3 cycles for getting data out of the memory for Arch
  -- E - two of these appear to be address setup latency. The final cycle
  -- comes from the output register of the bus mux controlled by xk_enable
  CONSTANT ARCH_E_FUDGE_FACTOR : INTEGER := 3;
  -- Explicit initial value: every path through the CASE below must yield a defined result
  VARIABLE result : INTEGER := 0;
BEGIN
  -- Number of cycles delay is architecture and family dependent
  CASE C_ARCH IS
    WHEN 1 => -- radix-4 burst
      result := get_mem_delay(C_FAMILY, C_XDEVICEFAMILY, C_DATA_MEM_TYPE, C_NFFT_MAX-2, get_twiddle_latency(C_FAMILY, C_XDEVICEFAMILY, C_TWIDDLE_MEM_TYPE, C_NFFT_MAX-1, C_TWIDDLE_WIDTH), 4, 4, 1, 1) + 5;
    WHEN 2 => -- radix-2 burst
      result := get_mem_delay(C_FAMILY, C_XDEVICEFAMILY, C_DATA_MEM_TYPE, C_NFFT_MAX-1, get_twiddle_latency(C_FAMILY, C_XDEVICEFAMILY, C_TWIDDLE_MEM_TYPE, C_NFFT_MAX-1, C_TWIDDLE_WIDTH), 2, 2, 1, 1) + 3;
    WHEN 3 => -- streaming
      result := 0;
    WHEN 4 => -- single output
      result := get_min_mem_delay(C_FAMILY, C_XDEVICEFAMILY, C_DATA_MEM_TYPE, C_NFFT_MAX) + ARCH_E_FUDGE_FACTOR;
    WHEN OTHERS => -- unknown architecture: throw an error
      ASSERT FALSE REPORT "timing_model : get_unload_delay : unknown value of C_ARCH" SEVERITY ERROR;
      -- Severity ERROR does not necessarily stop the simulator, so return a defined value
      result := 0;
  END CASE;
  RETURN result;
END get_unload_delay;
-- purpose: get extra rank end latency in radix-4 burst arch
-- returns: the processing-engine pad delay plus one cycle (see note at the RETURN)
FUNCTION get_extra_latency_r4(C_FAMILY, C_XDEVICEFAMILY : STRING; C_TWIDDLE_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_WIDTH, C_OUTPUT_WIDTH, C_FAST_CMPY, C_FAST_BFY, C_HAS_SCALING, C_HAS_ROUNDING, C_DATA_MEM_TYPE : INTEGER) RETURN INTEGER IS
  -- Fixed sub-module latencies, in clock cycles
  CONSTANT XN_DATA_LAG          : INTEGER := 3;  -- clk cycles between xn_index and the corresponding data
  CONSTANT SWITCH_LAT           : INTEGER := 1;
  CONSTANT MUX_LAT              : INTEGER := 1;
  CONSTANT DATA_ADDR_GEN_LAT    : INTEGER := 4;
  CONSTANT TWIDDLE_ADDR_GEN_LAT : INTEGER := 4;
  -- Twiddle generator latency
  CONSTANT TWIDDLE_LAT : INTEGER := get_twiddle_latency(C_FAMILY, C_XDEVICEFAMILY, C_TWIDDLE_MEM_TYPE, C_NFFT_MAX-1, C_TWIDDLE_WIDTH);
  -- Complex multiplier: widths, implementation type and resulting latency
  CONSTANT BFLY_WIDTH    : INTEGER := C_OUTPUT_WIDTH+4;
  CONSTANT PRODUCT_WIDTH : INTEGER := cmult_out_width(C_FAMILY, BFLY_WIDTH, C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH);
  CONSTANT CMPY_KIND     : INTEGER := cmpy_arch(C_FAMILY, C_XDEVICEFAMILY, eval(C_FAST_CMPY = 0),
                                                max_i(C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH),
                                                min_i(C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH));
  CONSTANT CMPY_LAT      : INTEGER := mult_latency_bc(C_FAMILY, C_XDEVICEFAMILY, eval(C_FAST_CMPY = 0),
                                                      C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH, PRODUCT_WIDTH, 0,
                                                      eval(NOT ((CMPY_KIND = 4) OR (CMPY_KIND = 5))), 1, 1, 0);
  -- Processing engine latency
  CONSTANT ENGINE_LAT : INTEGER := pe_latency_b(C_FAST_BFY, CMPY_LAT, 1, C_HAS_SCALING, C_HAS_ROUNDING);
  -- Data memory delay derived from the latencies above
  CONSTANT MEMORY_LAT : INTEGER := get_mem_delay(C_FAMILY, C_XDEVICEFAMILY, C_DATA_MEM_TYPE, C_NFFT_MAX-2,
                                                 TWIDDLE_LAT, TWIDDLE_ADDR_GEN_LAT, DATA_ADDR_GEN_LAT,
                                                 MUX_LAT, SWITCH_LAT);
  VARIABLE pad : INTEGER;
BEGIN
  -- For the last frame of digit-reversed order: full read -> engine -> write path ...
  pad := DATA_ADDR_GEN_LAT + MUX_LAT + MEMORY_LAT + SWITCH_LAT + ENGINE_LAT + SWITCH_LAT + MUX_LAT;
  -- ... less the input memory write delay and its mux
  pad := pad - (XN_DATA_LAG + MUX_LAT);
  -- One more cycle: flow_control_b uses cnt_tc_rtl to count to the pad delay,
  -- but cnt_tc_rtl counts 1 past that.
  RETURN pad + 1;
END get_extra_latency_r4;
-- purpose: get extra rank end latency in radix-2 burst arch
-- returns: the processing-engine pad delay plus one cycle (see note at the RETURN)
FUNCTION get_extra_latency_r2(C_FAMILY, C_XDEVICEFAMILY : STRING; C_TWIDDLE_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_WIDTH, C_OUTPUT_WIDTH, C_FAST_CMPY, C_FAST_BFY, C_HAS_SCALING, C_HAS_ROUNDING, C_DATA_MEM_TYPE : INTEGER) RETURN INTEGER IS
  -- Fixed sub-module latencies, in clock cycles
  CONSTANT SWITCH_LAT           : INTEGER := 1;
  CONSTANT MUX_LAT              : INTEGER := 1;
  CONSTANT DATA_ADDR_GEN_LAT    : INTEGER := 2;
  CONSTANT TWIDDLE_ADDR_GEN_LAT : INTEGER := 2;
  -- Twiddle generator latency
  CONSTANT TWIDDLE_LAT : INTEGER := get_twiddle_latency(C_FAMILY, C_XDEVICEFAMILY, C_TWIDDLE_MEM_TYPE, C_NFFT_MAX-1, C_TWIDDLE_WIDTH);
  -- Complex multiplier: widths, implementation type and resulting latency
  CONSTANT BFLY_WIDTH    : INTEGER := C_OUTPUT_WIDTH + 4;
  CONSTANT PRODUCT_WIDTH : INTEGER := cmult_out_width(C_FAMILY, BFLY_WIDTH, C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH);
  CONSTANT CMPY_KIND     : INTEGER := cmpy_arch(C_FAMILY, C_XDEVICEFAMILY, eval(C_FAST_CMPY = 0),
                                                max_i(C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH),
                                                min_i(C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH));
  CONSTANT CMPY_LAT      : INTEGER := mult_latency_bc(C_FAMILY, C_XDEVICEFAMILY, eval(C_FAST_CMPY = 0),
                                                      C_OUTPUT_WIDTH, C_TWIDDLE_WIDTH, PRODUCT_WIDTH, 0,
                                                      eval(NOT ((CMPY_KIND = 4) OR (CMPY_KIND = 5))), 1, 1, 0);
  -- Processing engine latency
  CONSTANT ENGINE_LAT : INTEGER := r2_pe_latency(C_FAST_BFY, CMPY_LAT, C_HAS_SCALING, C_HAS_ROUNDING);
  -- Data memory delay derived from the latencies above
  CONSTANT MEMORY_LAT : INTEGER := get_mem_delay(C_FAMILY, C_XDEVICEFAMILY, C_DATA_MEM_TYPE, C_NFFT_MAX-1,
                                                 TWIDDLE_LAT, TWIDDLE_ADDR_GEN_LAT, DATA_ADDR_GEN_LAT,
                                                 MUX_LAT, SWITCH_LAT);
  VARIABLE pad : INTEGER;
BEGIN
  -- Full read -> engine -> write path ...
  pad := DATA_ADDR_GEN_LAT + MUX_LAT + MEMORY_LAT + SWITCH_LAT + ENGINE_LAT + SWITCH_LAT + MUX_LAT;
  -- ... less the address generator and mux delay
  pad := pad - (DATA_ADDR_GEN_LAT + MUX_LAT);
  -- One more cycle: flow_control_c uses cnt_tc_rtl to count to the pad delay,
  -- but cnt_tc_rtl counts 1 past that.
  RETURN pad + 1;
END get_extra_latency_r2;
-- purpose: get extra latency incurred in each pipeline stage in streaming arch
-- returns: sum of the per-PE latencies over the PEs that are in use for point size NFFT
FUNCTION get_extra_latency_r22(C_FAMILY, C_XDEVICEFAMILY : STRING; NFFT, C_NFFT_MAX, C_BRAM_STAGES, C_HAS_SCALING, C_INPUT_WIDTH, C_FAST_BFY, C_FAST_CMPY, C_HAS_NFFT, C_TWIDDLE_WIDTH, C_HAS_ROUNDING : INTEGER) RETURN INTEGER IS
  -- PE bookkeeping
  CONSTANT NFFT_MAX_IS_ODD : INTEGER := C_NFFT_MAX MOD 2;            -- 1 when C_NFFT_MAX is odd, else 0
  CONSTANT PE_TOTAL        : INTEGER := (C_NFFT_MAX+1)/2;            -- PEs present in the pipeline
  CONSTANT PE_ACTIVE       : INTEGER := (NFFT+1+NFFT_MAX_IS_ODD)/2;  -- PEs not bypassed at this NFFT
  CONSTANT FIRST_ACTIVE_PE : INTEGER := PE_TOTAL - PE_ACTIVE;
  CONSTANT LAST_PE         : INTEGER := PE_TOTAL - 1;
  -- Per-stage configuration tables and the latency table derived from them
  CONSTANT STAGE_MEM_TYPE : r22_const_array := r22_mem_type(C_NFFT_MAX, C_BRAM_STAGES);
  CONSTANT STAGE_WIDTH    : r22_const_array := r22_pe_width(C_HAS_SCALING, C_NFFT_MAX, C_INPUT_WIDTH);
  CONSTANT STAGE_LATENCY  : r22_const_array := r22_pe_latency(C_FAMILY, C_XDEVICEFAMILY, C_FAST_BFY, C_FAST_CMPY, 0,
                                                              C_HAS_NFFT, C_NFFT_MAX, C_TWIDDLE_WIDTH, C_HAS_SCALING,
                                                              C_HAS_ROUNDING, 1, STAGE_WIDTH, STAGE_MEM_TYPE);
  VARIABLE total : INTEGER := 0;
  VARIABLE pe    : INTEGER := FIRST_ACTIVE_PE;
BEGIN
  -- Accumulate the latency of every stage that is not bypassed (NFFT < C_NFFT_MAX skips leading PEs)
  WHILE pe <= LAST_PE LOOP
    total := total + STAGE_LATENCY(pe);
    pe := pe + 1;
  END LOOP;
  RETURN total;
END get_extra_latency_r22;
-- purpose: get latency adjustment depending on output ordering in streaming arch
-- returns: -1 or 0 for bit-reversed output; for natural order output,
--          2**NFFT plus the reorder buffer read latency adjustment
FUNCTION get_output_order_latency_r22(C_FAMILY, C_XDEVICEFAMILY : STRING; C_HAS_NATURAL_OUTPUT, C_BRAM_STAGES, C_HAS_NFFT, C_NFFT_MAX, NFFT : INTEGER) RETURN INTEGER IS
  -- Memory type passed to get_min_mem_delay for the reorder buffer
  CONSTANT REORDER_MEM_KIND : INTEGER := eval(C_NFFT_MAX > 7 OR C_BRAM_STAGES > 0);
  VARIABLE adjust : INTEGER;
BEGIN
  -- Bit-reversed output: handled first and returned early
  IF C_HAS_NATURAL_OUTPUT = 0 THEN
    -- Variable point size with a bypassed PE gives 0;
    -- fixed point size, or all PEs in use, gives -1
    IF C_HAS_NFFT /= 0 AND C_NFFT_MAX - NFFT > 1 THEN
      RETURN 0;
    END IF;
    RETURN -1;
  END IF;

  -- Natural order output: 2**(NFFT) plus the reorder buffer read latency
  adjust := 2**(NFFT) + get_min_mem_delay(C_FAMILY, C_XDEVICEFAMILY, REORDER_MEM_KIND, C_NFFT_MAX+1) - 1;
  -- Variable point size with a bypassed PE costs one further cycle
  IF C_HAS_NFFT = 1 AND C_NFFT_MAX - NFFT > 1 THEN
    adjust := adjust + 1;
  END IF;
  RETURN adjust;
END get_output_order_latency_r22;
-- purpose: get extra latency due to DSP48-based butterflies with variable point size
-- returns: 4 when C_FAST_BFY = 1, C_HAS_NFFT = 1 and (C_NFFT_MAX - NFFT) is odd; otherwise 0
FUNCTION get_fast_bfy_latency_r22(C_NFFT_MAX, C_HAS_NFFT, C_FAST_BFY, NFFT : INTEGER) RETURN INTEGER IS
  -- Extra cycles for the DSP48 bypass delay
  CONSTANT DSP48_BYPASS_DELAY : INTEGER := 4;
BEGIN
  -- No DSP48s in the butterflies: nothing to add
  IF C_FAST_BFY /= 1 THEN
    RETURN 0;
  END IF;
  -- Fixed point size: nothing to add
  IF C_HAS_NFFT /= 1 THEN
    RETURN 0;
  END IF;
  -- The last butterfly in the last PE is bypassed only when C_NFFT_MAX - NFFT is odd
  IF (C_NFFT_MAX - NFFT) MOD 2 /= 1 THEN
    RETURN 0;
  END IF;
  RETURN DSP48_BYPASS_DELAY;
END get_fast_bfy_latency_r22;
-- purpose: get extra rank end latency in single output arch
-- returns: the maximum rank end wait, plus one cycle when the PE latency equals the data reuse time
FUNCTION get_extra_latency_so(C_FAMILY, C_XDEVICEFAMILY : STRING; C_DATA_MEM_TYPE, C_HAS_ROUNDING, C_HAS_SCALING, C_NFFT_MAX, C_OUTPUT_WIDTH, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH, C_FAST_BFY, C_FAST_CMPY, C_HAS_NFFT, C_HAS_BFP : INTEGER) RETURN INTEGER IS
  -- FFT processing engine latency
  CONSTANT ENGINE_LATENCY : INTEGER := so_pe_latency(C_FAMILY, C_XDEVICEFAMILY, C_DATA_MEM_TYPE, C_HAS_ROUNDING, C_HAS_SCALING,
                                                     C_NFFT_MAX, C_OUTPUT_WIDTH, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH,
                                                     C_FAST_BFY, C_FAST_CMPY);
  -- Minimum point size, and the data sample reuse time at that point size
  CONSTANT SMALLEST_NFFT : INTEGER := get_nfft_min(4, C_HAS_NFFT, C_NFFT_MAX);
  CONSTANT REUSE_TIME    : INTEGER := so_data_reuse(SMALLEST_NFFT);
  -- Latency of generating the BFP scaling factor (scaled by C_HAS_BFP)
  CONSTANT BFP_WAIT : INTEGER := so_bfp_scale_gen_latency(C_HAS_SCALING, C_HAS_ROUNDING) * C_HAS_BFP;
  VARIABLE wait_cycles : INTEGER;
BEGIN
  -- Rank end wait due to data reuse; the engine latency is taken +1 to ensure
  -- the write occurs before the read of the same address
  wait_cycles := max_i((ENGINE_LATENCY + 1) - REUSE_TIME, 0);
  -- Rank end wait for any reason
  wait_cycles := max_i(wait_cycles, BFP_WAIT);
  -- Need to add a cycle if the PE latency is the same as the reuse latency
  IF ENGINE_LATENCY = REUSE_TIME THEN
    wait_cycles := wait_cycles + 1;
  END IF;
  RETURN wait_cycles;
END get_extra_latency_so;
-- purpose: get latency adjustment depending on output ordering in single output arch
FUNCTION get_output_order_latency_so(C_FAMILY, C_XDEVICEFAMILY : STRING; C_HAS_NATURAL_OUTPUT, C_DATA_MEM_TYPE, C_NFFT_MAX, C_TWIDDLE_MEM_TYPE, C_TWIDDLE_WIDTH : INTEGER) RETURN INTEGER IS -- REVISIT parameters
CONSTANT SO_RUN_ADDR_GEN_LATENCY : INTEGER := 4; -- Latency of run address generator
CONSTANT XN_RE_DELAY : INTEGER := 3; -- Cycles XN_RE and XN_IM are delayed relative to XN_INDEX
VARIABLE result : INTEGER;
BEGIN
-- Latency is always increased by the latency of the run address generator
result := SO_RUN_ADDR_GEN_LATENCY;
-- Extra latency because the core doesn't
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -