⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bkfir_flp32.asm

📁 本源程序是用于数字信号处理中32位浮点块FIR滤波的程序
💻 ASM
字号:
/*************************************************************************

-rev1.0 PM, 9/2003
-rev1.1 PM, 12/2003
-rev1.2 PM, 7/2004, project passed into VDSP3.5

-this program implements a floating point block FIR
-it can be compiled and run on TS101 and TS201.
-TS101 has only 2 memory blocks (section 1 and 2) where data buffers may be placed.
-TS201 has 5 memory blocks (section 1, 2, 3, 4, 5) where data buffers may be placed.
-In this program only 2 memory blocks have been used to maintain compatibility between
TS101 and TS201

-2 tcl files are provided, one for TS101 and one for TS201. Each of them builds
the corresponding project and saves the output buffer into a file output.dat
-.align_code 4 directives have been introduced throughout the main part of the program
to reduce the cycle count on TS201 (on this processor they may even be discarded
if the cycle count is not of interest). For TS101, the assembler option -align-branch-lines
set in the project properties tab has the same effect (on TS101 the alignment is a must for
IF instructions).

-at the beginning, the delay line is copied into the scratch buffer.
-Then the filter is computed: each time the LC1 loop is performed,
2 outputs are calculated, one in the Y block, the other in the X block. This
allows SIMD instructions to be used and achieves 100% efficiency: there is no stall
and every instruction line contains 2 floating point multiplications.
-at the end, the last T-1 input samples are copied from the scratch buffer
into the delay line.


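-the program works for T (the number of taps, TAPS) divisible by 8 and for
N (the number of samples) divisible by 4, since the buffer copies move N/4 quad words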
-The filter itself may be organized in two nested loops:
    -one that computes two outputs, one in Y and other in X block
    -one that repeats the filter N/2 times

- at the end of the program, the cycle_count variable contains the
cycle count of the main program
-the number of cycles for TS101 is 7688 and for TS201 is 7685
-the ideal number of operations is T/8*2+N*T/2+T/8*2=T/2+N*T/2=(N+1)*T/2=7236,
  so the efficiency is  7236/7685=94% for TS201 and 7236/7688=94% for TS101
*************************************************************************/

// N    = number of samples processed per run (sizes indata, output, OUTaddr)
// TAPS = number of filter coefficients (sizes coeff and DLYaddr)
#define N 200
#define	TAPS 72

/*************************************************************************/

// data1: test input/output buffers and the scratch buffer read by the filter
.section data1;
.VAR indata[N] = "indata.dat";   // input samples, initialized from indata.dat
.VAR output[N];                  // test output, filled from OUTaddr after the benchmark
.VAR INaddr[N+TAPS-1];           // scratch: TAPS-1 delay-line words followed by the N input samples


// data2: coefficients, delay line and benchmark results
.section data2;
.VAR coeff[TAPS] = "coeff.dat";  // filter coefficients, initialized from coeff.dat
.VAR DLYaddr[TAPS];              // delay line: all TAPS words zeroed at power-up,
                                 // first TAPS-1 words rewritten at the end of main
.VAR OUTaddr[N];                 // filter outputs written by the main loop
.var cycle_count;    // cycle count of the main program, per the file header

#ifdef __ADSPTS201__
  #include <defts201.h>
#endif

  #include "cache_macros.h"

/* Main Program Section */

.section program;


//-------------POWER UP---------------------------------------------------
// Power-up sequence: on TS201 only, enable the cache and touch the input
// and coefficient buffers; then (on both targets) clear the delay line and
// invoke ini_linear_addressing.
// NOTE(review): cache_enable and ini_linear_addressing are not defined in
// this file; presumably they are macros from cache_macros.h - confirm.
// NOTE(review): j31 is used as the base when loading buffer addresses, so it
// presumably reads as zero - confirm against the processor reference.
powerup:
	// initialize delay line to zero


#ifdef __ADSPTS201__
/*in the case of TS201, at the beginning of the program the
cache must be enabled. The procedure is contained in the
cache_enable macro that uses the refresh rate as input parameter
      -if CCLK=500MHz, refresh_rate=750
      -if CCLK=400MHz, refresh_rate=600
      -if CCLK=300MHz, refresh_rate=450
      -if CCLK=250MHz, refresh_rate=375
*/
// 750 is the value listed above for CCLK=500MHz
  cache_enable(750);


//cache preload.
// Every quad word of indata is read and written back in place
// (N/4 passes, 4 words per pass).

      j0 = j31 + indata; LC0 = N/4;;
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
    nop;nop;;
.align_code 4;
ini_cache:
      xr3:0 = q[j0+=0];;                                // read quad, pointer unchanged
.align_code 4;
      if NLC0E, jump ini_cache; q[j0+=4] = xr3:0;;      // write it back, advance 4 words

// Same read/write-back pass over the coefficient buffer (TAPS/4 passes).
      j0 = j31 + coeff; LC0 = TAPS/4;;
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
    nop;nop;;
.align_code 4;
ini_cache1:
      xr3:0 = q[j0+=0];;                                // read quad, pointer unchanged
.align_code 4;
      if NLC0E, jump ini_cache1; q[j0+=4] = xr3:0;;     // write it back, advance 4 words

#endif

//write 0 across the delay line
// TAPS/2 passes, each storing one long word (2 words) of zeros, so all
// TAPS words of DLYaddr are cleared.

  xr1:0 = r1:0-r1:0;; //xr1:0=0
  j0 = j31 + DLYaddr; LC0 = TAPS/2;;
#ifdef __ADSPTS201__
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
    nop;nop;;
#endif
ini_delay_line:
  nop;;                                                 // keeps the loop 2 instruction lines long
.align_code 4;
  if NLC0E, jump ini_delay_line; l[j0+=2] = xr1:0;;     // store 2 zero words, advance

  ini_linear_addressing;

end_powerup:

//-------------TEST LOOP--------------------------------------------------
// Test-harness setup, executed before the cycle counter is read in main:
// set up the input and output pointers and copy the N input samples into
// the scratch buffer INaddr, starting TAPS-1 words in so that the first
// TAPS-1 words stay free for the delay line.
  j1 = j31 + indata;; //pointer initialized at the beginning of indata buffer
  k11 = k31 + output;;//pointer initialized at the beginning of the output buffer
                      //(next used by the copy_to_output loop after the benchmark)

// copy N samples from the input buffer to INaddr buffer
//pointer initialized where N input data must be written
  k1 = k31 + INaddr + TAPS-1; LC0 = N;;
#ifdef __ADSPTS201__
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
  nop;nop;;
#endif
.align_code 4;
// One word per pass, N passes.
ini_delay_buffer:
  xr0 = [j1+=1];;                                       // read one input sample
.align_code 4;
  if NLC0E, jump ini_delay_buffer; [k1+=1] = xr0;;      // store it into the scratch buffer


//-------------Start of code--------------------------------------------------
// Benchmarked section, part 1: copy the TAPS-1 delay-line words to the
// start of the scratch buffer INaddr, set up the pointers and issue the
// first four multiplies (M0..M3) of the filter pipeline.
// Pointers set up here:
//   j0  - sample reads: the initial dab read into both blocks, then the Y block
//         (starts at INaddr)
//   k0  - sample reads for the X block (starts at INaddr+1+4)
//   k2  - coefficient reads, loaded into both blocks
//   k3  - write pointer for the delay-line copy
//   j10 - write pointer into OUTaddr
// In the column comments, Mn marks the n-th multiply and S0/S1 mark the
// additions into the two running sums fr20 and fr21; fr16 and fr17 hold
// the products waiting to be added.
// NOTE(review): ini_cycle_count is not defined in this file; presumably a
// macro from cache_macros.h - confirm.
main:
//read cycle counter

  ini_cycle_count;

//the delay line elements are copied to the scratch buffer at the start of
//INaddr and pointers are initialized
// Word count: TAPS/8-1 loop passes of 8 words, then 4 + 2 + 1 words after
// the loop, i.e. TAPS-1 words in total.

  j0 = j31 + DLYaddr; LC0 = TAPS/8-1;;
  k3 = k31 + INaddr;;
  xr3:0 = q[j0+=4];;                    // preload the first quad

.align_code 4;
ini_dly_buffer:
  q[k3+=4] = xr3:0; xr7:4 = q[j0+=4];;
.align_code 4;
  if NLC0E, jump ini_dly_buffer; q[k3+=4] = xr7:4; xr3:0 = q[j0+=4];;

  q[k3+=4] = xr3:0; xr11:8 = q[j0+=4];; // xr11:8 = last quad read from DLYaddr
  l[k3+=2] = xr9:8; j0 = j31 + INaddr;; // store 2 of its words; j0 now points at the scratch buffer

  k2  = k31 + coeff;;
  k0 = k31 + INaddr + 1+4;;

//ini dab for both X and Y
// LC1 = N/2: two outputs are produced per outer pass
  r3:0 = dab q[j0+=4]; LC1 = N/2;;

  yr3:0 = dab q[j0+=4]; r7:4 = q[k2+=4];;
  xr3:0 = dab q[k0+=4];;

//the last element of the delay line is saved into the scratch buffer
//               j10=pointer to output buffer
// (xr10 must be stored here, before xr11:8 is reloaded two lines below)
  [k3+=1] = xr10; j10 = j31 + OUTaddr;;

//                                          M0
  yr11:8 = dab q[j0+=4]; r15:12 = q[k2+=4]; fr20 = r0 * r4;;

//                                          M1
  xr11:8 = dab q[k0+=4];                    fr21 = r1 * r5;;

//                                          M2
// inner-loop count for the first outer pass
  LC0 = TAPS/8 - 2;                         fr16 = r2 * r6;;

//                                          M3
                                            fr17 = r3 * r7;;
// Filter kernel. Each instruction line issues one multiply (fr16/fr17) and
// one accumulation into fr20 or fr21, using unprefixed registers so the
// operation is carried out in both compute blocks. The inner loop body is
// 8 lines long and consumes 8 taps per pass; it runs LC0 times.
// Tap count per output pair: 4 multiplies before the first entry, 8*LC0 in
// the loop and 12 after it, i.e. 4 + 8*(TAPS/8-2) + 12 = TAPS. On later
// outer passes 12 multiplies are already issued before re-entry, which is
// why LC0 is reloaded with TAPS/8-3.
.align_code 4;
loop:

//                                       S0                M4
  yr3:0 = dab q[j0+=4]; r7:4 = q[k2+=4]; fr20 = r20 + r16; fr16 = r8 * r12;;

//                      S1                M5
  xr3:0 = dab q[k0+=4]; fr21 = r21 + r17; fr17 = r9 * r13;;

//                      S0                M6
                        fr20 = r20 + r16; fr16 = r10 * r14;;

//                      S1                M7
                        fr21 = r21 + r17; fr17 = r11 * r15;;

//                                          S0                M8
  yr11:8 = dab q[j0+=4]; r15:12 = q[k2+=4]; fr20 = r20 + r16; fr16 = r0 * r4;;

//                      S1                M9
  xr11:8 = dab q[k0+=4];fr21 = r21 + r17; fr17 = r1 * r5;;

//                      S0                M10
                        fr20 = r20 + r16; fr16 = r2 * r6;;

//                      S1                M11
.align_code 4;
  if NLC0E, jump loop;  fr21 = r21 + r17; fr17 = r3 * r7;;


// Tail of the current output pair: same line pattern as the loop body.
  yr3:0 = dab q[j0+=4]; r7:4 = q[k2+=4]; fr20 = r20 + r16; fr16 = r8 * r12;;
  xr3:0 = dab q[k0+=4];                  fr21 = r21 + r17; fr17 = r9 * r13;;
                                         fr20 = r20 + r16; fr16 = r10 * r14;;
                                         fr21 = r21 + r17; fr17 = r11 * r15;;
//these are the last elements for this round of outputs
  yr11:8 = dab q[j0+=4]; r15:12 = q[k2+=4]; fr20 = r20 + r16; fr16 = r0 * r4;;
  xr11:8 = dab q[k0+=4];                    fr21 = r21 + r17; fr17 = r1 * r5;;
// Rewind the pointers for the next output pair: j0 and k0 end up 2 words
// past where they started this pass, k2 returns to the start of coeff.
  j0 = j0 - ((TAPS-2)+4); k0 = k0 - (TAPS-2); fr20 = r20 + r16; fr16 = r2 * r6;;
  k2 = k2 - TAPS;                           fr21 = r21 + r17; fr17 = r3 * r7;;

//the fetches are for the next round of outputs
//ini DAB for both X and Y
  r3:0 = dab q[j0+=4]; r7:4 = q[k2+=4];  fr20 = r20 + r16; fr16 = r8 * r12;;
  yr3:0 = dab q[j0+=4];                  fr21 = r21 + r17; fr17 = r9 * r13;;
  xr3:0 = dab q[k0+=4];                  fr20 = r20 + r16; fr16 = r10 * r14;;
                                         fr21 = r21 + r17; fr17 = r11 * r15;;

// fr18/fr19 collect the final partial sums of the finished pair while
// fr20/fr21 are restarted with the first two products of the next pair.
  yr11:8 = dab q[j0+=4]; r15:12 = q[k2+=4]; fr18 = r20 + r16; fr20 = r0 * r4;;
  xr11:8 = dab q[k0+=4];                    fr19 = r21 + r17; fr21 = r1 * r5;;
                                                              fr16 = r2 * r6;;
// fr18 = finished output (sum of the two partial sums)
                                            fr18 = r18 + r19; fr17 = r3 * r7;;

  yr3:0 = dab q[j0+=4]; r7:4 = q[k2+=4]; fr20 = r20 + r16; fr16 = r8 * r12;;
  xr3:0 = dab q[k0+=4];                  fr21 = r21 + r17; fr17 = r9 * r13;;
//the outputs are saved into OUTaddr
// (one long-word store of r18, advancing j10 by 2 words)
  l[j10+=2] = r18;                       fr20 = r20 + r16; fr16 = r10 * r14;;
                                         fr21 = r21 + r17; fr17 = r11 * r15;;

  yr11:8 = dab q[j0+=4]; r15:12 = q[k2+=4]; fr20 = r20 + r16; fr16 = r0 * r4;;
  xr11:8 = dab q[k0+=4];                    fr21 = r21 + r17; fr17 = r1 * r5;;
                                            fr20 = r20 + r16; fr16 = r2 * r6;;
.align_code 4;
// Outer loop: repeat N/2 times, reloading the inner count for the next pass.
  if NLC1E, jump loop; LC0=TAPS/8-3;        fr21 = r21 + r17; fr17 = r3 * r7;;

//update the delay line
// Copy TAPS-1 words from the scratch buffer, starting at INaddr+N (the
// last TAPS-1 samples held in INaddr), into DLYaddr.
// Word count: TAPS/8-1 loop passes of 8 words, then 4 + 2 + 1 words after
// the loop.

  j0 = j31 + (INaddr+N); LC0 = TAPS/8-1;;
  k0 = k31 + DLYaddr;;

  xr3:0 = q[j0+=4];;                    // preload the first quad
.align_code 4;
update_dly_buffer:
  q[k0+=4] = xr3:0; xr7:4 = q[j0+=4];;
.align_code 4;
  if NLC0E, jump update_dly_buffer; q[k0+=4] = xr7:4; xr3:0 = q[j0+=4];;

  q[k0+=4] = xr3:0; xr7:4 = q[j0+=4];;  // xr7:4 = last quad read from the scratch buffer
  l[k0+=2] = xr5:4;;                    // store its first two words
  [k0+=1] = xr6;;                       // store the third; the fourth is not copied

main.end:
//read cycle counter and compute the benchmark cycle count
// NOTE(review): comp_cycle_count is not defined in this file; presumably a
// macro from cache_macros.h that stores the result in cycle_count - confirm.

  comp_cycle_count;

//-------------TEST LOOP--------------------------------------------------
// Test-harness epilogue, executed after the cycle count has been computed:
// move the N filter outputs from OUTaddr to the output buffer as N/4 quad
// words.
// copy N samples from benchmark's output buffer to test output
  j2 = j31 + OUTaddr; LC0 = N/4;; //k11 init at the beginning of testloop at output
#ifdef __ADSPTS201__
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
  nop;nop;;
#endif
.align_code 4;
copy_to_output:
  xr3:0 = q[j2+=4];;                                    // read 4 outputs
.align_code 4;
  if NLC0E, jump copy_to_output; q[k11+=4] = xr3:0;;    // store them into output

// End of the program: only nops follow this label.
// NOTE(review): presumably the label the tools use to detect program
// termination - confirm.
___lib_prog_term:
    nop;nop;nop;nop;;


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -