// lms_flp32.asm
//
// From "Floating Point LMS Filter for TS201" - assembly source, 303 lines
/*************************************************************************

-this program implements a floating point lms filter
-it computes one output at a time
-it can be compiled and run on TS101 and TS201.
-TS101 has only 2 memory blocks (section 1 and 2) where data buffers may be placed.
-TS201 has 5 memory blocks (section 1, 2, 3, 4, 5) where data buffers may be placed.
-In this program only 2 memory blocks have been used to maintain compatibility between
TS101 and TS201

-2 tcl files are provided, one for TS101 and one for TS201. Each of them builds
the corresponding project and saves the output buffer into a file output.dat
-.align_code 4 directives have been introduced throughout the main part of the program
to optimize the cycle count for TS201 (on this processor they may even be discarded
if the cycle count is not of interest). For TS101, the assembler option -align-branch-lines
placed in the project properties tab has the same effect (on TS101 they are a must for
IF instructions).

-the number of cycles on TS201 is T+18=34 cycles for each output

-the filter equations are:
  y(n)=sum hi(n)*x(n-i), sum is for i=0 to i=T-1 and n=0,1,2,3,...
        y(n)=h0(n)*x(n)+h1(n)*x(n-1)+h2(n)*x(n-2)+...+h(T-1)(n)*x(n-(T-1))
  e(n)=d(n)-y(n)
  hi(n+1)=hi(n)+epsilon*e(n)*x(n-i)
        h0(n+1)=h0(n)+epsilon*e(n)*x(n)
        h1(n+1)=h1(n)+epsilon*e(n)*x(n-1)
        ...
-epsilon=0.3

-In order to reduce the number of cycles, the algorithm is executed in this order:
1)ee=epsilon*e(n-1) already computed
2)update the coefficients
        hi(n)=hi(n-1)+epsilon*e(n-1)*x(n-i-1), i=0,1,..,15
          h0(n)=h0(n-1)+ee*x(n-1)
          h1(n)=h1(n-1)+ee*x(n-2)
          ...
          h15(n)=h15(n-1)+ee*x(n-16)
3)introduce x(n) into the delay line
4)compute y(n)=h0(n)*x(n)+h1(n)*x(n-1)+h2(n)*x(n-2)+...+h(T-1)(n)*x(n-(T-1))
5)compute e(n)=d(n)-y(n)
6)compute ee=epsilon*e(n) which will be used at the next iteration

-the initial state of the delay line is presented in the first column; each
following column shows the delay line after one more input sample.
-The accesses are done in quads (each group of four rows below is one quad).
    x(-16)    x(-15)      x(-14)       x(-13)
    x(-12)    x(-11)      x(-10)       x(-9)
    x(-8)     x(-7)       x(-6)        x(-5)
    x(-4)     x(-3)       x(-2)        x(-1)

    x(-15)    x(-14)      x(-13)       x(-12)
    x(-11)    x(-10)      x(-9)        x(-8)
    x(-7)     x(-6)       x(-5)        x(-4)
    x(-3)     x(-2)       x(-1)        x(0)

    x(-14)    x(-13)      x(-12)       x(-11)
    x(-10)    x(-9)       x(-8)        x(-7)
    x(-6)     x(-5)       x(-4)        x(-3)
    x(-2)     x(-1)       x(0)         x(1)

    x(-13)    x(-12)      x(-11)       x(-10)
    x(-9)     x(-8)       x(-7)        x(-6)
    x(-5)     x(-4)       x(-3)        x(-2)
    x(-1)     x(0)        x(1)         x(2)

-based on these values, the new coefficients hi(n) are computed using
the expressions above
-then the quads x(-15) x(-11) x(-7) x(-3) are saved at the beginning of the buffer,
followed by x(-14) x(-10) x(-6) x(-2) and x(-13) x(-9) x(-5) x(-1).
-the last quad is obtained from x(-16) x(-12) x(-8) x(-4) by introducing x(0)
in the place of x(-16) and rotating the quad to the left by one location: the new
quad is x(-12) x(-8) x(-4) x(0)

-Because of this arrangement, the coefficients are arranged as follows:

      h(15)
      h(11)
      h(7)
      h(3)

      h(14)
      h(10)
      h(6)
      h(2)

      h(13)
      h(9)
      h(5)
      h(1)

      h(12)
      h(8)
      h(4)
      h(0)



************************************************************************/
// N    - number of input samples processed by the test loop; also the length
//        of the indata, output, LMS_desired and Error buffers.
// TAPS - number of filter taps; length of the delay_line and coeff buffers.
//        The main program is unrolled for exactly four quads (16 taps), so
//        changing TAPS alone is not sufficient to change the filter length.
#define N 256
#define	TAPS 16

/*************************************************************************/

// Buffers that are accessed with quad-word (q[...]) transfers are explicitly
// aligned to a quad boundary instead of relying on where the section starts.

.section data1;
.align 4;                                   // quad accessed by the cache preload
.VAR indata[N] = "test1\input.dat";         // input samples x(n)
.VAR output[N];                             // filter outputs y(n)
.var Epsilon_Error;                         // epsilon*e(n), carried to the next iteration
.align 4;                                   // quad accessed by the main program
.var delay_line[TAPS];                      // x history, interleaved as described in the header


.section data2;
.align 4;                                   // quad accessed by the main program
.VAR coeff[TAPS];                           // coefficients, interleaved as described in the header
.align 4;                                   // quad accessed by the cache preload
.VAR LMS_desired[N] = "test1\desired.dat";  // desired signal d(n)
.VAR Error[N];                              // error signal e(n)=d(n)-y(n)

.var cycle_count;    // program cycle counts

#ifdef __ADSPTS201__
#include <defts201.h>
#endif

#include "cache_macros.h"


/* Main Program Section */

.section program;


//-------------POWER UP---------------------------------------------------
powerup:
// One-time initialization executed before the test loop:
//   1) TS201 only: enable the cache and optionally pre-touch the data buffers
//   2) run the ini_linear_addressing macro
//   3) clear the delay line, the coefficient buffer and Epsilon_Error so that
//      the first LMS iteration starts from a defined all-zero state
// Registers written here: j0, k0, LC0, xr3:0 and yr3:0 (TS201 preload only),
// xr1:0. The macros are not defined in this file and may write others.

#ifdef __ADSPTS201__

/*in the case of TS201, at the beginning of the program the
cache must be enabled. The procedure is contained in the
cache_enable macro that uses the refresh rate as input parameter
      -if CCLK=500MHz, refresh_rate=750
      -if CCLK=400MHz, refresh_rate=600
      -if CCLK=300MHz, refresh_rate=450
      -if CCLK=250MHz, refresh_rate=375
*/
  cache_enable(750);

//-------------Optional examples to preload cache-----------------------------

//each quad of indata and LMS_desired is read and written back unchanged;
//N/4 iterations cover the N words of each buffer
  j0 = j31 + indata; LC0 = N/4;;
  k0 = k31 + LMS_desired;;
ini_cache:
.align_code 4;
  xr3:0 = q[j0+=0]; yr3:0 = q[k0+=0];;
.align_code 4;
  if NLC0E, jump ini_cache; q[j0+=4] = xr3:0; q[k0+=4] = yr3:0;;

//same read/write-back pass over the coefficient buffer
  j0 = j31 + coeff; LC0 = TAPS/4;;
ini_cache1:
.align_code 4;
  xr3:0 = q[j0+=0];;
.align_code 4;
  if NLC0E, jump ini_cache1; q[j0+=4] = xr3:0;;

//----------------------------------------------------------------------------

#endif

//macro not defined in this file (presumably from cache_macros.h)
  ini_linear_addressing;

//write 0 across the delay line
//each pass stores two long words (4 words), so TAPS/4 passes cover TAPS words

  xr1:0 = r1:0-r1:0;; //xr1:0=0
  j0 = j31 + delay_line; LC0 = TAPS/4;;
ini_delay_line:
.align_code 4;
  l[j0+=2] = xr1:0;;
.align_code 4;
  if NLC0E, jump ini_delay_line; l[j0+=2] = xr1:0;;

//write 0 across the coefficient buffer
//coeff is declared without an initializer and the first iteration reads it as
//h(n-1), so it is cleared here rather than relying on the memory contents.
//xr1:0 still holds 0 from the delay line initialization above.
  j0 = j31 + coeff; LC0 = TAPS/4;;
ini_coeff:
.align_code 4;
  l[j0+=2] = xr1:0;;
.align_code 4;
  if NLC0E, jump ini_coeff; l[j0+=2] = xr1:0;;

//Epsilon*[des(n)-y(n)] initialized 0
  [j31 + Epsilon_Error] = xr0;;
end_powerup:

//-------------TEST LOOP--------------------------------------------------
main:
// Test harness: runs the LMS filter once per input sample, N times.
// Pointer roles for the whole loop:
//   j0 -> indata (next input sample)     j1 -> output (next y(n) slot)
//   j2 -> LMS_desired (next d(n))        j5 -> Error  (next e(n) slot)
// LC0 holds the number of samples still to process.
  j0 = j31 + indata;;
  j1 = j31 + output;;
  j2 = j31 + LMS_desired;;
  j5 = j31 + Error; LC0 = N;;

testloop:
//the input x(0) is fetched
//xr31 keeps the new sample until it is inserted into the last delay line quad
  xr31 = [j0+=1];;

//the desired value is fetched
//xr30 keeps d(n) until the error is computed at the end of the iteration
  xr30 = [j2+=1];;

//read cycle counter
//macro not defined in this file (presumably from cache_macros.h)

  ini_cycle_count;

//-------------program--------------------------------------------------

// One LMS iteration. Unprefixed registers (rN, frN) refer to both compute
// blocks; the yr/xr annotations below list what each block holds.
// The coefficient update (using ee from the previous iteration), the delay
// line shift and the new dot product are interleaved; the instruction order
// is part of the schedule and must not be changed casually.
//   j3/k3 : read pointers into delay_line / coeff
//   j4/k4 : write pointers into delay_line / coeff (start at the same address)
//   r29   : ee = epsilon*e(n-1), used as multiplier in both compute blocks
//   r6,r7 : partial sums of y(n)

_main_program:

  j3 = j31 + delay_line;;
  k3 = k31 + coeff;;
  j4 = j31 + j3; k4 = k31 + k3;;

//load epsilon*[des(n-1)-y(n-1)] computed previous iteration
  r29 = [j31+Epsilon_Error];;

//                                yr1:0=x(-12) x(-16)
//                                xr1:0=x(-4) x(-8) from the delay line
//                                                 yr3:2=h(11) h(15)
//                                                 xr3:2=h(3) h(7)
                                  r1:0 = q[j3+=4]; r3:2 = q[k3+=4];;

//                                yr9:8=x(-11) x(-15)
//                                xr9:8=x(-3) x(-7) from the delay line
//                                                 yr11:10=h(10) h(14)
//                                                 xr11:10=h(2) h(6)
                                  r9:8 = q[j3+=4]; r11:10 = q[k3+=4];;
//begin computing the new coefficients hi(n)=hi(n-1)+ee*x(n-1-i)
//                                update the delay line
//                                (the second quad is written over the first)
  fr4 = r0 * r29;                 q[j4+=4] = r9:8;;
  fr5 = r1 * r29;;
//                yr2=h15(n)
//                xr2=h7(n)
  fr4 = r8 * r29; fr2 = r2 + r4;;
//                                yr13:12=x(-10) x(-14)
//                                xr13:12=x(-2) x(-6) from the delay line
//                yr3=h(11)                          yr15:14=h(9) h(13)
//                xr3=h(3)                           xr15:14=h(1) h(5)
  fr5 = r9 * r29; fr3 = r3 + r5;  r13:12 = q[j3+=4]; r15:14 = q[k3+=4];;

//yr6=x(-15)*h15  yr10=h14(n)
//xr6=x(-7)*h7    xr10=h6(n)      update the coeff buffer
//begin computing the filter
  fr6 = r8 * r2;  fr10= r10 + r4; q[k4+=4] = r3:2;;

//                                yr17:16=x(-9) x(-13)
//                                xr17:16=x(-1) x(-5) from the delay line
//                                                   yr19:18=h(8) h(12)
//                                                   xr19:18=h(0) h(4)
  fr7 = r9 * r3;  fr11= r11 + r5; r17:16 = q[j3+=4]; r19:18 = q[k3+=4];;

//                                update the coeff buffer
  fr4 = r12 * r10;                q[k4+=4] = r11:10;;

//                                update the delay line
  fr5 = r13 * r11;                q[j4+=4] = r13:12;;

//                                update the delay line
  fr4 = r12 * r29;fr6 = r6 + r4;  q[j4+=4] = r17:16;;
//                               begin rotating x(-12) x(-8) x(-4) x(0)
//                               (yr0<-yr1, yr1<-xr0, xr0<-xr1, xr1<-xr31 over
//                               the next four instruction lines)
  fr5 = r13 * r29;fr7 = r7 + r5; yr0 = yr1;;

//                yr14=h13
//                xr14=h5
  fr4 = r16 * r29;fr14= r14 + r4; yr1 = xr0;;

//                yr15=h9
//                xr15=h1
  fr5 = r17 * r29;fr15= r15 + r5; xr0 = xr1;;

//                                update the coeff buffer
  fr4 = r16 * r14;fr18= r18 + r4; q[k4+=4] = r15:14;;

//                                the new sample x(0) completes the rotated quad
  fr5 = r17 * r15;fr19= r19 + r5; xr1 = xr31;;

//                                update the coeff buffer
  fr4 = r0 * r18; fr6 = r6 + r4;  q[k4+=4] = r19:18;;

//                                update the delay line
  fr5 = r1 * r19; fr7 = r7 + r5; q[j4+=4] = r1:0;;
//                add the last two products, then combine the r6 and r7 sums
                  fr6 = r6 + r4;;
                  fr7 = r7 + r5;;
//                                xr4 = epsilon (0.3, see the header)
                  fr6 = r6 + r7; xr4 = 0.3;;

//                bring the partial sum held in yr6 over to the X compute block
                  xr7 = yr6;;

                  xfr6 = r6 + r7;;//y(n)
                  xfr7 = r30 - r6;;//des(n)-y(n)
//                ee=epsilon*e(n), saved for the coefficient update of the next iteration
                  xfr8 = r4 * r7;;
                  [j31+Epsilon_Error] = xr8;;

_main_program.end:
//read cycle counter and compute the program cycle count
//macro not defined in this file (presumably from cache_macros.h)

  comp_cycle_count;

//-------------TEST LOOP--------------------------------------------------

//store y(n) into output and e(n) into Error, then loop while LC0 has not expired
  [j1+=1] = xr6;;
.align_code 4;
  if NLC0E, jump testloop;  [j5+=1] = xr7;;



main.end:
//NOTE(review): no jump or idle is visible after these nops; what executes next
//depends on code outside this view - confirm how the program is halted
___lib_prog_term:
    nop;nop;nop;nop;;
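// lms_flp32.asm - source notes
//
// This page shows the lms_flp32.asm source file from "Floating Point LMS Filter
// for TS201"; it is written in assembly language and is 303 lines long. The
// complete code can be read online, or the full source package can be downloaded
// from the resource details page for local study and development.
// The hosting download site collects many technical resources related to
// "Floating", including source code, technical documents and circuit diagrams,
// and is a learning platform for electronics engineers and embedded developers.
// Keyboard shortcuts of the hosting page:
//
//   Copy code           Ctrl + C
//   Search code         Ctrl + F
//   Full screen         F11
//   Increase font size  Ctrl + =
//   Decrease font size  Ctrl + -
//   Show shortcuts      ?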