📄 fast hadamard transform.txt

📁 c6000的应用程序比较常用比如说fft ifft等一些原文件
💻 TXT
📖 第 1 页 / 共 2 页
字号:
12 下一页
*===========================================================
*
*     TEXAS INSTRUMENTS, INC.
*
*     NAME:             FHT (Fast Hadamard Transform)
*
*     Revision Date:    04/01/98
*     Author:           Jelena Nikolic-Popovic, DSP Apps. Canada
*
*     USAGE
*           This routine is C-callable and can be called as:
*
*           void fht(short *data)
*
*           data = 64-point            input vector on entry
*                  64-point FHT of the input vector on exit
*                  
*     C CODE
*           This is the C equivalent of the assembly code
*           without restrictions:
*           Note that the assembly code is hand optimized
*           and restrictions may  apply.
*
*       void fht(short *data)
*       {
*         short tmp,k,i,count,j,n;
*
*         for (k=0;k<6;k++) {
*           n=32>>k;
*           for (i=0;i<n;i++) {
*             tmp=data[i]+data[i+n];
*             data[i+n]=data[i]-data[i+n];
*             data[i]=tmp;
*           }
*           count=(2<<(k-1))-1;
*           if (count>0) {
*             for (j=1;j<=count;j++) {
*               for (i=n*j*2;i<n*(j*2+1);i++) {
*                 tmp=data[i]+data[i+n];
*                 data[i+n]=data[i]-data[i+n];
*                 data[i]=tmp;
*               }
*             }   // end of for j=1:count
*           }     // endif
*         }       //  end of for k=0:5  
*       }
*
*
*     DESCRIPTION
*         The  routine  replaces the 64-point  input  data
*         vector  with  its FHT (Fast Hadamard Transform).
*         The FHT is an efficient way to multiply a vector
*         by  the Hadamard  matrix. In  IS-95, the size of
*         input vector  is 64 points  and  Hadamard matrix
*         H6 is a  64 x 64 matrix  defined recursively as:
*
*         H0 = [1]
*         H1 = H0   H0  =  1   1
*              H0  -H0     1  -1 
*
*         ...
*
*         H6 = H5   H5
*              H5  -H5
*
*
*         FHT reduces the vector * matrix multiplication
*         task to butterfly computation, similar to
*         Viterbi.
*         For multiplication by H6, there are 6 stages of
*         32 butterflies each. From one stage to the next,
*         the butterfly "span" is divided by 2 (from 32
*         points in the first stage to 2 points in the
*         last stage).
*         The algorithm requires: 
*         6 stages * 32 butterflies/stage * 2 op's/butterfly 
*                = 384 add/subtract operations
*  
*
*     TECHNIQUES
*
*         The algorithm is optimized for speed of execution
*         rather than code size.
*
*         A potential bottleneck in the C6x implementation
*         is in the requirement to store/load  intermediate
*         64-point array after each stage.
*         This was overcome by making following modification
*         to the H6 - FHT algorithm:
*
*         Multiplication of input vector d[64] by the H6
*         matrix ( 64x64 ) is broken down in two
*         multiplications by H5 (32x32 matrix) as follows:
*         d[64] = [ d1[32] d2[32] ], i.e. split the input
*         vector in two 32-point vectors
*         H6*d[64]  =         H5*d1[32] + H5*d2[32]            
*                             H5*d1[32] - H5*d2[32]
*         First, H5*d1[32] is calculated using the butterfly
*         method. Since there are only 32 intermediate
*         results of type short, they can be stored in
*         16 registers - no need to store out after each
*         stage and then load in for the next one.
*         Next, H5*d1[32] will be stored out to memory and
*         H5*d2[32] will be calculated in the same manner as
*         H5*d1[32]. Finally, H6*d[64] is computed from
*         H5*d1[32] and H5*d2[32] as shown above.
*
*     ASSUMPTIONS
*         - interrupts are disabled
*         - the range of data in the input  vector is such
*           that the data in the output vector will not
*           overflow 16 bits. Since there are 6 stages of
*           add/sub's, this means that the range of input
*           data should not be more than 10 bits.
*           ( in the worst case).
*         - input vector is aligned on 4-byte boundary.
*
*     MEMORY NOTE
*             no memory bank hits.  
*     CYCLES
*           191 cycles (including C calling overhead)
*
*===========================================================


        .global _fht
        .text
_fht:

        ; save context  

        sub.l1x    b15,4,a7             ; copy stack pointer
||      stw.d2     b3,*B15--[2]

        stw.d1     a10,*a7--[2] ; push a10 
||      stw.d2     b10,*b15--[2]; push b10 
        stw.d1     a11,*A7--[2] ; push a11 
||      stw.d2     b11,*b15--[2]; push b11 
        stw.d1     a12,*a7--[2] ; push a12 
||      stw.d2     b12,*b15--[2]; push b12 
        stw.d1     a13,*a7--[2] ; push a13 
||      stw.d2     b13,*b15--[2]; push b13 
        stw.d1     a14,*a7      ; push a14 
||      stw.d2     b14,*b15     ; push b14 

        ; intialization


        mv.l1   a4,a14          ; a14 = &d[0]       
||      mvk     32,a13          ; a13 = 32
||      mvk     32,b13          ; b13 = 32         

        mvk     2,b13           ; b13 = 00010002    
||      mvk     0ffffh,a15      ; a15 = 0000ffff   
||      add.l2  a14,b13,b14     ; b14 = &d(16)     

        mvklh   1,b13           ; b13 = 00010002   
||      mvklh   0,a15           ; a15 = 0000ffff

        ; H5 loop computes H5*d1(32) and H5*d2(32) 
        ; Notation : di(), where i=0,1,2,3,4,5,
        ; are the intermediate vectors after stage i

H5_loop:

        ldw.d1 *a14++,b0        ; d0(0)  and d0(1)    
                                            

        ldw.d2 *b14++,a0        ; d0(16) and d0(17)
||      ldw.d1 *a14++,b1        ; d0(2)  and d0(3)   

        ldw.d2 *b14++,a1        ; d0(18) and d0(19)
||      ldw.d1 *a14++,b2        ; d0(4)  and d0(5)

        ldw.d2 *b14++,a2        ; d0(20) and d0(21)
||      ldw.d1 *a14++,b3        ; d0(6)  and d0(7) 

        ldw.d2 *b14++,a3        ; d0(22) and d0(23)
||      ldw.d1 *a14++,b4        ; d0(8)  and d0(9)

        ldw.d2 *b14++,a4        ; d0(24) and d0(25)
||      ldw.d1 *a14++,b5        ; d0(10) and d0(11) 

        ldw.d2 *b14++,a5        ; d0(26) and d0(27) 
||      add2.s1 a0,b0,a0        ; d1(0)  and d1(1)  
||      sub2.s2 b0,a0,b0        ; d1(16) and d1(17) 
||      ldw.d1 *a14++,b6        ; d0(12) and d0(13) 

        ldw.d2 *b14++,a6        ; d0(28) and d0(29) 
||      add2.s1 a1,b1,a1        ; d1(2)  and d1(3)  
||      sub2.s2 b1,a1,b1        ; d1(18  and d1(19) 
||      ldw.d1 *a14++,b7        ; d0(14) and d0(15)

        ldw.d2 *b14++,a7        ; d0(30) and d0(31)
||      add2.s1 a2,b2,a2        ; d1(4)  and d1(5)  
||      sub2.s2 b2,a2,b2        ; d1(20) and d1(21)

        add2.s1 a3,b3,a3        ; d1(6)  and d1(7) 
||      sub2.s2 b3,a3,b3        ; d(22)  and d(23) 

        add2.s1 a4,b4,a4        ; d1(8)  and d1(9) 
||      sub2.s2 b4,a4,b4        ; d1(24) and d1(25)

        add2.s1 a5,b5,a5        ; d1(10) and d1(11) 
||      sub2.s2 b5,a5,b5        ; d1(26) and d1(27) 

        add2.s1 a6,b6,a6        ; d1(12) and d1(13) 
||      sub2.s2 b6,a6,b6        ; d1(28) and d1(29) 

        add2.s1 a7,b7,a7        ; d1(14) and d1(15) 
||      sub2.s2 b7,a7,b7        ; d1(30) and d1(31)

        ; End   of 1st stage 
        ; Start of 2nd stage

        sub2.s1 a0,a4,a8        ; d2(8)  and d2(9)                  
||      sub2.s2 b0,b4,b8        ; d2(24) and d2(25) 

        add2.s1 a0,a4,a0        ; d2(0)  and d2(1)   
||      add2.s2 b0,b4,b0        ; d2(16) and d2(17) 

        sub2.s1 a2,a6,a4        ; d2(12) and d2(13)      
||      sub2.s2 b2,b6,b4        ; d2(28) and d2(29)       

        add2.s1 a2,a6,a2        ; d2(4)  and d2(5)        
||      add2.s2 b2,b6,b2        ; d2(20) and d2(21)         

        sub2.s1 a1,a5,a6        ; d2(10) and d2(11)      
||      sub2.s2 b1,b5,b6        ; d2(26) and d2(27)        

        add2.s1 a1,a5,a1        ; d2(2)  and d2(3)       
||      add2.s2 b1,b5,b1        ; d2(18) and d2(19)      

        sub2.s1 a3,a7,a5        ; d2(14) and d2(15) 
||      sub2.s2 b3,b7,b5        ; d2(30) and d2(31)        

        add2.s1 a3,a7,a3        ; d2(6)  and d2(7)       
||      add2.s2 b3,b7,b3        ; d2(22) and d2(23)       


        ; End   of 2nd stage
        ; Start of 3rd stage

        sub2.s1 a0,a2,a7        ; d3(4)  and d3(5)
||      sub2.s2 b0,b2,b7        ; d3(20) and d3(21)        

        add2.s1 a0,a2,a0        ; d3(0)  and d3(1)   
||      add2.s2 b0,b2,b0        ; d3(16) and d3(17) 

        sub2.s1 a1,a3,a2        ; d3(6)  and d3(7)      
||      sub2.s2 b1,b3,b2        ; d3(22) and d3(23)        

        add2.s1 a1,a3,a1        ; d3(2)  and d3(3)       
||      add2.s2 b1,b3,b1        ; d3(18) and d3(19) 

        sub2.s1 a8,a4,a3        ; d3(12) and d3(13)                
||      sub2.s2 b8,b4,b3        ; d3(28) and d3(29)        

        add2.s1 a8,a4,a4        ; d3(8   and d3(9)     
||      add2.s2 b8,b4,b4        ; d3(24) and d3(25)  

        sub2.s1 a6,a5,a8        ; d3(14) and d3(15)                
||      sub2.s2 b6,b5,b8        ; d3(30) and d3(31)        

        add2.s1 a6,a5,a5        ; d3(10) and d3(11)  
||      add2.s2 b6,b5,b5        ; d3(26  and d3(27) 

        ; End   of 3nd stage
        ; Start of 4th stage ( pipelined with 5th stage)

        sub2.s1 a0,a1,a6        ; d4(2)  and d4(3)  
||      sub2.s2 b0,b1,b6        ; d4(18) and d4(19) 

        add2.s1 a0,a1,a0        ; d4(0)  and d4(1) 
||      add2.s2 b0,b1,b0        ; d4(16) and d4(17) 
||      sub.l1 a14,a13,a14      ; adjust store pointers
||      sub.l2 b14,a13,b14
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -