📄 fft.asm

📁 davinci技术源码视频监控汇编源码
💻 ASM
📖 第 1 页 / 共 3 页
字号:
*       in the lower half, and the imaginary part is in the upper half.     *
*       The flow breaks in case of "xl0" and "xl1" because in this case     *
*       the real part needs to be combined with the imaginary part because  *
*       of the multiplication by "j". This requires a packed quantity like  *
*       "xl21xl20" to be rotated as "xl20xl21" so that it can be combined   *
*        using add2's and sub2's. Hence the natural version of C code       *
*       shown below is transformed using packed data processing as shown:   *
*                                                                           *
*                        xl0  = x[2 * i0    ] - x[2 * i2    ];              *
*                        xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];              *
*                        xl20 = x[2 * i1    ] - x[2 * i3    ];              *
*                        xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];              *
*                                                                           *
*                        xt1  = xl0 + xl21;                                 *
*                        yt2  = xl1 + xl20;                                 *
*                        xt2  = xl0 - xl21;                                 *
*                        yt1  = xl1 - xl20;                                 *
*                                                                           *
*                        (x_ik = packed word x[2*ik+1] : x[2*ik])           *
*                        xl1_xl0   = _sub2(x_i0, x_i2)                      *
*                        xl21_xl20 = _sub2(x_i1, x_i3)                      *
*                        xl20_xl21 = _rotl(xl21_xl20, 16)                   *
*                                                                           *
*                        yt2_xt1   = _add2(xl1_xl0, xl20_xl21)              *
*                        yt1_xt2   = _sub2(xl1_xl0, xl20_xl21)              *
*                                                                           *
*       Also notice that xt1, yt1 end up in separate words; these need to   *
*       be packed together to take advantage of the packed twiddle          *
*       factors that have been loaded. To achieve this, they are            *
*       re-aligned as follows:                                              *
*                                                                           *
*       yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1)                                *
*       yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2)                                *
*                                                                           *
*       The packed word "yt1_xt1" allows the loaded "sc" twiddle factor     *
*       to be used for the complex multiplies. The complex multiply         *
*       is implemented on the C64x+ using _cmpyr1.                          *
*                                                                           *
*       (X + jY) ( C - j S) = (XC + YS) + j (YC - XS).                      *
*                                                                           *
*       The actual twiddle factors for the FFT are cosine, - sine. The      *
*       twiddle factors stored in the table are cosine and sine, hence      *
*       the sign of the "sine" term is accounted for during the             *
*       multiplication, as shown above.                                     *
*                                                                           *
*                                                                           *
*   ASSUMPTIONS                                                             *
*                                                                           *
*       The size of the FFT, n, must be a power of 4 and greater than       *
*       or equal to 16 and less than 32768.                                 *
*                                                                           *
*       The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a         *
*       double-word boundary for the "optimized" implementations.           *
*                                                                           *
*       The input and output data are complex, with the real/imaginary      *
*       components stored in adjacent locations in the array.  The real     *
*       components are stored at even array indices, and the imaginary      *
*       components are stored at odd array indices.                         *
*                                                                           *
*   C CODE                                                                  *
*                                                                           *
*                                                                           *
*   NOTES                                                                   *
*                                                                           *
*                                                                           *
*   CYCLES                                                                  *
*                                                                           *
*       cycles = 0.75*nx*log4(nx) + 38                                      *
*       For nx = 1024, cycles = 3878                                        *
*                                                                           *
*   CODESIZE                                                                *
*                                                                           *
*       704 bytes                                                           *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2005 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *


* ======================================================================== *
* ======================================================================== *

        .text                                   ; place the routine in the code section
        .global _fft                            ; export the entry point for C callers
; Entry point for: void fft(short * w, int nx, short * x, short * y);
_fft:
* ======================================================================== *
* ======================================================================== *
******************* SYMBOLIC REGISTER ASSIGNMENTS: SETUP *********************

* ===================== LOOP 1 SYMBOLIC REGISTER ASSIGNMENTS ============== *
        ; Symbolic register names used by the setup code and the pipelined loop.
        ; Each .asg binds one name to one physical register. Several registers
        ; carry more than one name; NOTE(review): presumably the values sharing a
        ; register have non-overlapping lifetimes -- confirm against the complete
        ; loop body before changing any assignment.
        ;
        ; ---- A-side register file ----
        .asg            A0,         A_h2
        .asg            A1,         A_ifj               ; used as a predicate ([!A_ifj])
        .asg            A2,         A_xl20_1_xl21_1
        .asg            A2,         A_xh2_1_0
        .asg            A3,         A_xh2_3_2
        .asg            A3,         A_x_h2_0_1
        .asg            A3,         A_x_h2_2_3
        .asg            A4,         A_w0                ; read before any write: presumably incoming arg w -- confirm
        .asg            A4,         A_ptr_w
        .asg            A5,         A_c2                ; loaded with the constant 2 by the setup code
        .asg            A6,         A_ptr_x             ; read before any write: presumably incoming arg x -- confirm
        .asg            A7,         A_w1

        .asg            A8,         A_x
        .asg            A9,         A_l1
        .asg            A10,        A_shr2              ; A10 is saved on the stack at function entry
        .asg            A11,        A_fft_jmp           ; A11 is saved on the stack at function entry
        .asg            A16,        A_l2
        .asg            A17,        A_fft_jmp_3
        .asg            A18,        A_yt0_1_xt0_1
        .asg            A18,        A_xl20_0_xl21_0
        .asg            A18,        A_xh2_1_xh2_0
        .asg            A19,        A_xh2_3_xh2_2
        .asg            A20,        A_fft_jmp1 ; 20, 21 <-> 2,3
        .asg            A21,        A_j
        .asg            A22,        A_xl1_1_0
        .asg            A23,        A_xl1_3_2
        .asg            A22,        A_xl2_1_xl2_0
        .asg            A23,        A_xl2_3_xl2_2
        ; A24..A27 receive the double-word loads from the A_w0 / A_w1 tables
        ; (presumably packed cosine/sine twiddle pairs, going by the names).
        .asg            A24,        A_co10_si10
        .asg            A25,        A_co11_si11
        .asg            A26,        A_co20_si20
        .asg            A27,        A_co21_si21
        .asg            A27,        A_yt0_0_xt0_0
        .asg            A28,        A_xl21_0_xl20_0
        .asg            A29,        A_xh21_0_xh20_0
        .asg            A30,        A_xl21_1_xl20_1
        .asg            A31,        A_xh21_1_xh20_1

        ; ---- B-side register file ----
        .asg            B0,         B_x_1_x_0
        .asg            B1,         B_x_3_x_2
        .asg            B0,         B_yt1_1_xt2_1
        .asg            B1,         B_yt2_1_xt1_1
        .asg            B0,         B_x_l2_0_1
        .asg            B0,         B_x_l2_2_3
        .asg            B1,         B_j
        .asg            B2,         B_wh                ; predicate on the SPLOOPD; set to log4n - 2
        .asg            B4,         B_n                 ; read by the first instruction: presumably incoming arg nx -- confirm
        .asg            B5,         B_x
        .asg            B6,         B_ptr_y             ; presumably incoming arg y -- not referenced in the visible code
        .asg            B7,         B_fft_jmp

        .asg            B8,         B_w2
        .asg            B9,         B_h2
        .asg            B10,        B_i                 ; B10 is saved on the stack at function entry
        .asg            B16,        B_l2
        .asg            B17,        B_l1
        .asg            B18,        B_l2_
        .asg            B19,        B_x_
        .asg            B20,        B_xt2_1_yt2_1
        .asg            B20,        B_yt1_0_xt2_0
        .asg            B20,        B_xt2_0_yt2_0
        .asg            B21,        B_xt1_1_yt1_1
        .asg            B21,        B_xt1_0_yt1_0
        .asg            B21,        B_yt2_0_xt1_0
        .asg            B22,        B_xl1_0_xl0_0
        .asg            B22,        B_xl2_1_0
        .asg            B23,        B_xl2_3_2
        .asg            B23,        B_xh1_0_xh0_0
        .asg            B24,        B_co30_si30         ; B25:B24 receive the double-word load from the B_w2 table
        .asg            B25,        B_co31_si31
        .asg            B26,        B_xl1_1_xl1_0
        .asg            B27,        B_xl1_3_xl1_2
        .asg            B28,        B_xl1_1_xl0_1
        .asg            B29,        B_xh1_1_xh0_1
        .asg            B30,        B_x_1o_x_0o
        .asg            B31,        B_x_3o_x_2o
        ; Setup-only scalars: these reuse B20/B21, which also carry the packed
        ; data names assigned above.
        .asg            B20,        B_log4n
        .asg            B21,        B_thone             ; loaded with the constant 31 by the setup code
        .asg            B15,        B_SP                ; base register for the stack saves at entry
* ========================================================================= *
* void fft(short * w, int nx, short * x, short * y);
* ========================================================================= *

        STW     .D2T2   B10,        *B_SP--[2]                  ; Reserve stack, Save B10
||      LMBD    .L2     1,          B_n,        B_log4n         ; 31-log4n
||      MVK     .S2     31,         B_thone

        STDW    .D2T1   A11:A10,    *+B_SP[0]                   ; Save A11:A10
||      SUB     .L2     B_thone,    B_log4n,    B_log4n         ; log4n

        SHR     .S2     B_log4n,    1,          B_log4n

        SHRU    .S2     B_n,        3,          B_i             ;[ 2,1]

        SUB     .L2     B_i,        1,          B_i             ;[ 4,1]
||      MVC     .S2     B_i,        RILC                        ;['reload' inner loop counter
||      SHRU    .S1X    B_n,        1,          A_h2            ;[ 1,1] low   1
||      MVK     .L1     2,          A_c2                        ;[ 2,1]

 [B_wh] SPLOOPD         6
||      MVC     .S2     B_i,        ILC                         ;[ 7,1]
||      ADD     .S1X    A_h2,       B_n,        A_l2            ;bott 3
||      SUB     .L2     B_log4n,    2,          B_wh            ;log4n-2
* ================================ LOOP STAGE I =========================== *
        SPMASK
||^     SSHVR   .M1     A_l2,       A_c2,       A_fft_jmp       ;prolog
||^     SHRU    .S2     B_n,        2,          B_l1            ;[ 1,1]
||^     SUB     .L2X    A_ptr_x,    8,          B_x             ;[ 7,1] 1st time only
||^     ZERO    .D1     A_j                                     ;prolog

        ADD     .S2     B_x,        8,          B_x             ;[ 1,1]

        SPMASK
||      LDDW    .D2T2   *B_x[B_l1], B_xl1_3_xl1_2:B_xl1_1_xl1_0 ;[ 2,1]
||^     SSHVR   .M1     A_l2,       A_c2,       A_fft_jmp1      ;prolog
||^     SHRU    .S2X    A_h2,       2,          B_h2            ;prolog
||^     SHRU    .S1     A_h2,       2,          A_h2            ;prolog

        SPMASK
||      LDDW    .D2T1   *B_x[B_h2], A_xh2_3_xh2_2:A_xh2_1_xh2_0 ;[ 3,1]
||^     SHRU    .S2X    A_l2,       2,          B_l2            ;prolog
||^     MPY     .M2X    2,          A_l2,       B_fft_jmp       ;prolog
||^     ADD     .L1     A_w0,       8,          A_w1            ;prolog
||^     SHRU    .S1     A_l2,       2,          A_l2            ;prolog

        SPMASK
||      LDDW    .D2T1   *B_x[B_l2], A_xl2_3_xl2_2:A_xl2_1_xl2_0 ;[ 4,1]
||^     SUB     .L1     A_fft_jmp1, 3,          A_fft_jmp_3     ;prolog
||^     SHRU    .S1     A_fft_jmp1, 2,          A_fft_jmp1      ;prolog

        SUB     .L1     A_fft_jmp_3,    A_j,        A_ifj       ;[ 5,1]
||      LDDW    .D2T2   *B_x[0],    B_x_3_x_2:B_x_1_x_0         ;[ 5,1]
||      ROTL    .M2X    A_j,        0,          B_j             ;[ 5,1]
* ================================ STAGE II =============================== *
        SPMASK
||[!A_ifj]ADD   .D2     B_x,        B_fft_jmp,  B_x             ;[ 6,1]
||      MVD     .M2     B_x,        B_x_                        ;[ 6,1]
||      LDDW    .D1T1   *A_w1[A_j], A_co21_si21:A_co20_si20     ;[ 6,1]
||^     ADD     .L2X    A_w1,       8,          B_w2            ;prolog
||^     SSHVR   .M1X    B_n,        A_c2,       A_l1            ;prolog
||^     MVKL    .S1     020000000h, A_shr2                      ;prolog

        ADD     .S1     A_j,        3,          A_j             ;[ 7,1] was 3
||      LDDW    .D2T2   *B_w2[B_j], B_co31_si31:B_co30_si30     ;[ 7,1]
||      LDDW    .D1T1   *A_w0[A_j], A_co11_si11:A_co10_si10     ;[ 7,1]

        SPMASK
||[!A_ifj]ZERO  .L1     A_j                                     ;[ 8,1]
||^     SHRU    .S2X    A_l2,       2,          B_l2_           ;prolog
||^     MVKH    .S1     020000000h, A_shr2                      ;prolog
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -