📄 inverse discrete cosine transform.txt

📁 c6000的应用程序比较常用比如说fft ifft等一些原文件
💻 TXT
📖 第 1 页 / 共 5 页
字号:
*               r1 = X2*c6 - X6*c2;                                         *
*               r0 = X2*c2 + X6*c6;                                         *
*                                                                           *
*               g0 = p0 + r0;                                               *
*               g1 = p1 + r1;                                               *
*               h1 = p1 - r1;                                               *
*               h0 = p0 - r0;                                               *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Odd part of decomp.                                 */  *
*               /* ---------------------------------------------------- */  *
*               g2 = (X1*c7 - X3*c5) + (X5*c3 - X7*c1);                     *
*               g3 = (X1*c5 - X3*c1) + (X5*c7 + X7*c3);                     *
*               h3 = (X1*c3 - X3*c7) - (X5*c1 + X7*c5);                     *
*               h2 = (X1*c1 + X3*c3) + (X5*c5 + X7*c7);                     *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Final butterfly.                                    */  *
*               /* ---------------------------------------------------- */  *
*               x0 = g0 + h2;                                               *
*               x1 = g1 + h3;                                               *
*               x2 = h1 + g3;                                               *
*               x3 = h0 + g2;                                               *
*               x4 = h0 - g2;                                               *
*               x5 = h1 - g3;                                               *
*               x6 = g1 - h3;                                               *
*               x7 = g0 - h2;                                               *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Truncate and saturate final results.                */  *
*               /* ---------------------------------------------------- */  *
*               x0t = x0 >> trunc2;                                         *
*               x1t = x1 >> trunc2;                                         *
*               x2t = x2 >> trunc2;                                         *
*               x3t = x3 >> trunc2;                                         *
*               x4t = x4 >> trunc2;                                         *
*               x5t = x5 >> trunc2;                                         *
*               x6t = x6 >> trunc2;                                         *
*               x7t = x7 >> trunc2;                                         *
*                                                                           *
*               x0s = x0t < -256 ? -256 : x0t > 255 ? 255 : x0t;            *
*               x1s = x1t < -256 ? -256 : x1t > 255 ? 255 : x1t;            *
*               x2s = x2t < -256 ? -256 : x2t > 255 ? 255 : x2t;            *
*               x3s = x3t < -256 ? -256 : x3t > 255 ? 255 : x3t;            *
*               x4s = x4t < -256 ? -256 : x4t > 255 ? 255 : x4t;            *
*               x5s = x5t < -256 ? -256 : x5t > 255 ? 255 : x5t;            *
*               x6s = x6t < -256 ? -256 : x6t > 255 ? 255 : x6t;            *
*               x7s = x7t < -256 ? -256 : x7t > 255 ? 255 : x7t;            *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Store the results transposed in the result area.    */  *
*               /* ---------------------------------------------------- */  *
*               o_ptr[ 0] = x0s;                                            *
*               o_ptr[ 8] = x1s;                                            *
*               o_ptr[16] = x2s;                                            *
*               o_ptr[24] = x3s;                                            *
*               o_ptr[32] = x4s;                                            *
*               o_ptr[40] = x5s;                                            *
*               o_ptr[48] = x6s;                                            *
*               o_ptr[56] = x7s;                                            *
*                                                                           *
*               o_ptr++;                /* increment ptr to next column */  *
*           }                                                               *
*           /* -------------------------------------------------------- */  *
*           /*  Update output pointer to point to next block.           */  *
*           /* -------------------------------------------------------- */  *
*           o_ptr = o_ptr - 8 + 64;                                         *
*         }                                                                 *
*       }                                                                   *
*                                                                           *
*                                                                           *
*       Note:  This code guarantees correct operation, even in the case     *
*       that 'num_idcts == 0'.  In that case, the function runs for only    *
*       35 cycles (counting 6 cycles of function-call overhead), due to     *
*       early-exit code.  The early-exit case performs no accesses to the   *
*       idct_data[] array.                                                  *
*                                                                           *
*   TECHNIQUES                                                              *
*       All levels of looping are collapsed into single loops which are     *
*       pipelined.  The outer loop focuses on 8-pt IDCTs, whereas the       *
*       inner loop controls the column-pointer to handle jumps between      *
*       IDCT blocks.                                                        *
*                                                                           *
*       For performance, portions of the code outside the loops have been   *
*       inter-scheduled with the prolog and epilog code of the loops.       *
*       Also, twin stack-pointers are used to accelerate stack accesses.    *
*       Finally, pointer values and cosine term registers are reused        *
*       between the horizontal and vertical loops to save the need for      *
*       messy pointer and constant reinitialization.                        *
*                                                                           *
*       To save codesize, prolog and epilog collapsing have been performed  *
*       to the extent that it does not impact performance.  Also, code      *
*       outside the loops has been scheduled to pack as tightly into        *
*       fetch packets as possible to avoid alignment padding NOPs.          *
*                                                                           *
*       The IDCTs cannot be performed completely in-place due to the        *
*       transpose that each pass performs.  In order to save data memory,   *
*       the horizontal pass works from the end of the array towards the     *
*       beginning, writing its result one IDCT block later in memory,       *
*       thus performing the IDCT nearly-in-place.  The vertical pass        *
*       performs its IDCTs in the opposite direction, working from the      *
*       start of the array towards the end, writing the results in-place.   *
*       A nice side effect of this is that the pointer values at the        *
*       end of the horizontal loop are a fixed offset relative to their     *
*       required values for the vertical loop, regardless of the number     *
*       of IDCTs performed.  This makes the pointer reinitialization        *
*       exceptionally cheap.                                                *
*                                                                           *
*       Additional section-specific optimization notes are provided below.  *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       The input array must be aligned on a word boundary, and one         *
*       extra block's worth of storage must be present after the list       *
*       of IDCT input blocks.                                               *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No bank conflicts occur.  The code requires 16 words of stack       *
*       space to save Save-On-Entry (SOE) registers, CSR, IRP, and a        *
*       spill value.  For correct operation, the input array must be        *
*       aligned to a word boundary.                                         *
*                                                                           *
*       Bank usage on C6201:                                                *
*                                                                           *
*           Horiz loop accesses: 1 of 4 banks for 80% of cycles             *
*                                4 of 4 banks for 20% of cycles             *
*                                                                           *
*           Vert loop accesses:  1 of 4 banks for 73% of cycles             *
*                                4 of 4 banks for 18% of cycles             *
*                                0 of 4 banks for  9% of cycles             *
*                                                                           *
*   NOTES                                                                   *
*       This is a LITTLE ENDIAN implementation.                             *
*                                                                           *
*       This code masks interrupts for nearly its entire duration.          *
*       Interrupts are locked out for '53 + 168 * num_idcts' cycles.  As    *
*       a result, the code is interrupt-tolerant, but not interruptible.    *
*                                                                           *
*       The cosine terms have all been scaled by sqrt(2), so that the       *
*       "c4" term is exactly a power of 2 (0x0800, i.e. 1.0 in Q11).        *
*                                                                           *
*       The precision of the final results can be changed by modifying      *
*       the constants at the top of the code and reassembling.  Usually,    *
*       modifying the final-shift constants in the "Symbolic Constants"     *
*       section is sufficient.                                              *
*                                                                           *
*   SOURCE                                                                  *
*       The IDCT form used is the Even-Odd Decomposition IDCT.              *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
            .sect       ".data:copyright_h"
_Copyright: .string     "Copyright (C) 1999 Texas Instruments Incorporated. "
            .string     "All Rights Reserved."

            .sect       ".text:hand"
            .global     _idct_8x8_asm
_idct_8x8_asm:
; ============================ SYMBOLIC CONSTANTS ============================
; Assembly-time substitution symbols for the fixed-point cosine table and the
; shift amounts used by the two IDCT passes.
;
; Cosine terms: per the header notes every term is scaled by sqrt(2), and
; q_pt below is 11, so each value equals round(sqrt(2)*cos(k*pi/16) * 2^11).
; With that scaling c4 comes out as exactly 0x0800 (1.0 in Q11).
;
; NOTE(review): .asg stores its operand as text, so kq_b, trunc2 and satl
; expand to "16-q_pt", "32-results" and "16-results" and are evaluated where
; they are used -- confirm against the TI assembler guide before editing.
        .asg            0x0B19,     cst_c1  ; c1 = sqrt(2)*cos(1*pi/16), Q11 (2841)
        .asg            0x0A74,     cst_c2  ; c2 = sqrt(2)*cos(2*pi/16), Q11 (2676)
        .asg            0x0968,     cst_c3  ; c3 = sqrt(2)*cos(3*pi/16), Q11 (2408)
        .asg            0x0800,     cst_c4  ; c4 = sqrt(2)*cos(4*pi/16), Q11 (2048 = 1.0)
        .asg            0x0649,     cst_c5  ; c5 = sqrt(2)*cos(5*pi/16), Q11 (1609)
        .asg            0x0454,     cst_c6  ; c6 = sqrt(2)*cos(6*pi/16), Q11 (1108)
        .asg            0x0235,     cst_c7  ; c7 = sqrt(2)*cos(7*pi/16), Q11 (565)
        .asg            11,         q_pt    ; Q-point (fractional bits) of the cosine terms
        .asg            16,         kq_a    ; Extract const for c4 "mpy" (use site not in view)
        .asg            16-q_pt,    kq_b    ; Extract const for c4 "mpy"; 16-11 = 5
        .asg            9,          trunc1  ; Truncation after horizontal pass
        .asg            9,          results ; Final precision of results; 9 bits fits the
                                            ; -256..255 saturation range in the header
        .asg            32-results, trunc2  ; Final truncation right-shift; 32-9 = 23
        .asg            16-results, satl    ; Final saturation left-shift; 16-9 = 7
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR HORIZ LOOP ===============
        .asg            B13,        B_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A13,        A_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            B12,        B_c3c1  ; Cosine terms c3, c1   (packed)
        .asg            A12,        A_c3c1  ; Cosine terms c3, c1   (packed)
        .asg            B14,        B_c6c2  ; Cosine terms c6, c2   (packed)
        .asg            A14,        A_i_ptr ; Input pointer #1
        .asg            B15,        B_i_ptr ; Input pointer #2
        .asg            A11,        A_o_ptr ; Output pointer #1
        .asg            B11,        B_o_ptr ; Output pointer #2
        .asg            B2,         B_o     ; Outer loop counter
        .asg            A5,         A_X1X0  ; Incoming coefs X1, X0 (packed)
        .asg            A10,        A_X3X2  ; Incoming coefs X3, X2 (packed)
        .asg            B7,         B_X5X4  ; Incoming coefs X5, X4 (packed)
        .asg            B10,        B_X7X6  ; Incoming coefs X7, X6 (packed)
        .asg            A7,         A_X2c6  ; X2 * c6
        .asg            B0,         B_X6c2  ; X6 * c2
        .asg            A0,         A_X2c2  ; X2 * c2
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -