📄 img_fdct_8x8.h64

📁 TMS320C64x Image／Video Processing Library (V1.04)
💻 H64
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
*               p0 = g0 + h0 + 4;               /*  Results in Q2.5 */      *
*               p1 = g1 + h1;                   /*  Results in Q2.5 */      *
*               r0 = g0 - h0;                   /*  Results in Q2.5 */      *
*               r1 = g1 - h1;                   /*  Results in Q2.5 */      *
*                                                                           *
*               q1a= (g2 * C4 + 0x8000) >> 16;  /*  q1a now in Q2   */      *
*               s1a= (h2 * C4 + 0x8000) >> 16;  /*  s1a now in Q2   */      *
*               q1 = q1a + q1a;                 /*  Results in Q3   */      *
*               s1 = s1a + s1a;                 /*  Results in Q3   */      *
*                                                                           *
*               s0 = h3 + g3;                   /*  Results in Q3   */      *
*               q0 = h3 - g3;                   /*  Results in Q3   */      *
*                                                                           *
*               /* ------------------------------------------------ */      *
*               /*  Stage 3                                         */      *
*               /*                                                  */      *
*               /*  Now, the even-half becomes Q0.  This happens    */      *
*               /*  on P0 and P1 because the multiply-by-c4 was     */      *
*               /*  canceled with an upward scaling by sqrt(2),     */      *
*               /*  yielding Q3 intermediate values.  The final     */      *
*               /*  >> 3 leaves these at Q0.  On R0 and R1, this    */      *
*               /*  happens because c2 and c6 are at Q13.5,         */      *
*               /*  yielding Q16 intermediate values.  The final    */      *
*               /*  >> 16 then leaves those values at Q0.           */      *
*               /* ------------------------------------------------ */      *
*               P0 = ((short)(p0 + p1)) >> 3;   /*  Results in Q0   */      *
*               P1 = ((short)(p0 - p1)) >> 3;   /*  Results in Q0   */      *
*               R1 = (c6 * r1 + c2 * r0 + 0x8000) >> 16; /* .. Q0   */      *
*               R0 = (c6 * r0 - c2 * r1 + 0x8000) >> 16; /* .. Q0   */      *
*                                                                           *
*               Q1 = q1 + q0;                   /*  Results in Q3   */      *
*               Q0 = q1 - q0;                   /*  Results in Q3   */      *
*               S1 = s1 + s0;                   /*  Results in Q3   */      *
*               S0 = s1 - s0;                   /*  Results in Q3   */      *
*                                                                           *
*               /* ------------------------------------------------ */      *
*               /*  Stage 4                                         */      *
*               /*                                                  */      *
*               /*  Next, the odd-half ends up in Q0. This happens  */      *
*               /*  because our values are in Q3 and our cosine     */      *
*               /*  terms are in Q13, giving us Q16 intermediate    */      *
*               /*  values. The final >> 16 leaves us a Q0 result.  */      *
*               /* ------------------------------------------------ */      *
*               F0 = P0;                        /*  Results in Q0   */      *
*               F4 = P1;                        /*  Results in Q0   */      *
*               F2 = R1;                        /*  Results in Q0   */      *
*               F6 = R0;                        /*  Results in Q0   */      *
*                                                                           *
*               F1 = (c7 * Q1 + c1 * S1 + 0x8000) >> 16; /* .. Q0   */      *
*               F7 = (c7 * S1 - c1 * Q1 + 0x8000) >> 16; /* .. Q0   */      *
*               F5 = (c3 * Q0 + c5 * S0 + 0x8000) >> 16; /* .. Q0   */      *
*               F3 = (c3 * S0 - c5 * Q0 + 0x8000) >> 16; /* .. Q0   */      *
*                                                                           *
*               /* ------------------------------------------------ */      *
*               /*  Store the results                               */      *
*               /* ------------------------------------------------ */      *
*               dct_io_ptr[0] = F0;                                         *
*               dct_io_ptr[1] = F1;                                         *
*               dct_io_ptr[2] = F2;                                         *
*               dct_io_ptr[3] = F3;                                         *
*               dct_io_ptr[4] = F4;                                         *
*               dct_io_ptr[5] = F5;                                         *
*               dct_io_ptr[6] = F6;                                         *
*               dct_io_ptr[7] = F7;                                         *
*                                                                           *
*               /* ------------------------------------------------ */      *
*               /*  Update pointer to next FDCT row.                */      *
*               /* ------------------------------------------------ */      *
*               dct_io_ptr += 8;                                            *
*           }                                                               *
*                                                                           *
*           return;                                                         *
*       }                                                                   *
*                                                                           *
*                                                                           *
*   TECHNIQUES                                                              *
*       The loop nest in the vertical pass has been collapsed into a        *
*       single-level loop.  Both vertical and horizontal loops have         *
*       been software pipelined.                                            *
*                                                                           *
*       For performance, portions of the code outside the loops have been   *
*       inter-scheduled with the prolog and epilog code of the loops.       *
*       Also, twin stack-pointers are used to accelerate stack accesses.    *
*       Finally, pointer values and cosine term registers are reused        *
*       between the horizontal and vertical loops to reduce the impact of   *
*       pointer and constant reinitialization.                              *
*                                                                           *
*       To save codesize, prolog and epilog collapsing have been performed  *
*       to the extent that it does not impact performance.                  *
*                                                                           *
*       To reduce register pressure and save some code, the horizontal      *
*       loop uses the same pair of pointer registers for both reading and   *
*       writing.  The pointer increments are on the LDs to permit prolog    *
*       and epilog collapsing, since LDs can be speculated.                 *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       Stack is aligned to a double-word boundary.                         *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No bank conflicts occur.                                            *
*                                                                           *
*       The "fdct_data[]" array must be aligned on a double-word (8 byte)   *
*       boundary.                                                           *
*                                                                           *
*       The code requires 4 words of stack space to save Save-On-Entry      *
*       (SOE) registers.                                                    *
*                                                                           *
*       Nearly every cycle of this function performs at least one           *
*       memory access.                                                      *
*                                                                           *
*   NOTES                                                                   *
*       This code is fully interruptible.  Interrupts are blocked only      *
*       in branch delay slots.                                              *
*                                                                           *
*       The cosine terms have all been scaled by sqrt(2), so that the       *
*       "c4" term is basically an even power of 2.                          *
*                                                                           *
*       The code is LITTLE ENDIAN.                                          *
*                                                                           *
*   CYCLES                                                                  *
*       cycles = 50 + 76 * num_fdcts                                        *
*                                                                           *
*       For num_fdcts =  6, cycles = 506.                                   *
*       For num_fdcts = 24, cycles = 1874.                                  *
*                                                                           *
*   CODESIZE                                                                *
*       980 bytes.                                                          *
*                                                                           *
*   SOURCE                                                                  *
*       Chen FDCT.                                                          *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

        .global _IMG_fdct_8x8

* ========================================================================= *
*   End of file:  img_fdct_8x8.h64                                          *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2003 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -