📄 inverse discrete cosine transform.txt

📁 c6000的应用程序比较常用比如说fft ifft等一些原文件
💻 TXT
📖 第 1 页 / 共 5 页
字号:
        ; ------------------------------------------------------------------
        ; Symbolic register names (.asg substitution symbols) for the
        ; signal-flow-graph nodes, partial products and outputs.
        ; The A_/B_ prefix matches the register file (A or B) of the
        ; physical register each symbol maps to.  Several symbols share one
        ; physical register (e.g. A15 backs A_h1, A_h0 and A_x5), so those
        ; values presumably have non-overlapping lifetimes in the schedule
        ; -- NOTE(review): confirm before re-assigning any of them.
        ; ------------------------------------------------------------------
        .asg            B1,         B_X6c6  ; X6 * c6
        .asg            A6,         A_P0    ; Node P0 in signal flow graph
        .asg            B8,         B_P1    ; Node P1 in signal flow graph
        .asg            A8,         A_p0    ; Node p0 in signal flow graph
        .asg            A0,         A_p1    ; Node p1 in signal flow graph
        .asg            B0,         B_r1    ; Node r1 in signal flow graph
        .asg            B4,         B_r0    ; Node r0 in signal flow graph
        .asg            B7,         B_g0    ; Node g0 in signal flow graph
        .asg            B3,         B_g1    ; Node g1 in signal flow graph
        .asg            A15,        A_h1    ; Node h1 in signal flow graph
        .asg            A15,        A_h0    ; Node h0 in signal flow graph
        ; --- Odd-coefficient partial products (Xn * ck) ---
        .asg            A3,         A_X1c1  ; X1 * c1
        .asg            A0,         A_X1c3  ; X1 * c3
        .asg            A3,         A_X1c5  ; X1 * c5
        .asg            A9,         A_X1c7  ; X1 * c7
        .asg            A9,         A_X3c1  ; X3 * c1
        .asg            A0,         A_X3c3  ; X3 * c3
        .asg            A5,         A_X3c5  ; X3 * c5
        .asg            A5,         A_X3c7  ; X3 * c7
        .asg            B0,         B_X5c1  ; X5 * c1
        .asg            B4,         B_X5c3  ; X5 * c3
        .asg            B3,         B_X5c5  ; X5 * c5
        .asg            B6,         B_X5c7  ; X5 * c7
        .asg            B0,         B_X7c1  ; X7 * c1
        .asg            B3,         B_X7c3  ; X7 * c3
        .asg            B9,         B_X7c5  ; X7 * c5
        .asg            B1,         B_X7c7  ; X7 * c7
        ; --- Partial sums feeding nodes g2, g3, h3 and h2 ---
        .asg            A7,         A_g2a   ; X1 * c7 - X3 * c5
        .asg            B8,         B_g2b   ; X5 * c3 - X7 * c1
        .asg            A6,         A_g2    ; Node g2 in signal flow graph
        .asg            A3,         A_g3a   ; X1 * c5 - X3 * c1
        .asg            B6,         B_g3b   ; X5 * c7 + X7 * c3
        .asg            A4,         A_g3    ; Node g3 in signal flow graph
        .asg            A6,         A_h3a   ; X1 * c3 - X3 * c7
        .asg            B7,         B_h3b   ; X5 * c1 + X7 * c5
        .asg            B5,         B_h3n   ; Node h3, negated.
        .asg            A0,         A_h2a   ; X1 * c1 + X3 * c3
        .asg            B3,         B_h2b   ; X5 * c5 + X7 * c7
        .asg            B1,         B_h2    ; Node h2 in signal flow graph
        ; --- Outputs before truncation ---
        .asg            B4,         B_x0    ; Output x0, pre-truncation
        .asg            B0,         B_x1    ; Output x1, pre-truncation
        .asg            A4,         A_x2    ; Output x2, pre-truncation
        .asg            A4,         A_x3    ; Output x3, pre-truncation
        .asg            A7,         A_x4    ; Output x4, pre-truncation
        .asg            A15,        A_x5    ; Output x5, pre-truncation
        .asg            B6,         B_x6    ; Output x6, pre-truncation
        .asg            B3,         B_x7    ; Output x7, pre-truncation
        ; --- Outputs after truncation to 16 bits ---
        .asg            B4,         B_x0t   ; Output x0, truncated to 16 bits
        .asg            B5,         B_x1t   ; Output x1, truncated to 16 bits
        .asg            A4,         A_x2t   ; Output x2, truncated to 16 bits
        .asg            A8,         A_x3t   ; Output x3, truncated to 16 bits
        .asg            A7,         A_x4t   ; Output x4, truncated to 16 bits
        .asg            A5,         A_x5t   ; Output x5, truncated to 16 bits
        .asg            B3,         B_x6t   ; Output x6, truncated to 16 bits
        .asg            B9,         B_x7t   ; Output x7, truncated to 16 bits
        .asg            A2,         A_i     ; Inner-loop counter.
; ============================================================================

* ========================================================================= *
*   Initialization code for horizontal loop:  Saves registers to            *
*   the stack, sets up cosine terms, pointers and loop control.             *
*                                                                           *
*   The stack frame for this code is 16 words large.  It holds the Save     *
*   on Entry (SOE) registers A10..A15, B10..B14, as well as the return      *
*   address (B3), CSR, IRP, and a single spill value.  (The loop counter    *
*   initializer is shared between both loops and so I spill it to the       *
*   stack.)  I twin the stack pointer to speed up stack accesses.  The      *
*   stack frame layout is slightly funky to avoid bank conflicts while      *
*   allowing me to get to everything when I need it most.                   *
*                                                                           *
*   The horizontal loop starts at the end of the IDCT array and works back  *
*   towards the beginning.  As a result, the input and output pointers are  *
*   initialized like so:                                                    *
*                                                                           *
*    -- A_i_ptr is set to point to the coefficients "X0" and "X1" in the    *
*       last row of the last valid IDCT block in the input.  B_i_ptr is     *
*       set to point to the coefficients "X4" and "X5" in that same row.    *
*                                                                           *
*    -- A_o_ptr is set to point to the coefficient "x4" in the rightmost    *
*       column of the scratch block I require at the end of the array.      *
*       B_o_ptr is set to point to "x3" in that same column.                *
*                                                                           *
*   The loop count is simply the number of IDCTs times 8, minus 1 to        *
*   handle the parallel iterations in the kernel.  (It would've been more,  *
*   except that I've performed some limited prolog and epilog collapsing,   *
*   so I need to iterate the kernel more times.)  A happy coincidence       *
*   gives both horizontal and vertical loops the exact same trip count,     *
*   so I spill this value to the stack and simply restore it unchanged      *
*   for the second loop, rather than recalculating it.                      *
*                                                                           *
*   Since I was able to free up a single predication register in the first  *
*   loop, I prolog-collapsed one stage of the prolog.  I use A1 as my       *
*   prolog-collapse fuse.  To save an MVK (since this code bottlenecks      *
*   heavily on S units), I initialize it to -1 with an OR, rather than a    *
*   more traditional 1.                                                     *
*                                                                           *
*   Both loops use all 32 registers, so I have saved the stack pointer in   *
*   IRP.  This is safe since interrupts are explicitly disabled for the     *
*   entire function.                                                        *
*                                                                           *
*   Note:  This setup code could possibly be a cycle or two faster.  For    *
*   instance, I could copy B15 to A15 before the decrement and use          *
*   negative indexes for the STWs through A15, saving a whole cycle on      *
*   the stack saves.  The resulting code doesn't pack as nicely, though.    *
* ========================================================================= *

;-
; ----------------------------------------------------------------------------
; Entry sequence: allocate the stack frame, disable interrupts, save the
; save-on-entry registers, and derive the loop count and the input/output
; pointers for the horizontal pass.
;
; Inputs as used here (NOTE(review): argument meaning is inferred from the
; in-file comments; confirm against the function header):
;   A4 - base address that the end-of-array / scratch pointer is built from
;   B4 - number of IDCT blocks (shifted left by 3 for the loop count and by
;        7 for the byte offset to the end of the array)
;
; Frame slots written by this code, as word indexes from B15 after the
; 16-word frame has been allocated:
;   [16] A15 (stored at the pre-decrement B15)
;   [13] A14   [12] B14   [11] B13   [10] A13
;   [ 9] A12   [ 8] B12   [ 7] A11   [ 6] B11
;   [ 5] A10   [ 4] B10
;   [ 3] CSR as read on entry (before GIE is cleared)
;   [ 2] B3 (return address)
;   [ 1] loop-count initializer (B_o)
; Slot [14] is written later, in h_prolog, with the IRP value read here.
;
; Lines joined by "||" execute in the same cycle; do not reorder them
; without re-checking the schedule.
; ----------------------------------------------------------------------------
        STW     .D2T1   A15,        *B15--[16]      ; Save A15 at old SP, then SP -= 16 words
||      MVC     .S2     CSR,        B0              ; B0 = current CSR

        AND     .L2     B0,         -2,         B1  ; B1 = CSR with bit 0 (GIE) cleared
||      MV      .L1X    B15,        A15             ; Twin the stack pointer on the A side

        STW     .D1T1   A14,        *+A15 [13]      ; Save SOE reg A14
||      STW     .D2T2   B14,        *+B15 [12]      ; Save SOE reg B14
||      MV      .L1X    B0,         A0              ; Copy original CSR to A side for the later save
||      MVC     .S2     B1,         CSR             ; Interrupts disabled here

;-
        STW     .D1T1   A13,        *+A15 [10]      ; Save SOE reg A13
||      STW     .D2T2   B13,        *+B15 [11]      ; Save SOE reg B13

        STW     .D1T1   A12,        *+A15 [ 9]      ; Save SOE reg A12
||      STW     .D2T2   B12,        *+B15 [ 8]      ; Save SOE reg B12

        STW     .D1T1   A11,        *+A15 [ 7]      ; Save SOE reg A11
||      STW     .D2T2   B11,        *+B15 [ 6]      ; Save SOE reg B11
||      SHL     .S2     B4,         3,      B_o     ; B_o = IDCTs * 8 (loop counter base)
||      OR      .L1     A1,         -1,     A1      ; A1 = -1: prolog collapse counter

;-
        STW     .D1T1   A10,        *+A15 [ 5]      ; Save SOE reg A10
||      STW     .D2T2   B10,        *+B15 [ 4]      ; Save SOE reg B10
||      SHL     .S2     B4,         7,      B4      ; B4 = IDCTs * 128: byte offset to end of array
||[B_o] SUB     .L2     B_o,        1,      B_o     ; Loop count = IDCTs*8 - 1 (left at 0 if 0)

        STW     .D2T2   B3,         *+B15 [ 2]      ; Remember the return addr
||      STW     .D1T1   A0,         *+A15 [ 3]      ; Remember the CSR state (as read on entry)
||      ADD     .L2X    A4,         B4,     B4      ; B4 = A4 + offset: end of array / scratch area
||      MVC     .S2     IRP,        B0              ; B0 = current IRP, read before IRP is overwritten

;-
        STW     .D2T2   B_o,        *+B15 [ 1]      ; Spill our loop count init
||      MVC     .S2     B15,        IRP             ; Save stack ptr in IRP
||      SUB     .L2     B4,         8,      B_i_ptr ; Point to X5X4, row 7 (8 bytes before end)
||      MV      .L1X    B4,         A_o_ptr         ; A_o_ptr = scratch-area base (offset added below)
||      MVK     .S1     7,          A_i             ; Set up inner loop counter

        SUB     .L1X    B_i_ptr,    8,      A_i_ptr ; Point to X1X0, row 7 (8 bytes before X5X4)
||      ADDAH   .D2     B4,         31,     B_o_ptr ; Point to x3, col 7 (base + 31 halfwords)
||      ADDK    .S1     78,         A_o_ptr         ; Point to x4, col 7 (base + 78 bytes)
;-
; ============================ PIPE LOOP PROLOG ==============================
; ----------------------------------------------------------------------------
; h_prolog: primes the software-pipelined horizontal loop.
;   - Issues the first iteration's four coefficient-pair loads.  They are
;     predicated on B_o, so they are skipped when the loop count is zero;
;     in that case the branch to idct_8x8_abort is taken instead.
;   - Builds the packed cosine constants with MVK (low half) followed by
;     MVKLH (high half): A_c3c1, A_c7c5, B_c6c2, B_c7c5; B_c3c1 is copied
;     from A_c3c1.
;   - Stores B0 (the IRP value read by the setup code) into frame slot 14.
;   - Starts the first multiplies and extracts of iteration 1.
; cst_c1..cst_c7, kq_a and kq_b are defined outside this excerpt.
; The "[c,i]" tags are kept from the original schedule annotations
; (presumably [cycle, iteration] -- TODO confirm).
; ----------------------------------------------------------------------------
h_prolog:
  [ B_o]LDW     .D1T1   * A_i_ptr--[4],         A_X1X0          ;[ 1,1] load X1X0, then step back 4 words
||[ B_o]LDW     .D2T2   *+B_i_ptr[1],           B_X7X6          ;[ 1,1] load X7X6 (word after X5X4)
||      MVK     .S1     cst_c1,     A_c3c1                      ; c1 -> low half of A_c3c1
||[!B_o]B       .S2     idct_8x8_abort          ; Abort if num_idcts == 0

  [ B_o]LDW     .D1T1   *+A_i_ptr[5],           A_X3X2          ;[ 2,1] load X3X2 (5 words past the stepped ptr)
||[ B_o]LDW     .D2T2   * B_i_ptr--[4],         B_X5X4          ;[ 2,1] load X5X4, then step back 4 words
||      MVK     .S1     cst_c5,     A_c7c5                      ; c5 -> low half of A_c7c5
||      MVK     .S2     cst_c2,     B_c6c2                      ; c2 -> low half of B_c6c2
;-
        STW     .D1T2   B0,         *A15[14]                    ; save IRP (read earlier into B0)

        MVKLH   .S1     cst_c7,     A_c7c5                      ; c7 -> high half of A_c7c5
||      MVKLH   .S2     cst_c6,     B_c6c2                      ; c6 -> high half of B_c6c2

        MVKLH   .S1     cst_c3,     A_c3c1                      ; c3 -> high half of A_c3c1
||      MVK     .S2     cst_c5,     B_c7c5                      ; c5 -> low half of B_c7c5

        MPYH    .M1     A_X1X0,     A_c7c5,     A_X1c7          ;[ 6,1] X1 * c7 (high x high)
||      MPYLH   .M2     B_X7X6,     B_c6c2,     B_X6c6          ;[ 6,1] X6 * c6 (low x high)
||      MVKLH   .S2     cst_c7,     B_c7c5                      ; c7 -> high half of B_c7c5

; ===== Branch Occurs =====
; (The abort branch issued in the first prolog packet takes effect here,
;  after its delay slots; everything above has already executed.)
;-
        EXT     .S1     A_X1X0,     kq_a, kq_b, A_P0            ;[ 7,1] P0 = field of X1X0 selected by kq_a/kq_b
||      MPY     .M1X    A_X3X2,     B_c6c2,     A_X2c2          ;[ 7,1] X2 * c2 (low x low)
||      MPYHL   .M2     B_X7X6,     B_c7c5,     B_X7c5          ;[ 7,1] X7 * c5 (high x low)
||      MV      .L2X    A_c3c1,     B_c3c1                      ; copy packed c3c1 to the B side

        ADDK    .S1     256,        A_P0                        ;[ 8,1] P0 += 256 (presumably rounding -- confirm)
||      EXT     .S2     B_X5X4,     kq_a, kq_b, B_P1            ;[ 8,1] P1 = field of X5X4 selected by kq_a/kq_b
||      MPYHL   .M1     A_X1X0,     A_c3c1,     A_X1c1          ;[ 8,1] X1 * c1 (high x low)
||      MPYH    .M2     B_X7X6,     B_c7c5,     B_X7c7          ;[ 8,1] X7 * c7 (high x high)
;-
;-
; ============================ PIPE LOOP KERNEL ==============================
h_loop:
h_loop_0:
        SUB     .L2     B_g1,       B_h3n,      B_x1            ;[19,1] 
||      STH     .D2T2   B_x0t,      *-B_o_ptr[24]               ;[19,1]
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -