📄 idct.asm

📁 BF533上移植Xvid编解码中的DCT部分（AD汇编）
💻 ASM
字号:

	// Static data for the IDCT wrapper below.
	//
	// _coeff    : eight 16-bit cosine constants; the values match
	//             round(cos(x) * 32768), i.e. signed 1.15 fractions.
	//             _r8x8invdct fetches them two at a time with 32-bit loads
	//             (R7 = [I3++]) through a 16-byte circular buffer, so the
	//             pair order below matters:
	//                 word 0 = (C6, C2)  word 1 = (cos 3pi/16, cos 5pi/16)
	//                 word 2 = (cos 7pi/16, cos pi/16)  word 3 = (C4, C4)
	// _temp_dct : 8x8 scratch block holding the transposed result of the
	//             first pass; it is the input of the second pass.
	//
	// Both objects are forced to a 4-byte boundary: _coeff is read with 32-bit
	// loads, and _temp_dct is read in the second pass by the 32-bit dummy load
	// R1 = [I1++M1].  Without an explicit .align the placement of these
	// 16-bit arrays would be left to the assembler/linker defaults.
	.section data1;	
	.align 4;
	.byte2 _coeff[8] = {
	0x30FB,           // cos(3pi/8)   C6
	0x7641,           // cos(pi/8)    C2
	0x6A6D,           // cos(3pi/16)
	0x471C,           // cos(5pi/16)
	0x18F8,           // cos(7pi/16)
	0x7D8A,           // cos(pi/16)
	0x5a82,           // cos(pi/4)    C4
	0x5a82            // cos(pi/4)    C4 (duplicated to fill the 32-bit pair)
	};
	
	.align 4;
	.byte2 _temp_dct[64];
	
		
	.section program;

	.global _idct_asm;

// _idct_asm
//   Entry point that runs the 8x8 inverse DCT on the block addressed by R0.
//   R0 is handed to _r8x8invdct untouched; R1 and R2 are loaded here with the
//   addresses of the file-local coefficient table and scratch buffer.
//   R7:1 and P5:0 are saved on entry and restored before returning.
//
//   NOTE(review): every call shares the single static _temp_dct buffer, and
//   the CLI/STI pair that once bracketed the call is disabled in the original
//   source - confirm this routine is never re-entered (e.g. from an interrupt).
_idct_asm:
	LINK 0;
	[--SP] = (R7:1, P5:0);

	// R2 -> scratch block (third argument of _r8x8invdct)
	R2.H = _temp_dct;
	R2.L = _temp_dct;

	// R1 -> coefficient table (second argument of _r8x8invdct)
	R1.H = _coeff;
	R1.L = _coeff;

	// R0 already holds the in/out block pointer supplied by the caller.
	CALL _r8x8invdct;

	(R7:1, P5:0) = [SP++];
	UNLINK;
	RTS;
_idct_asm.end:

/*

Prototype       : void _r8x8invdct(fract16 *in, fract16 *coeff, fract16 *temp);

                  *in    -> Pointer to the 8x8 input block (arrives in R0).
                            The final output is written back to this buffer.
                  *coeff -> Pointer to the 8 coefficients (arrives in R1).
                  *temp  -> Pointer to an 8x8 temporary block (arrives in R2).

Alignment       : coeff is read with 32-bit loads (R7 = [I3++]) and the row
                  pointer I1 is advanced by a 32-bit dummy load
                  (R1 = [I1++M1]) issued on both the "in" and the "temp"
                  buffer.  NOTE(review): all three buffers therefore need to
                  be 4-byte aligned - confirm every caller guarantees this.

Registers Used  : A0, A1, R0-R7, I0-I3, B0-B3, M1-M3, L0-L3, P0-P5, LC0,
                  LC1.  R1-R7, P0-P5, the loop registers, I/B/L/M registers
                  and both accumulators are saved in the prologue and restored
                  in the epilogue.  R0 is not saved.

Performance     : (figures as given by the original header; not re-measured
                   for this variant with the extended prologue/epilogue -
                   TODO confirm)
                    Code Size   : 340 Bytes.
                    Cycle Count : 319 Cycles.
*/

.global     _r8x8invdct;
.align      8;
    
_r8x8invdct:

/******************** Function Prologue ***************************************/
    link 0;  // Pushing the registers on stack.
    [ -- sp ] = (r7:1, p5:0);
    // Hardware loop registers of both loops (restored in reverse order below).
    [--SP]=lc0;
	[--SP]=lt0;
	[--SP]=lb0;
	[--SP]=lc1;
	[--SP]=lt1;
	[--SP]=lb1;
	
	// Index, base, length and modify registers.
	[-- sp ] = i0; [-- sp ] = i1; [-- sp ] = i2; [-- sp ] = i3;
	[-- sp ] = b0; [-- sp ] = b1; [-- sp ] = b2; [-- sp ] = b3;
	[-- sp ] = l0; [-- sp ] = l1; [-- sp ] = l2; [-- sp ] = l3;
	[-- sp ] = m0; [-- sp ] = m1; [-- sp ] = m2; [-- sp ] = m3;
	
	// Both accumulators (extension and word parts).
	[--sp]=A0.x;[--sp]=A0.W;[--sp]=A1.X;[--sp]=A1.W;
	
    B0 = R0;                // Pointer to Input matrix.
    B3 = R1;                // Pointer to Coefficients
    B2 = R2;                // Pointer to Temporary matrix.
    L0 = 0;                 // L0-L2 = 0: I0-I2 are used as linear pointers
    L1 = 0;                 // --------- ditto --------
    L2 = 0;                 // --------- ditto --------
    L3 = 16;                // L3 = 16 bytes (8 coefficients x 2 bytes): makes
                            // the coefficient array circular through I3/B3.
    
    M1 = 16 (X);            // Byte step added to I0/I1/I2 once per row.
    M2 = 7 (X);             // Row-loop count (copied to P2 for LC1 below).
    M3 = 8(X);              // Byte offset from element 0 to element 4 of a row.
    P2 = 16;                // P2/P3/P4/P5 are the post-modify steps of the
    P3 = 32 (X);            // output pointers.  P0: 32+32+32+16-110 = +2 and
    P4 = -110 (X);          // P1: 32+32-62 = +2, so after the stores of one
    P5 = -62 (X);           // input row each pointer ends one halfword further.
    P0 = 2;                 // Two passes (count for LC0).
    NOP;
    
/*
*   According to Chen's algorithm, first the 8-point IDCT is calculated for all
*   the 8 rows. The output of this calculation is stored in another matrix in
*   transposed form. Then the 8-point IDCT is applied again on all the 8 rows
*   and the output is once more stored transposed. This is the final output.
*   Therefore a loop of 2 iterations (IDCT_START, IDCT_END) is set.
*
*   B0 points to the "in" buffer and B2 points to the "temp" buffer in the first
*   iteration. The input is read from the "in" buffer and the output is written
*   to the "temp" buffer. In the second iteration B0 points to "temp" and
*   B2 points to the "in" buffer. The input is read from the "temp" buffer and
*   the output is written to the "in" buffer, which holds the final output.
*/
    
    LSETUP(IDCT_START, IDCT_END) LC0 = P0;
IDCT_START: 
        I0 = B0;            // I0 points to Input Element (0, 0)
        I2 = B0;            // I2 points to Input Element (0, 0)
        I2 += M3 || R3.L = W[I0];
                            // Element 0 is read in R3.L 
        I1 = I2;            // I1 = I2; moved on to input Element (0, 6) by the
                            // "I1 += 4" on the next line
        I1 += 4  || R3.H = W[I2++];
                            // I2 points to input Element (0, 4) 
                            // Element 4 is read in R3.H
        I3 = B3;            // I3 points to Coefficients
        P0 = B2;            // P0 points to array Element (0, 0) for writing 
                            // output
        P1 = B2;
        R7.L = 0x5a82;      // R7.L holds the coefficient C4 (immediate; I3
                            // still points at the first coefficient pair).
        P1 = P1 + P2;       // P1 points to array element (1, 0) for writing 
                            // output
    
/********************** Implementation of Part 1 ******************************/
    
/*
*   The following operation is done in 2 instructions.
*   A1 = Element 0 * cos(pi/4) 
*   A0 = Element 0 * cos(pi/4)
*   A1 = A1 - Element 4 * cos(pi/4)
*   A0 = A0 + Element 4 * cos(pi/4)
*   At the same time the values of Element 2 and 6 are read in R1.H and R1.L
*   respectively, and R7 is loaded with the first coefficient pair.
*/
    
        A1 = R3.L * R7.L, A0 = R3.L * R7.L  || I0 += 4  || R1.L = W[I1++];
        R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L) 
        || R1.H = W[I0++] || R7 = [I3++];
    
/*
*   The following two instructions do -
*   A1 = Element 2 * cos(3pi/8) 
*   A0 = Element 6 * cos(3pi/8)
*   A1 = A1 - Element 6 * cos(pi/8)
*   A0 = A0 + Element 2 * cos(pi/8)
*   R2 reads the input elements (5, 3).
*   R7 reads the coefficient values C5 and C3.
*/
    
        A1= R1.H * R7.L, A0 = R1.L * R7.L || I0 -= 4 || R2.L = W[I0];
        R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || R2.H = W[I2--]
        || R7 = [I3++];
    
/*
*   The following three instructions do -
*   A1 = Element 5 * cos(3pi/16) 
*   A0 = Element 3 * cos(3pi/16)
*   A1 = A1 - Element 3 * cos(5pi/16)
*   A0 = A0 + Element 5 * cos(5pi/16)
*   Element 0 = (Element 0 + Element 6) / 2.
*   Element 4 = (Element 4 + Element 2) / 2.
*   Element 2 = (Element 4 - Element 2) / 2.
*   Element 6 = (Element 0 - Element 6) / 2.
*   The write W[P0] = R5.L is done for packing purposes: the output location is
*   used as scratch so that the register halves holding elements 4 and 6 can be
*   swapped (it is read back with R3.H = W[P0] in Part 2).
*/
    
        A1 = R2.H * R7.L, A0 = R2.L * R7.L  || R0.H = W[I0--]   || NOP;
        R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || R0.L = W[I1--]  || NOP;
        R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
        || W[P0] = R5.L || R7 = [I3++];
    
/*
*   At the end of part 1 R0 has (1, 7), R5 has (2, 6), R2 has (5, 3) and
*   R3 has (4, 0). 
*   The notation (x, y) means the element from column x is in the upper half of
*   the register and the element from column y is in the lower half.
*/
    
// The loop is set for 7 rows. The last row is computed separately after the
// loop.  P2 only carries the count into LC1; it is reloaded with 16 before
// the loop body starts because the body uses it as a store step
// (W[P0++P2]).  That reload sits between LSETUP and R0W_START, so it runs
// once and is not part of the loop.
  P2 = M2;

        LSETUP (R0W_START, R0W_END) LC1 = P2;
P2 = 16;        
R0W_START:
    
/****************** Implementation of Part 2 **********************************/
/*
*   The following two instructions do the following job -
*   A1 = Element 1 * cos(7pi/16) 
*   A0 = Element 7 * cos(7pi/16)
*   A1 = A1 - Element 7 * cos(pi/16)
*   A0 = A0 + Element 1 * cos(pi/16)
*   The read to R1 is a dummy; it is issued only to advance I1 by M1 to the
*   next row (R1 is reloaded before it is used again).
*   R7 reads the next coefficient pair for Part 3.  With the coefficient
*   order used by the _coeff table of this file and the 16-byte circular
*   buffer, this is the fourth pair, (C4, C4).
*   W[P1] = R3.H parks R3.H in the output buffer for the half-register swap.
*/
    
            A1 = R0.H * R7.L, A0 = R0.L * R7.L  || R1 = [I1++M1] 
            || W[P1] = R3.H;
            R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H)  || I0 += M1 
            || R7 = [I3++];
    
/*
*   The following single instruction operates on 4 data as -
*   Element 1 = (Element 1 + Element 5) / 2.
*   Element 5 = (Element 1 - Element 5) / 2.
*   Element 3 = (Element 7 - Element 3) / 2.
*   Element 7 = (Element 7 + Element 3) / 2.
*/
    
            R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0]
            || R1.L = W[I1++];
    
/*
*   At the end of part 2 R0 has (1, 7), and R2 has (5, 3).
*   The registers R3.H and R5.L are being swapped through W[P0] / W[P1].
*/
    
/******************** Implementation of Part 3 ********************************/
/*
*  The following instruction does the following job.
*  Element 0 = Element 0 + Element 7.
*  Element 7 = Element 7 - Element 0.
*  Element 6 = Element 6 + Element 1.
*  Element 1 = Element 6 - Element 1.
*  The elements 0, 1, 6 and 7 are final.
*  In parallel, I2 is advanced to the next row and Element 0 of the next row
*  is read into R3.L.
*/
    
            R4 = R3 +|+ R0, R0 = R3 -|- R0 || I2 += M1 || R3.L = W[I0];
    
/*
*  The following two instructions do -
*  A1 = Element 3 * cos(pi/4) 
*  A0 = Element 3 * cos(pi/4)
*  A1 = A1 - Element 5 * cos(pi/4)
*  A0 = A0 + Element 5 * cos(pi/4)
*  R5.L = W[P1] completes the R3.H / R5.L swap; R3.H and R1.H are preloaded
*  with data of the next row.
*/
    
            A1 = R2.L * R7.L, A0 = R2.L * R7.L || I0 += 4 || R3.H = W[I2++];
            R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L)
            || R5.L = W[P1] || R1.H = W[I0++];
    
/*
*   At the end of part 3 R0 has (1, 7), and R2 has (5, 3), R4 has (6, 0) 
*   and R5 has (2, 4). Registers R4 and R0 hold final output.
*/
    
/******************** Implementation of Part 4 ********************************/
/*
*  It is the final stage computation.
*  Element 4 = Element 4 + Element 3.
*  Element 2 = Element 2 + Element 5.
*  Element 5 = Element 5 - Element 2.
*  Element 3 = Element 3 - Element 4.
*/
    
            R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L  
            || R2.L = W[I0--];
                            //R5 = 2, 4 and R6 = 5, 3 final
/*
*   At the end of part 4 R0 has (1, 7), and R4 has (6, 0), R5 has (2, 4) 
*   and R6 has (5, 3). All the registers hold final output.
*/
    
/**************************** Implementation of Part 1 
**********************************/
/*
*  This is the same part as part 1 specified earlier. The first time, the
*  part 1 calculation is done outside the loop; afterwards it is done here for
*  the next row. It serves two purposes: it computes part 1 of the next row,
*  and it writes the finished data 2, 1, 3, 7, 4, 6 and 5 of the current row
*  to their bit-reversed positions in transposed order.
*  The first R2.H load below (from I0) only steps I0 back; R2.H is loaded
*  again from I2 before it is used.
*  W[P0] = R5.L cannot share the last instruction (its store slot is taken by
*  W[P1++P5] = R6.H), so it is issued on its own just before R0W_END.
*/
            A1 = R3.L * R7.L, A0 = R3.L * R7.L || W[P0++P3] = R5.H 
            || R2.H = W[I0--];
            R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L) 
            || W[P0++P3] = R0.H || R7 = [I3++];
            A1= R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P2] = R6.L  
            || R2.H = W[I2--];
            R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) 
            || W[P0++P4] = R0.L || R7 = [I3++];
            A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P1++P3] = R5.L 
            || R0.H = W[I0--];
            R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || W[P1++P3] = R4.H 
            || R0.L = W[I1--];
            W[P0] = R5.L;
R0W_END:    R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
            || W[P1++P5] = R6.H || R7 = [I3++];
    
/*
*  The computation for 7 rows is over. The last row computation is done here.
*  This is the same as parts 2, 3 and 4 done inside the loop, without the
*  loads for a following row.
*/
    
        A1 = R0.H * R7.L, A0 = R0.L * R7.L  || NOP  || W[P1] = R3.H; 
        R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || R7 = [I3++];
        R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0];
        R4 = R3 +|+ R0, R0 = R3 -|- R0;
        A1 = R2.L * R7.L, A0 = R2.L * R7.L;
        R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1];
        R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || NOP;
    
        W[P0++P3] = R5.H;   // The last outputs are written here.
        W[P0++P3] = R0.H;
        W[P0++P2] = R6.L;
        W[P0++P4] = R0.L;
        W[P1++P3] = R5.L;
        W[P1++P3] = R4.H;
        W[P1++P5] = R6.H;
    
        // Swap the roles of the two buffers for the next pass, using B1 as
        // the temporary.
        B1 = B2;            // B1 = buffer that was just written
        B2 = B0;            // next pass writes into the buffer just read
IDCT_END: 
        B0 = B1;            // next pass reads from the buffer just written
    
// Function epilogue: everything pushed in the prologue is popped in exactly
// the reverse order.  (No branch to TERMINATE is visible in this file.)
TERMINATE:

	A1.W=[sp++]; A1.X=[sp++]; A0.W=[sp++];A0.X=[sp++];

	m3=[sp++];m2=[sp++];m1=[sp++];m0=[sp++];
	l3=[sp++];l2=[sp++];l1=[sp++];l0=[sp++];
	b3=[sp++];b2=[sp++];b1=[sp++];b0=[sp++];
	i3=[sp++];i2=[sp++];i1=[sp++];i0=[sp++];

    lb1=[SP++];
	lt1=[SP++];
	lc1=[SP++];
	
	lb0=[SP++];
	lt0=[SP++];
	lc0=[SP++];

	(r7:1, p5:0) = [ sp ++ ];
	unlink;
	RTS;
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
                            //the next instruction after RTS in the memory.
_r8x8invdct.end:
💿 文件大小 17 K
👤 上传用户 pipinooad
📂 所属分类 DSP编程
📄 代码行数 346 行
💻 语言类型 ASM
🏷️ 相关标签

#Xvid #533 #DCT #BF
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -