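// File: fdct.asm
//
// Project: ADI BF533 video encoding program
// Language: ASM
// Web listing: page 1 of 2
// (page control: font size)
// (page navigation: previous page, 1, 2)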
   (R7:4,P5:3)=[SP++];     //Pop the registers before returning.
    RTS;                    //Return.
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
 
    
//the next instruction after RTS in the memory.



//-----------------------------------------------------------------------------
// void xhDCT8x8Fwd_8u16s_C1R(unsigned char *pSrc, int srcStep, short *pDst,
//                            Ipp16s *coeff, Ipp16s *temp);
//
// Forward 8x8 DCT of an 8-bit block, producing 16-bit results.
//
// Step 1: eight rows of eight bytes are read from pSrc (rows srcStep bytes
//         apart) and widened to 16-bit values stored contiguously at pDst.
// Step 2: two passes of an 8-point row DCT (Chen's algorithm) are run. Each
//         pass writes its output transposed; the first pass goes pDst -> temp,
//         the second temp -> pDst, so pDst holds the final result.
//
// In:    R0       = pSrc
//        R1       = srcStep
//        R2       = pDst
//        [SP+12]  = coeff   (offsets relative to SP on entry)
//        [SP+16]  = temp
// Saved: R7:4, P5:3, RETS are pushed on entry and popped on exit.
// Uses:  R0-R3, P0-P2, I0-I3, B0-B3, M0, M1, L0-L3, A0, A1, LC0, LC1.
//        L3 is set to 16 while the coefficient table is read circularly and
//        is reset to 0 before returning (L0-L2 are zeroed on entry).
//
// NOTE(review): rows are fetched with 32-bit loads through I0/I1, so pSrc and
// srcStep presumably have to be multiples of 4 -- confirm against callers.
// NOTE(review): the widening loop prefetches one row beyond the eighth; the
// prefetched values are never stored, but the memory is read.
//-----------------------------------------------------------------------------

_xhDCT8x8Fwd_8u16s_C1R:

/******************************* Function Prologue ***************************/
    [--SP] = (R7:4, P5:3);  //Push the preserved registers (7 x 4 bytes).
    [--SP] = RETS;          //Push RETS (4 more bytes, 32 in total).
    B0 = R2;                //pDst: 16-bit working buffer, input of pass 1.
    R7 = [SP+32+12];        //Fourth argument: pointer to coefficients.
    B3 = R7;
    R6 = [SP+32+16];        //Fifth argument: pointer to temporary matrix.
    B2 = R6;

    L0 = 0;                 //Linear addressing for I0, I1 and I2.
    L1 = 0;
    L2 = 0;

/*********************** 8-bit to 16-bit row transfer *************************/
/*
   I0 reads the first four bytes of each source row, I1 the last four bytes.
   Both advance by srcStep (M0) per load. I2 writes the widened values to
   pDst, four 32-bit stores (eight 16-bit samples) per row.
   BYTEUNPACK expands packed bytes from the R1:0 pair into 16-bit halves of a
   register pair; the plain form and the (R) form are used for the two halves
   of a row.
   NOTE(review): the bytes BYTEUNPACK selects also depend on the low bits of
   I0 -- confirm against the instruction reference.
   An earlier byte-at-a-time version of this copy was superseded by this loop.
*/
    I0 = R0;                //I0 -> pSrc
    I2 = R2;                //I2 -> pDst
    M0 = R1;                //M0 = srcStep
    R0 += 4;
    I1 = R0;                //I1 -> pSrc + 4
    R0 = [I0++M0] || R1 = [I1++M0];     //Load row 0 into R1:0.
    P0 = 8;                 //Eight rows.
    LSETUP (transfer8to16_start, transfer8to16_end) LC0 = P0;
    (R7,R6) = BYTEUNPACK R1:0;          //Executed once, before the loop body.
transfer8to16_start:
        (R5,R4) = BYTEUNPACK R1:0(R) || [I2++] = R6;
        R0 = [I0++M0] || [I2++] = R7;   //Prefetch first half of next row.
        (R7,R6) = BYTEUNPACK R1:0 || [I2++] = R4;
transfer8to16_end:
        R1 = [I1++M0] || [I2++] = R5;   //Prefetch second half of next row.

/***************************** DCT initialisation ****************************/
    L3 = 16;                //L3 is set to 16 to make the coefficients
                            //array circular (reset to 0 in the epilogue).
/*
 I0, I1, and I2 registers are used to read the input data. I3 register is used
 to read the coefficients. P0 and P1 registers are used for writing the output
 data.
*/
    M0 = 12 (X);            //All these initializations are used in the
                            //modification of address offsets.
    M1 = 16 (X);
    P2 = 16;
    P3 = 32 (X);
    P4 = -110 (X);          //3*32 + 16 - 110 = +2: P0 ends one column on.
    P5 = -62 (X);           //2*32 - 62 = +2: P1 ends one column on.
    P0 = 2;                 //Two DCT passes.
/*
   According to Chen's algorithm, first 8-point DCT will be calculated for all
   the 8 rows. The output of this calculation is stored in another transpose
   matrix. Now again the 8-point DCT is applied on all the 8 rows. The output
   is stored in matrix in transposed form. This is the final output.
   Therefore, a loop of 2 iterations (DCT_START_8U16S, DCT_END_8U16S) is set.

   B0 points to the "in" buffer and B2 points to "temp" buffer in the first
   iteration. The input is read from "in" buffer and output is written to
   "temp" buffer. In the second iteration B0 points to "temp" and B2 points
   to "in" buffer. The input is read from "temp" buffer and output is written
   to "in" buffer. "in" buffer (pDst here) holds the final output.
*/
    LSETUP (DCT_START_8U16S, DCT_END_8U16S) LC0 = P0;
DCT_START_8U16S:
        I0 = B0;            //I0 points to Input Element (0, 0)
        I1 = B0;            //Element 1 and 0 is read in R0.
        I1 += M0  || R0 = [I0++];
                            //I1 points to Input Element (0, 6)
        I2 = I1;            //Element 6 is read in R3.H
        I2 -= 4   || R3.H = W[I1++];
                            //I2 points to Input Element (0, 4)

        I3 = B3;            //I3 points to Coefficients
        P0 = B2;            //P0 points to temporary array Element (0, 0)
        P1 = B2;            //P1 points to temporary array
        R7 = [P1++P2] || R2 = [I2++];
                            //P1 points to temporary array Element (1, 0)
                            //R7 is a dummy read. Element 4 and 5 are read in R2
        R3.L = W[I1--];     //Element 7 is read in R3.L
        R1.H = W[I0++];     //Element 2 is read in R1.H

//******************************* Implementation of Part 1 ********************
/*
  All the additions from Stage 1 and Stage 2 are implemented in Part 1.
  For the first row it is done here, outside the row loop, as an optimization.
  The following instruction does the following job.
  Element 0 = (Element 0 + Element 7) / 2.
  Element 1 = (Element 1 + Element 6) / 2.
  Element 6 = (Element 1 - Element 6) / 2.
  Element 7 = (Element 0 - Element 7) / 2.
  It reads the data 3 in R1.L.
*/
        R0 = R0 +|+ R3, R3 = R0 -|- R3 (ASR) || R1.L = W[I0++] || NOP;

/*
 This single instruction does the following job.
 Element 2 = (Element 2 + Element 5) / 2.
 Element 3 = (Element 3 + Element 4) / 2.
 Element 4 = (Element 3 - Element 4) / 2.
 Element 5 = (Element 2 - Element 5) / 2.
 It reads the Coefficients C4 = cos(4*pi/16) in register R7.
*/
        R1 = R1 +|+ R2, R2 = R1 -|- R2 (ASR, CO) || NOP ||  R7 = [I3++];

/*
   At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
   R3 has (6,7). Where notation (x, y) means the element from column x is in
   upper half of register and element from column y is in lower half of the
   register.
*/

//******************************* Implementation of Part 2 *********************
/*
   The following addition/subtraction instruction does -
   Element 0 = Element 0 + Element 3.
   Element 1 = Element 1 + Element 2.
   Element 2 = Element 1 - Element 2.
   Element 3 = Element 0 - Element 3.
*/
        R0 = R0 +|+ R1, R1 = R0 -|- R1;

        LSETUP (ROW_START_8U16S, ROW_END_8U16S) LC1 = P2 >> 1;
                            //The loop is set for 8 rows (16 >> 1).
ROW_START_8U16S:
/*
   This is part 2 computation continued.....
   The following two instructions do -
   A1 = Element 6 * cos(pi/4)
   A0 = Element 6 * cos(pi/4)
   A1 = A1 - Element 5 * cos(pi/4)
   A0 = A0 + Element 5 * cos(pi/4).
   The instruction W[I0] = R3.L parks R3.L in memory; it is read back into
   R2.L further down, which packs it into R2.
*/
            A1 = R3.H * R7.L, A0 = R3.H * R7.L  ||  I1 += M1 || W[I0] = R3.L;
            R4.H = ( A1 -= R2.L * R7.L), R4.L = (A0 += R2.L * R7.L) || I2 += M0;
/*
   At the end of stage 2 R0 has (1,0), R1 has (2,3), R4 has (5, 6).
*/
//************************ Implementation of Part 3 ***************************
/*
   The following two instructions do the job of stage 3 -
   A1 = Element 0 * cos(pi/4)
   A0 = Element 0 * cos(pi/4)
   A1 = A1 - Element 1 * cos(pi/4)
   A0 = A0 + Element 1 * cos(pi/4)
   The value of coefficients C2 and C6 are read in register R7.
*/
            A1 = R0.L * R7.L, A0 = R0.L * R7.L ||  NOP  || R3.H = W[I1++];
            R5.H = (A1 -= R0.H * R7.L), R5.L = (A0 += R0.H * R7.L)
            || R7 = [I3++];

/*
   The following three instructions do -
   A1 = Element 2 * cos(3pi/8)
   A0 = Element 3 * cos(3pi/8)
   A1 = A1 + Element 3 * cos(pi/8)
   A0 = A0 - Element 2 * cos(pi/8)
   R3 reads the next row's elements 6 and 7 (R3.H above, R3.L here).
   The value of coefficients C7 and C1 is read in register R7.
   Element 4 = Element 4 + Element 5.
   Element 5 = Element 4 - Element 5.
   Element 6 = Element 7 - Element 6.
   Element 7 = Element 7 + Element 6.
*/
            A1 = R1.H * R7.L, A0 = R1.L * R7.L ||  W[P0++P3] = R5.L
            || R2.L = W[I0];
            R2 = R2 +|+ R4, R4 = R2 -|- R4  ||  I0 += 4 || R3.L = W[I1--];
            R6.H = (A1 += R1.L * R7.H), R6.L = (A0 -= R1.H * R7.H) ||  I0 += 4
            || R7 = [I3++];

/*
   At the end of part 3 R2 has (4, 7), R4 has (5,6), R5 has (1, 0) and
   R6 has (2,3).
*/
//****************************** Implementation of Part 4 **********************
/*
   The following two instructions do -
   A1 = Element 4 * cos(7pi/16)
   A0 = Element 7 * cos(7pi/16)
   A1 = A1 + Element 7 * cos(pi/16)
   A0 = A0 - Element 4 * cos(pi/16)
   The value of next coefficients are read, and the registers are written to
   their locations.
*/
            A1 = R2.H * R7.L, A0 = R2.L * R7.L ||  W[P0++P3] = R6.H
            || R0 = [I0++];
            R2.H = ( A1 += R2.L * R7.H), R2.L = ( A0 -= R2.H * R7.H)
            || W[P0++P3] = R5.H || R7 = [I3++];
/*
   The following two instructions do -
   A1 = Element 5 * cos(3pi/16)
   A0 = Element 6 * cos(3pi/16)
   A1 = A1 + Element 6 * cos(5pi/16)
   A0 = A0 - Element 5 * cos(5pi/16)
   The output values are written.
*/
            A1 = R4.H * R7.H, A0 = R4.L *R7.H ||  W[P0++P2] = R6.L
            || R1.H = W[I0++];
            R4.H = (A1 += R4.L * R7.L), R4.L = ( A0 -= R4.H * R7.L)
            ||  W[P0++P4] = R2.L || R1.L = W[I0++];

//******************* Implementation of Part 1 *****************************
/*
  This is the same part as part 1 specified earlier. The first time the part 1
  calculation is done outside the loop, afterwards it is done here. It serves
  two purposes. First it computes part 1 for the next data, and it writes the
  data 5 and 6 to their bit reversed order in transpose way.
*/
            R0 = R0 +|+ R3, R3 = R0 -|- R3 (ASR) || W[P1++P3] = R2.H
            || R2 = [I2++];
            R1 = R1 +|+ R2, R2 = R1 -|- R2 (ASR, CO) ||  W[P1++P3] = R4.L
            || R7 = [I3++];
ROW_END_8U16S:
            R0 = R0 +|+ R1, R1 = R0 -|- R1 ||  W[P1++P5] = R4.H || NOP;

        B1 = B0;            //Swapping of Input and output address pointers
        B0 = B2;            //B0 points to input buffer.
DCT_END_8U16S:
        B2 = B1;            //B2 points to output buffer.

/******************************* Function Epilogue ***************************/
_xhDCT8x8Fwd_8u16s_C1R.END:
    L3 = 0;                 //Leave I3 in linear mode for the C caller; the
                            //length register was 16 for the circular table.
    RETS = [SP++];
    (R7:4, P5:3) = [SP++];  //Pop the registers before returning.
    RTS;                    //Return.
    NOP;
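// (page navigation: previous page, 1, 2)
// Keyboard shortcuts of the web viewer:
//
//   Copy code          Ctrl + C
//   Search code        Ctrl + F
//   Full screen        F11
//   Toggle theme       Ctrl + Shift + D
//   Show shortcuts     ?
//   Increase font size Ctrl + =
//   Decrease font size Ctrl + -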