ppc_zoom_ultimate.s
来自「linux下的MPEG1」· S 代码 · 共 324 行
S
324 行
; PowerPC optimized zoom for Goom
; © 2001-2003 Guillaume Borios
; This Source Code is released under the terms of the General Public License
;
; Syntax : Apple/Mach-O PowerPC assembler (';' starts a comment)
;
; Change log :
; 21 Dec 2003 : Use of altivec is now determined with a parameter
; Review      : - the x range test now uses cr6 instead of cr4 (cr2..cr4 are
;                 non-volatile and were clobbered without being saved)
;               - both entry points return immediately when sizeX*sizeY == 0
;                 (CTR == 0 would otherwise make bdnz wrap around)
;               - the G4 loop no longer issues dcbz when fewer than 8 pixels
;                 remain (it would zero memory past the end of topixmap)

; Section definition : We use a read only section
.text

; names of the functions to call from the C program
; We declare these labels as globals to extend their scope outside this file
.globl _ppc_zoom_generic
.globl _ppc_zoom_G4

; Description :
; These routines dynamically compute and apply a zoom filter.
; For every destination pixel:
;   px = S.x + (((D.x - S.x) * buffratio) >> 16)     (same for py)
;   where S is read through r31 and D through r30 (8 bytes per pixel: x, y)
;   if px >= ax or py >= ay (unsigned)  -> the pixel is written as 0
;   else the 4 source pixels at (px>>4, py>>4), (+1,0), (0,+1), (+1,+1) are
;   weighted by the 4 bytes of precalccoeffs[px & 15][py & 15] and summed.
;
; _ppc_zoom_generic : plain integer implementation
; _ppc_zoom_G4      : same computation, plus dcbz on the destination and
;                     dst data-stream touches on the inputs

; parameters (identical for both entry points) :
; r3  <=> unsigned int sizeX (in pixels)
; r4  <=> unsigned int sizeY (in pixels)
; r5  <=> unsigned int * frompixmap
; r6  <=> unsigned int * topixmap
; r7  <=> unsigned int * brutS
; r8  <=> unsigned int * brutD
; r9  <=> unsigned int buffratio
; r10 <=> int [16][16] precalccoeffs

; globals after init
; r3  <=> ax = (sizeX - 1) << 4 : x limit in 16th of pixels (replaces old r3)
; r4  <=> ay = (sizeY - 1) << 4 : y limit in 16th of pixels (replaces old r4)
; r5  <=> frompixmap (unchanged)
; r6  <=> topixmap - 4 bytes, pre-incremented before each store (replaces r6)
; r11 <=> precalccoeffs (copy of r10, which is reused as scratch)
; r12 <=> 0xFF00FF (mask for parallel 32 bits pixs computing)
; r18 <=> 0, the value stored for out of range pixels
; r19 <=> row size in pixels
; r20 <=> row size in bytes
; r30 <=> walking pointer loaded from r8 (brutD)
; r31 <=> walking pointer loaded from r7 (brutS)
; CTR <=> number of pixels left to compute

; ABI notes :
; r1 is the Stack Pointer (SP) => Do not modify
; r13..r31 are non-volatile => the ones used are saved below SP (red zone)
;                              with stmw and restored with lmw before blr
; cr2..cr4 are non-volatile => only cr0, cr1, cr6 and cr7 are used here

_ppc_zoom_generic:

; Saves the used non volatile registers (r18..r31 = 14 words) in the red zone
        stmw    r18,-56(r1)

; init
        li      r18,0                   ; Default value if out of range : 0 (Black)
        mr      r11,r10                 ; r11 = precalccoeffs
        lis     r12,0xFF
        mullw.  r2,r3,r4                ; Number of pixels to compute (cr0 = compare with 0)
        beq-    cr0,L3                  ; Nothing to compute : restore and return
        subi    r30,r8,0                ; r30 = r8
        slwi    r20,r3,2                ; r20 = row size in bytes
        srawi   r19,r20,2               ; r19 = row size in pixels
        ori     r12,r12,0xFF            ; r12 = 0x00FF00FF
        subi    r3,r3,1
        subi    r4,r4,1
        mtspr   ctr,r2                  ; Init the loop count (one loop per pixel computed)
        subi    r31,r7,0                ; r31 = r7
        subi    r6,r6,4                 ; stwu below pre-increments by 4
        slwi    r3,r3,4                 ; r3 = ax
        slwi    r4,r4,4                 ; r4 = ay

; pre init for loop
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2

        b       L1
.align 5
L1:

; computes dynamically the position to fetch
        sub     r8,r8,r2                ; px2 - px
        sub     r10,r10,r29             ; py2 - py
        mullw   r8,r8,r9                ; * buffratio
        addi    r31,r31,8               ; next (x,y) pair
        mullw   r10,r10,r9
        addi    r30,r30,8

        srawi   r8,r8,16
        srawi   r10,r10,16
        add     r2,r2,r8                ; r2  = interpolated px
        add     r29,r29,r10             ; r29 = interpolated py

; if px>=ax or py>=ay goto outofrange (unsigned compares)
; computes the attenuation coeffs and the original point address
        rlwinm  r10,r2,6,28-6,31-6      ; r10 <- (px & 15) << 6  (row offset in int[16][16])
        cmpl    cr6,0,r2,r3
        rlwimi  r10,r29,2,28-2,31-2     ; r10 <- r10 | ((py & 15) << 2)  (column offset)
        cmpl    cr7,0,r29,r4
        srawi   r29,r29,4               ; pos computing : py in pixels
        bge-    cr6,L4
        srawi   r2,r2,4                 ; pos computing : px in pixels
        mullw   r29,r29,r19             ; pos computing : py * row size in pixels
        bge-    cr7,L4

; Channels notation : 00112233 (AARRVVBB)

        add     r2,r2,r29               ; pos computing
        lwzx    r10,r11,r10             ; Loads coefs
        slwi    r2,r2,2                 ; pos computing : pixel index -> byte offset
        add     r2,r2,r5                ; pos computing : address of col1
        rlwinm  r21,r10,0,24,31         ; Isolates coef1 (??????11 -> 00000011)
        lwz     r25,0(r2)               ; Loads col1 -> r25
        lwz     r26,4(r2)               ; Loads col2 -> r26
        rlwinm  r22,r10,24,24,31        ; Isolates coef2 (????22?? -> 00000022)
        rlwinm  r23,r10,16,24,31        ; Isolates coef3 (??33???? -> 00000033)
        add     r2,r2,r20               ; Adds one line for future load of col3 and col4
        and     r8,r25,r12              ; Masks col1 channels 1 & 3 : 0x00XX00XX
        rlwinm  r24,r10,8,24,31         ; Isolates coef4 (44?????? -> 00000044)
        andi.   r25,r25,0xFF00          ; Masks col1 channel 2 : 0x0000XX00
        mullw   r8,r8,r21               ; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
        and     r10,r26,r12             ; Masks col2 channels 1 & 3 : 0x00XX00XX
        lwz     r27,0(r2)               ; Loads col3 -> r27
        mullw   r10,r10,r22             ; Applies coef2 on col2 channels 1 & 3
        mullw   r25,r25,r21             ; Applies coef1 on col1 channel 2
        andi.   r29,r26,0xFF00          ; Masks col2 channel 2 : 0x0000XX00
        mullw   r29,r29,r22             ; Applies coef2 on col2 channel 2
        lwz     r28,4(r2)               ; Loads col4 -> r28
        add     r8,r8,r10               ; Adds col1 & col2 channels 1 & 3
        and     r10,r27,r12             ; Masks col3 channels 1 & 3 : 0x00XX00XX
        add     r25,r25,r29             ; Adds col1 & col2 channel 2
        mullw   r10,r10,r23             ; Applies coef3 on col3 channels 1 & 3
        andi.   r29,r27,0xFF00          ; Masks col3 channel 2 : 0x0000XX00
        mullw   r29,r29,r23             ; Applies coef3 on col3 channel 2
        lwz     r2,0(r31)               ; px (next pixel)
        add     r7,r8,r10               ; Adds col3 to (col1 + col2) channels 1 & 3
        and     r10,r28,r12             ; Masks col4 channels 1 & 3 : 0x00XX00XX
        mullw   r10,r10,r24             ; Applies coef4 on col4 channels 1 & 3
        add     r25,r25,r29             ; Adds col 3 to (col1 + col2) channel 2
        lwz     r8,0(r30)               ; px2 (next pixel)
        andi.   r28,r28,0xFF00          ; Masks col4 channel 2 : 0x0000XX00
        add     r7,r7,r10               ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
        lwz     r10,4(r30)              ; py2 (next pixel)
        mullw   r28,r28,r24             ; Applies coef4 on col4 channel 2
        srawi   r7,r7,8                 ; (sum of channels 1 & 3) >> 8
        lwz     r29,4(r31)              ; py (next pixel)
        add     r25,r25,r28             ; Adds col 4 to (col1 + col2 + col3) channel 2
        rlwimi  r7,r25,24,16,23         ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
        stwu    r7,4(r6)                ; Stores the computed pixel
        bdnz    L1                      ; Iterate again if needed
        b       L3                      ; If not, returns from the function

; if out of range : store the default value and preload the next pixel data
L4:
        stwu    r18,4(r6)
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        bdnz    L1

L3:

; Restore saved registers and return
        lmw     r18,-56(r1)
        blr


_ppc_zoom_G4:

; Saves the used non volatile registers (r17..r31 = 15 words) in the red zone
        stmw    r17,-60(r1)

; init
        li      r18,0                   ; Default value if out of range : 0 (Black)
        mr      r11,r10                 ; r11 = precalccoeffs
        lis     r12,0xFF
        mullw.  r2,r3,r4                ; Number of pixels to compute (cr0 = compare with 0)
        beq-    cr0,L300                ; Nothing to compute : restore and return
        subi    r30,r8,0                ; r30 = r8
        slwi    r20,r3,2                ; r20 = row size in bytes
        srawi   r19,r20,2               ; r19 = row size in pixels
        ori     r12,r12,0xFF            ; r12 = 0x00FF00FF
        subi    r3,r3,1
        subi    r4,r4,1
        mtspr   ctr,r2                  ; Init the loop count (one loop per pixel computed)
        subi    r31,r7,0                ; r31 = r7
        subi    r6,r6,4                 ; r6 is advanced by 4 at the top of each iteration
        slwi    r3,r3,4                 ; r3 = ax
        slwi    r4,r4,4                 ; r4 = ay

; pre init for loop
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2

;*********************
        lis     r17,0x0F01              ; r17 = 0x0F010000, control word for the dst touches below
                                        ; NOTE(review): presumably block size 15 vectors, 1 block,
                                        ; stride 0 - confirm against the AltiVec manual

        b       L100
.align 5
L100:

        addi    r6,r6,4                 ; r6 = address of the pixel written by this iteration

; Optimization to ensure the destination buffer
; won't be loaded into the data cache
        rlwinm. r0,r6,0,27,31           ; r0 = r6 & 31
        bne+    L500                    ; Not at the start of a 32-byte block
        mfctr   r0                      ; Pixels left to write, including this one
        cmplwi  cr1,r0,8
        blt-    cr1,L500                ; Less than 32 bytes left : dcbz would zero past the buffer end
        dcbz    0,r6                    ; assumes a 32-byte cache block, as the mask above implies
;dcba 0,r6
L500:

; computes dynamically the position to fetch
;mullw r8,r8,r29
;mullw r2,r2,r29
;add r2,r8,r2
;srawi r2,r2,17

        sub     r8,r8,r2                ; px2 - px
        sub     r10,r10,r29             ; py2 - py
        mullw   r8,r8,r9                ; * buffratio
        addi    r31,r31,8               ; next (x,y) pair
        mullw   r10,r10,r9
        addi    r30,r30,8

        dst     r30,r17,0               ; Data stream touch on the r30 input (stream 0)

        srawi   r8,r8,16
        srawi   r10,r10,16
        add     r2,r2,r8                ; r2  = interpolated px
        add     r29,r29,r10             ; r29 = interpolated py

        dst     r31,r17,1               ; Data stream touch on the r31 input (stream 1)

; if px>=ax or py>=ay goto outofrange (unsigned compares)
; computes the attenuation coeffs and the original point address
        rlwinm  r10,r2,6,28-6,31-6      ; r10 <- (px & 15) << 6  (row offset in int[16][16])
        cmpl    cr6,0,r2,r3
        rlwimi  r10,r29,2,28-2,31-2     ; r10 <- r10 | ((py & 15) << 2)  (column offset)
        cmpl    cr7,0,r29,r4
        srawi   r29,r29,4               ; pos computing : py in pixels
        bge-    cr6,L400
        srawi   r2,r2,4                 ; pos computing : px in pixels
        mullw   r29,r29,r19             ; pos computing : py * row size in pixels
        bge-    cr7,L400

; Channels notation : 00112233 (AARRVVBB)

        add     r2,r2,r29               ; pos computing
        lwzx    r10,r11,r10             ; Loads coefs
        slwi    r2,r2,2                 ; pos computing : pixel index -> byte offset
        add     r2,r2,r5                ; pos computing : address of col1
        rlwinm  r21,r10,0,24,31         ; Isolates coef1 (??????11 -> 00000011)
        lwz     r25,0(r2)               ; Loads col1 -> r25
        lwz     r26,4(r2)               ; Loads col2 -> r26
        rlwinm  r22,r10,24,24,31        ; Isolates coef2 (????22?? -> 00000022)
        rlwinm  r23,r10,16,24,31        ; Isolates coef3 (??33???? -> 00000033)
        add     r2,r2,r20               ; Adds one line for future load of col3 and col4
        and     r8,r25,r12              ; Masks col1 channels 1 & 3 : 0x00XX00XX
        rlwinm  r24,r10,8,24,31         ; Isolates coef4 (44?????? -> 00000044)
        dst     r2,r17,2                ; Data stream touch on the next source row (stream 2)
        rlwinm  r25,r25,0,16,23         ; Masks col1 channel 2 : 0x0000XX00
;andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
        mullw   r8,r8,r21               ; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
        and     r10,r26,r12             ; Masks col2 channels 1 & 3 : 0x00XX00XX
        lwz     r27,0(r2)               ; Loads col3 -> r27
        mullw   r10,r10,r22             ; Applies coef2 on col2 channels 1 & 3
        mullw   r25,r25,r21             ; Applies coef1 on col1 channel 2
        rlwinm  r29,r26,0,16,23         ; Masks col2 channel 2 : 0x0000XX00
;andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
        mullw   r29,r29,r22             ; Applies coef2 on col2 channel 2
        lwz     r28,4(r2)               ; Loads col4 -> r28
        add     r8,r8,r10               ; Adds col1 & col2 channels 1 & 3
        and     r10,r27,r12             ; Masks col3 channels 1 & 3 : 0x00XX00XX
        add     r25,r25,r29             ; Adds col1 & col2 channel 2
        mullw   r10,r10,r23             ; Applies coef3 on col3 channels 1 & 3
        rlwinm  r29,r27,0,16,23         ; Masks col3 channel 2 : 0x0000XX00
;andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
        mullw   r29,r29,r23             ; Applies coef3 on col3 channel 2
        lwz     r2,0(r31)               ; px (next pixel)
        add     r7,r8,r10               ; Adds col3 to (col1 + col2) channels 1 & 3
        and     r10,r28,r12             ; Masks col4 channels 1 & 3 : 0x00XX00XX
        mullw   r10,r10,r24             ; Applies coef4 on col4 channels 1 & 3
        add     r25,r25,r29             ; Adds col 3 to (col1 + col2) channel 2
        lwz     r8,0(r30)               ; px2 (next pixel)
        rlwinm  r28,r28,0,16,23         ; Masks col4 channel 2 : 0x0000XX00
;andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
        add     r7,r7,r10               ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
        lwz     r10,4(r30)              ; py2 (next pixel)
        mullw   r28,r28,r24             ; Applies coef4 on col4 channel 2
        srawi   r7,r7,8                 ; (sum of channels 1 & 3) >> 8
        lwz     r29,4(r31)              ; py (next pixel)
        add     r25,r25,r28             ; Adds col 4 to (col1 + col2 + col3) channel 2
        rlwimi  r7,r25,24,16,23         ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
        stw     r7,0(r6)                ; Stores the computed pixel
        bdnz    L100                    ; Iterate again if needed
        b       L300                    ; If not, returns from the function

; if out of range : store the default value and preload the next pixel data
L400:
        stw     r18,0(r6)
        lwz     r8,0(r30)               ; px2
        lwz     r10,4(r30)              ; py2
        lwz     r2,0(r31)               ; px
        lwz     r29,4(r31)              ; py
        bdnz    L100

L300:

; Restore saved registers and return
        lmw     r17,-60(r1)
        blr
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?