; simple_idct_mmx.asm
movq mm7,[coeffs+48] ; C3 C1 C3 C1
; rounder_op mm0, rounder_arg
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
paddd mm4,mm5 ; A0 a0
psubd mm6,mm5 ; A3 a3
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
paddd mm0,mm1 ; A1 a1
psubd mm5,mm1 ; A2 a2
movq mm1,[coeffs+64]
pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
psrad mm7,shift
psrad mm4,shift
movq mm3,mm0 ; A1 a1
paddd mm0,mm1 ; A1+B1 a1+b1
psubd mm3,mm1 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm3,shift
packssdw mm7,mm7 ; A0+B0 a0+b0
movd [dst],mm7
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [dst + 16],mm0
packssdw mm3,mm3 ; A1-B1 a1-b1
movd [dst + 96],mm3
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [dst + 112],mm4
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
movq mm3,mm5 ; A2 a2
paddd mm3,mm4 ; A2+B2 a2+b2
psubd mm5,mm4 ; a2-B2 a2-b2
psrad mm3,shift
psrad mm5,shift
movq mm4,mm6 ; A3 a3
paddd mm6,mm2 ; A3+B3 a3+b3
psubd mm4,mm2 ; a3-B3 a3-b3
psrad mm6,shift
packssdw mm3,mm3 ; A2+B2 a2+b2
movd [dst + 32],mm3
psrad mm4,shift
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [dst + 48],mm6
packssdw mm4,mm4 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [dst + 64],mm4
movd [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
;---------------------------------------------------------------------------
; IDCT7  src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass variant that consumes only the two quadwords at [src0] and
; [src0 + 8].  Of the eight macro arguments only src0, dst and shift are
; referenced by the instructions below; src4, src1, src5 and the rounder
; pair are accepted for signature compatibility and are not used (the
; rounding step is disabled in this variant).
;
; Per the original annotations each source quadword holds the words
; (R4 R0 r4 r0), [coeffs+16] holds (C4 C4 C4 C4) and [coeffs+24] holds
; (-C4 C4 -C4 C4), so after pmaddwd / psrad / packssdw:
;     mm4 = (C4*R4 + C4*R0) >> shift     "A0" for four columns
;     mm0 = (C4*R0 - C4*R4) >> shift     "A1" for four columns
;
; Stores (8 bytes each, 16 bytes apart):
;     A0 -> dst + 0, dst + 48, dst + 64, dst + 112
;     A1 -> dst + 16, dst + 32, dst + 80, dst + 96
;
; Clobbers: mm0, mm1, mm2, mm4, mm5, mm7
;---------------------------------------------------------------------------
%macro IDCT7 8
%define src0            %1
%define src4            %2
%define src1            %3
%define src5            %4
%define dst             %5
%define rounder_op      %6
%define rounder_arg     %7
%define shift           %8

        ; ---- first source quadword -> low halves of A0 / A1 ----
        movq            mm0, [src0]             ; R4 R0 r4 r0
        movq            mm4, [coeffs+16]        ; C4 C4 C4 C4
        movq            mm5, [coeffs+24]        ; -C4 C4 -C4 C4
        pmaddwd         mm4, mm0                ; C4R4+C4R0     C4r4+C4r0
        pmaddwd         mm0, mm5                ; -C4R4+C4R0    -C4r4+C4r0
        psrad           mm4, shift
        psrad           mm0, shift

        ; ---- second source quadword -> high halves of A0 / A1 ----
        movq            mm2, [src0 + 8]         ; R4 R0 r4 r0
        movq            mm1, [coeffs+16]        ; C4 C4 C4 C4
        movq            mm7, [coeffs+24]        ; -C4 C4 -C4 C4
        pmaddwd         mm1, mm2                ; C4R4+C4R0     C4r4+C4r0
        pmaddwd         mm2, mm7                ; -C4R4+C4R0    -C4r4+C4r0
        movq            mm7, [coeffs+32]        ; C6 C2 C6 C2 -- NOTE(review): not read again in this macro
        psrad           mm1, shift
        psrad           mm2, shift

        ; ---- narrow the four dwords of each result to saturated words ----
        packssdw        mm4, mm1                ; A0 a0
        packssdw        mm0, mm2                ; A1 a1

        ; ---- write out; the destination slots do not overlap each other ----
        movq            [dst],       mm4        ; A0
        movq            [dst + 16],  mm0        ; A1
        movq            [dst + 32],  mm0        ; A1
        movq            [dst + 48],  mm4        ; A0
        movq            [dst + 64],  mm4        ; A0
        movq            [dst + 80],  mm0        ; A1
        movq            [dst + 96],  mm0        ; A1
        movq            [dst + 112], mm4        ; A0

%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
;---------------------------------------------------------------------------
; Permutation helpers
;
; All helpers address 16-bit words as [srcP + 2*index]; srcP must be
; %define'd by the code that invokes them.  Argument 1 is the destination
; index, argument 2 the source index.
;---------------------------------------------------------------------------

; XLODA dest, src -- start a chain: block[dest] = block[src].
; On exit ax holds the word that was previously stored at dest.
; bx is overwritten as well.
%macro XLODA 2
        mov     ax, [srcP + (%1)*2]     ; ax = word about to be displaced from dest
        mov     bx, [srcP + (%2)*2]     ; bx = word being moved in from src
        mov     [srcP + (%1)*2], bx     ; block[dest] = old block[src]
%endmacro
; XCHGA dest, src -- chain link with the incoming word in bx.
; Stores bx at dest and leaves the displaced word in ax.  The second
; argument is not referenced; it only records where the word came from.
%macro XCHGA 2
        mov     ax, [srcP + (%1)*2]     ; ax = word displaced from dest
        mov     [srcP + (%1)*2], bx     ; block[dest] = word carried in bx
%endmacro
; XCHGB dest, src -- chain link with the incoming word in ax.
; Stores ax at dest and leaves the displaced word in bx.  The second
; argument is not referenced; it only records where the word came from.
%macro XCHGB 2
        mov     bx, [srcP + (%1)*2]     ; bx = word displaced from dest
        mov     [srcP + (%1)*2], ax     ; block[dest] = word carried in ax
%endmacro
; XSTRA dest, src -- close a chain whose pending word is in bx.
; Nothing is loaded; the second argument is not referenced.
%macro XSTRA 2
        mov     [srcP + (%1)*2], bx     ; block[dest] = word carried in bx
%endmacro
; XSTRB dest, src -- close a chain whose pending word is in ax.
; Nothing is loaded; the second argument is not referenced.
%macro XSTRB 2
        mov     [srcP + (%1)*2], ax     ; block[dest] = word carried in ax
%endmacro
;---------------------------------------------------------------------------
; Permutation macro
;
; PERMUTEP base
;
; Permutes, in place, the 16-bit words of the 64-entry block (indices
; 0x00..0x3F) addressed by `base`.  The permutation is applied one cycle at
; a time: XLODA opens a cycle, XCHGB/XCHGA alternate so the displaced word
; ping-pongs between ax and bx, and XSTRA/XSTRB closes the cycle by storing
; the last pending word.  In every helper invocation the first argument is
; the destination index and the second the source index, i.e. the word that
; was at index <arg2> ends up at index <arg1>.
;
; In the cycle comments below "a -> b" means "the word at index a moves to
; index b".
;
; Registers: ebx is saved and restored with push/pop; ax is overwritten and
; not restored.  `base` must stay valid across the push (the callers in this
; file pass edx).
;---------------------------------------------------------------------------
%macro PERMUTEP 1
%define srcP %1
push ebx
; index 0x00 maps to itself: nothing to do
; cycle: 0x01 -> 0x08 -> 0x10 -> 0x20 -> 0x02 -> 0x04 -> 0x01
XLODA 0x08, 0x01
XCHGB 0x10, 0x08
XCHGA 0x20, 0x10
XCHGB 0x02, 0x20
XCHGA 0x04, 0x02
XSTRB 0x01, 0x04
; cycle: 0x03 -> 0x09 -> 0x18 -> 0x12 -> 0x24 -> 0x03
XLODA 0x09, 0x03
XCHGB 0x18, 0x09
XCHGA 0x12, 0x18
XCHGB 0x24, 0x12
XSTRA 0x03, 0x24
; cycle: 0x05 -> 0x0C -> 0x11 -> 0x28 -> 0x30 -> 0x22 -> 0x06 -> 0x05
XLODA 0x0C, 0x05
XCHGB 0x11, 0x0C
XCHGA 0x28, 0x11
XCHGB 0x30, 0x28
XCHGA 0x22, 0x30
XCHGB 0x06, 0x22
XSTRA 0x05, 0x06
; cycle: 0x07 -> 0x0D -> 0x1C -> 0x13 -> 0x29 -> 0x38 -> 0x32 -> 0x26 -> 0x07
XLODA 0x0D, 0x07
XCHGB 0x1C, 0x0D
XCHGA 0x13, 0x1C
XCHGB 0x29, 0x13
XCHGA 0x38, 0x29
XCHGB 0x32, 0x38
XCHGA 0x26, 0x32
XSTRB 0x07, 0x26
; cycle: 0x0A -> 0x14 -> 0x21 -> 0x0A
XLODA 0x14, 0x0A
XCHGB 0x21, 0x14
XSTRA 0x0A, 0x21
; cycle: 0x0B -> 0x19 -> 0x1A -> 0x16 -> 0x25 -> 0x0E -> 0x15 -> 0x2C
;             -> 0x31 -> 0x2A -> 0x34 -> 0x23 -> 0x0B
XLODA 0x19, 0x0B
XCHGB 0x1A, 0x19
XCHGA 0x16, 0x1A
XCHGB 0x25, 0x16
XCHGA 0x0E, 0x25
XCHGB 0x15, 0x0E
XCHGA 0x2C, 0x15
XCHGB 0x31, 0x2C
XCHGA 0x2A, 0x31
XCHGB 0x34, 0x2A
XCHGA 0x23, 0x34
XSTRB 0x0B, 0x23
; cycle: 0x0F -> 0x1D -> 0x1E -> 0x17 -> 0x2D -> 0x3C -> 0x33 -> 0x2B
;             -> 0x39 -> 0x3A -> 0x36 -> 0x27 -> 0x0F
XLODA 0x1D, 0x0F
XCHGB 0x1E, 0x1D
XCHGA 0x17, 0x1E
XCHGB 0x2D, 0x17
XCHGA 0x3C, 0x2D
XCHGB 0x33, 0x3C
XCHGA 0x2B, 0x33
XCHGB 0x39, 0x2B
XCHGA 0x3A, 0x39
XCHGB 0x36, 0x3A
XCHGA 0x27, 0x36
XSTRB 0x0F, 0x27
; indices 0x1B and 0x1F map to themselves: nothing to do
; cycle: 0x2E -> 0x35 -> 0x2E (plain swap)
XLODA 0x35, 0x2E
XSTRB 0x2E, 0x35
; cycle: 0x2F -> 0x3D -> 0x3E -> 0x37 -> 0x2F
XLODA 0x3D, 0x2F
XCHGB 0x3E, 0x3D
XCHGA 0x37, 0x3E
XSTRB 0x2F, 0x37
; indices 0x3B and 0x3F map to themselves: nothing to do
pop ebx
%undef srcP
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal simple_idct_mmx_P
cglobal simple_idct_mmx
;-----------------------------------------------------------------------------
; void simple_idct_mmx_P(int16_t * const block)
;
; In-place iDCT of the 64 int16 coefficients at `block`.
; Expects the input data to be already permuted.
;
; ABI:    32-bit, argument on the stack (read from [esp+128+4] after the
;         128-byte local frame has been reserved).
; Stack:  128 bytes of scratch at [esp .. esp+127]; released at .ret.
; Regs:   edx = block for the whole routine.  MMX registers are used by the
;         IDCT macros.  NOTE(review): the macros defined outside this view
;         may clobber further integer registers, and no emms is visible in
;         this routine -- confirm what callers expect.
;
; Pass 1 (shift 11): block -> scratch, one 32-byte group of `block` per
;         macro invocation.  Z_COND_IDCT takes a label as its last argument;
;         presumably it branches there when its group is all zero (macro
;         body not visible here -- confirm).
; Pass 2 (shift 20): scratch -> block, using the IDCTn variant selected by
;         the branches taken in pass 1.  The label names encode which of the
;         three Z_COND_IDCT groups branched, as a 3-bit mask:
;             4 = group at block+32, 2 = group at block+64, 1 = group at block+96
;         (.four=100b, .six=110b, .five=101b, .seven=111b, .two=010b,
;          .three=011b, .one=001b; no branch at all falls through to IDCT0).
;
; Local labels carry a trailing colon so NASM does not flag them as orphan
; labels and a mistyped mnemonic cannot be silently taken for a label.
;-----------------------------------------------------------------------------
ALIGN 16
simple_idct_mmx_P:
        sub     esp, 128                        ; reserve the scratch buffer
        mov     edx, [esp+128+4]                ; edx = block (first stack argument)

;                    src0,    src4,    src1,    src5,    dst,     rndop, rndarg,     shift, bt
        DC_COND_IDCT edx+0,   edx+8,   edx+16,  edx+24,  esp,     paddd, [coeffs+8], 11
        Z_COND_IDCT  edx+32,  edx+40,  edx+48,  edx+56,  esp+32,  paddd, [coeffs],   11, .four
        Z_COND_IDCT  edx+64,  edx+72,  edx+80,  edx+88,  esp+64,  paddd, [coeffs],   11, .two
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .one
        IDCT0        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT0        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT0        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT0        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.four:
        Z_COND_IDCT  edx+64,  edx+72,  edx+80,  edx+88,  esp+64,  paddd, [coeffs],   11, .six
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .five
        IDCT4        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT4        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT4        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT4        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.six:
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .seven
        IDCT6        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT6        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT6        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT6        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.two:
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .three
        IDCT2        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT2        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT2        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT2        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.three:
        IDCT3        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT3        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT3        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT3        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.five:
        ; Only the esp / esp+16 invocations are issued; the esp+8 and esp+24
        ; ones are deliberately left out.  NOTE(review): presumably IDCT5
        ; covers two column pairs per invocation -- its body is not visible
        ; here, confirm.
        IDCT5        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
;       IDCT5        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT5        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
;       IDCT5        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.one:
        IDCT1        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT1        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT1        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT1        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .ret

ALIGN 16
.seven:
        ; IDCT7 reads [src0] and [src0+8] and stores full quadwords, so two
        ; invocations (dst = edx and edx+8) cover the whole block.
        IDCT7        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
;       IDCT7        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT7        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
;       IDCT7        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        ; falls through into the common epilogue

.ret:
        add     esp, 128                        ; release the scratch buffer
        ret
;-----------------------------------------------------------------------------
; void simple_idct_mmx(int16_t * const block)
;
; Same as simple_idct_mmx_P, except that on entry it performs a fast,
; in-line, in-place permutation of the coefficient block (PERMUTEP).
; Callers therefore pass unpermuted data, and the block does not have to be
; copied on the way out. - trbarry 6/2003
;
; ABI:    32-bit, argument on the stack (read from [esp+128+4] after the
;         128-byte local frame has been reserved).
; Stack:  128 bytes of scratch at [esp .. esp+127]; released at .retP.
;         PERMUTEP additionally pushes/pops ebx around its own body.
; Regs:   edx = block for the whole routine.  PERMUTEP overwrites ax and
;         preserves ebx.  MMX registers are used by the IDCT macros.
;         NOTE(review): the macros defined outside this view may clobber
;         further integer registers, and no emms is visible in this
;         routine -- confirm what callers expect.
;
; Pass 1 (shift 11): block -> scratch, one 32-byte group of `block` per
;         macro invocation.  Z_COND_IDCT takes a label as its last argument;
;         presumably it branches there when its group is all zero (macro
;         body not visible here -- confirm).
; Pass 2 (shift 20): scratch -> block, using the IDCTn variant selected by
;         the branches taken in pass 1.  The label names encode which of the
;         three Z_COND_IDCT groups branched, as a 3-bit mask:
;             4 = group at block+32, 2 = group at block+64, 1 = group at block+96
;         (.fourP=100b, .sixP=110b, .fiveP=101b, .sevenP=111b, .twoP=010b,
;          .threeP=011b, .oneP=001b; no branch at all falls through to IDCT0).
;
; Local labels carry a trailing colon so NASM does not flag them as orphan
; labels and a mistyped mnemonic cannot be silently taken for a label.
;-----------------------------------------------------------------------------
ALIGN 16
simple_idct_mmx:
        sub     esp, 128                        ; reserve the scratch buffer
        mov     edx, [esp+128+4]                ; edx = block (first stack argument)
        PERMUTEP edx                            ; permute parm list in place

;                    src0,    src4,    src1,    src5,    dst,     rndop, rndarg,     shift, bt
        DC_COND_IDCT edx+0,   edx+8,   edx+16,  edx+24,  esp,     paddd, [coeffs+8], 11
        Z_COND_IDCT  edx+32,  edx+40,  edx+48,  edx+56,  esp+32,  paddd, [coeffs],   11, .fourP
        Z_COND_IDCT  edx+64,  edx+72,  edx+80,  edx+88,  esp+64,  paddd, [coeffs],   11, .twoP
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .oneP
        IDCT0        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT0        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT0        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT0        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.fourP:
        Z_COND_IDCT  edx+64,  edx+72,  edx+80,  edx+88,  esp+64,  paddd, [coeffs],   11, .sixP
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .fiveP
        IDCT4        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT4        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT4        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT4        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.sixP:
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .sevenP
        IDCT6        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT6        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT6        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT6        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.twoP:
        Z_COND_IDCT  edx+96,  edx+104, edx+112, edx+120, esp+96,  paddd, [coeffs],   11, .threeP
        IDCT2        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT2        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT2        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT2        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.threeP:
        IDCT3        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT3        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT3        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT3        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.fiveP:
        ; Only the esp / esp+16 invocations are issued; the esp+8 and esp+24
        ; ones are deliberately left out.  NOTE(review): presumably IDCT5
        ; covers two column pairs per invocation -- its body is not visible
        ; here, confirm.
        IDCT5        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
;       IDCT5        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT5        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
;       IDCT5        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.oneP:
        IDCT1        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
        IDCT1        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT1        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
        IDCT1        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        jmp     .retP

ALIGN 16
.sevenP:
        ; IDCT7 reads [src0] and [src0+8] and stores full quadwords, so two
        ; invocations (dst = edx and edx+8) cover the whole block.
        IDCT7        esp,     esp+64,  esp+32,  esp+96,  edx,     nop,   0,          20
;       IDCT7        esp+8,   esp+72,  esp+40,  esp+104, edx+4,   nop,   0,          20
        IDCT7        esp+16,  esp+80,  esp+48,  esp+112, edx+8,   nop,   0,          20
;       IDCT7        esp+24,  esp+88,  esp+56,  esp+120, edx+12,  nop,   0,          20
        ; falls through into the common epilogue

.retP:
        add     esp, 128                        ; release the scratch buffer
        ret
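; ---------------------------------------------------------------------------
; Code-viewer page residue (not part of the assembly source), kept as a comment:
;   Keyboard shortcuts
;     Copy code            Ctrl + C
;     Search code          Ctrl + F
;     Full-screen mode     F11
;     Toggle theme         Ctrl + Shift + D
;     Show shortcuts       ?
;     Increase font size   Ctrl + =
;     Decrease font size   Ctrl + -
; ---------------------------------------------------------------------------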