⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct_3dne.asm

📁 wince下的xvidcore开发库,可用于MP4等视频播放开发
💻 ASM
📖 第 1 页 / 共 3 页
字号:
   pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
   pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
   paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
   paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
   pshufw mm1, [eax+80+8],10001000b  ; x6 x4 x6 x4
   movq mm4, mm3                 ; 4     ; a1 a0
   paddd mm6, mm7                ; 7 free    ; b1=sum(odd1) b0=sum(odd0)
   paddd mm2, mm5                ; 5 free    ; b3=sum(odd3) b2=sum(odd2)
   pshufw mm5, [eax+80],10001000b; x2 x0 x2 x0   mm5 & mm0 exchanged for next cycle
   movq mm7, mm0                 ; 7     ; a3 a2
   psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
   paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
   movq mm3, [tab_i_35_xmm]      ; 3     ; w05 w04 w01 w00
   psubd mm7, mm2                ; ; a3-b3 a2-b2
   paddd mm0, mm2                ; 0 free a3+b3 a2+b2
   pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1
   pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
   pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
   psrad mm0, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
   packssdw mm7, mm4             ; 4     ; y6 y7 y4 y5
   packssdw mm6, mm0             ; 0 free    ; y3 y2 y1 y0
   pshufw mm0, [eax+80+8],11011101b  ; x7 x5 x7 x5
   movq [eax+64], mm6            ; 3     ; save y3 y2 y1 y0 stall2
 
 ;   DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5
 ;
 ;   Hand-interleaved expansion of the row macro named above for the eight
 ;   16-bit words at [eax+80]..[eax+80+15], using coefficient table
 ;   tab_i_35_xmm and rounding constant rounder_5.  The stage overlaps with
 ;   its neighbours, so it is not self-contained:
 ;     on entry: mm3/mm5 = x2,x0 products of this row (not yet rounded),
 ;               mm1 = x6 x4 x6 x4, mm2 = x3 x1 x3 x1, mm0 = x7 x5 x7 x5,
 ;               mm7 = previous row's packed y6 y7 y4 y5 (not yet stored);
 ;     on exit:  y3..y0 of this row are stored at [eax+80]; mm7 holds this
 ;               row's packed y6 y7 y4 y5 for the next stage to swap and store;
 ;               the operands of the row at [eax+96] are preloaded, with the
 ;               roles of mm0 and mm5 exchanged (mm3/mm0 = x2,x0 products,
 ;               mm5 = x7 x5 x7 x5).
 ;   "stall" remarks are the original author's scheduling notes.
   movq mm4, [tab_i_35_xmm+8]    ; mm4 = w07 w06 w03 w02
   movq mm6, [tab_i_35_xmm+32]   ; mm6 = w21 w20 w17 w16
   pshufw mm7, mm7, 10110001b    ; previous row: y6 y7 y4 y5 -> y7 y6 y5 y4
   paddd mm3, [rounder_5]        ; +rounder (author's note: stall 6)
   paddd mm5, [rounder_5]        ; +rounder
   movq [eax+64+8], mm7          ; save previous row's y7 y6 y5 y4; mm7 free
   movq mm7, [tab_i_35_xmm+40]   ; mm7 = w23 w22 w19 w18
   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
   pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm0              ; x7*w23+x5*w22 x7*w19+x5*w18
   pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
   paddd mm3, mm4                ; mm4 free; a1=sum(even1) a0=sum(even0)
   paddd mm5, mm1                ; mm1 free; a3=sum(even3) a2=sum(even2)
   pshufw mm1, [eax+96+8],10001000b  ; next row: x6 x4 x6 x4
   movq mm4, mm3                 ; mm4 = copy of a1 a0
   paddd mm6, mm7                ; mm7 free; b1=sum(odd1) b0=sum(odd0)
   paddd mm2, mm0                ; mm0 free; b3=sum(odd3) b2=sum(odd2)
   pshufw mm0, [eax+96],10001000b    ; next row: x2 x0 x2 x0 (held in mm0 for the next stage)
   movq mm7, mm5                 ; mm7 = copy of a3 a2
   psubd mm4, mm6                ; a1-b1 a0-b0 (author's note: stall 5)
   paddd mm6, mm3                ; mm6 = a1+b1 a0+b0; mm3 free
   movq mm3, [tab_i_26_xmm]      ; next row's table: w05 w04 w01 w00
   psubd mm7, mm2                ; a3-b3 a2-b2
   paddd mm5, mm2                ; mm2 free; a3+b3 a2+b2
   pshufw mm2, [eax+96],11011101b; next row: x3 x1 x3 x1
   pmaddwd mm3, mm0              ; next row: x2*w05+x0*w04 x2*w01+x0*w00
   pmaddwd mm0, [tab_i_26_xmm+16]; next row: x2*w13+x0*w12 x2*w09+x0*w08
   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
   psrad mm5, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
   packssdw mm7, mm4             ; mm7 = y6 y7 y4 y5; mm4 free
   packssdw mm6, mm5             ; mm6 = y3 y2 y1 y0; mm5 free
   pshufw mm5, [eax+96+8],11011101b  ; next row: x7 x5 x7 x5
   movq [eax+80], mm6            ; save y3 y2 y1 y0
 
 ;   DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6
 ;
 ;   Hand-interleaved expansion of the row macro named above for the eight
 ;   16-bit words at [eax+96]..[eax+96+15], using coefficient table
 ;   tab_i_26_xmm and rounding constant rounder_6.  The stage overlaps with
 ;   its neighbours, so it is not self-contained:
 ;     on entry: mm3/mm0 = x2,x0 products of this row (not yet rounded),
 ;               mm1 = x6 x4 x6 x4, mm2 = x3 x1 x3 x1, mm5 = x7 x5 x7 x5,
 ;               mm7 = previous row's packed y6 y7 y4 y5 (not yet stored);
 ;     on exit:  y3..y0 of this row are stored at [eax+96]; mm7 holds this
 ;               row's packed y6 y7 y4 y5 for the next stage to swap and store;
 ;               the operands of the row at [eax+112] are preloaded, with the
 ;               roles of mm0 and mm5 exchanged (mm3/mm5 = x2,x0 products,
 ;               mm0 = x7 x5 x7 x5).
 ;   "stall" remarks are the original author's scheduling notes.
   movq mm4, [tab_i_26_xmm+8]    ; mm4 = w07 w06 w03 w02
   movq mm6, [tab_i_26_xmm+32]   ; mm6 = w21 w20 w17 w16
   pshufw mm7, mm7, 10110001b    ; previous row: y6 y7 y4 y5 -> y7 y6 y5 y4 (author's note: STALL 6)
   paddd mm3, [rounder_6]        ; +rounder
   paddd mm0, [rounder_6]        ; +rounder
   movq [eax+80+8], mm7          ; save previous row's y7 y6 y5 y4; mm7 free
   movq mm7, [tab_i_26_xmm+40]   ; mm7 = w23 w22 w19 w18
   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
   pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm5              ; x7*w23+x5*w22 x7*w19+x5*w18
   pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
   paddd mm3, mm4                ; mm4 free; a1=sum(even1) a0=sum(even0)
   paddd mm0, mm1                ; mm1 free; a3=sum(even3) a2=sum(even2)
   pshufw mm1, [eax+112+8],10001000b ; next row: x6 x4 x6 x4
   movq mm4, mm3                 ; mm4 = copy of a1 a0
   paddd mm6, mm7                ; mm7 free; b1=sum(odd1) b0=sum(odd0)
   paddd mm2, mm5                ; mm5 free; b3=sum(odd3) b2=sum(odd2)
   pshufw mm5, [eax+112],10001000b; next row: x2 x0 x2 x0 (mm5 & mm0 exchanged for next cycle)
   movq mm7, mm0                 ; mm7 = copy of a3 a2
   psubd mm4, mm6                ; a1-b1 a0-b0
   paddd mm6, mm3                ; mm6 = a1+b1 a0+b0; mm3 free
   movq mm3, [tab_i_17_xmm]      ; next row's table: w05 w04 w01 w00
   psubd mm7, mm2                ; a3-b3 a2-b2
   paddd mm0, mm2                ; mm2 free; a3+b3 a2+b2
   pshufw mm2, [eax+112],11011101b; next row: x3 x1 x3 x1
   pmaddwd mm3, mm5              ; next row: x2*w05+x0*w04 x2*w01+x0*w00
   pmaddwd mm5, [tab_i_17_xmm+16]; next row: x2*w13+x0*w12 x2*w09+x0*w08
   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
   psrad mm0, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
   packssdw mm7, mm4             ; mm7 = y6 y7 y4 y5; mm4 free
   packssdw mm6, mm0             ; mm6 = y3 y2 y1 y0; mm0 free
   pshufw mm0, [eax+112+8],11011101b ; next row: x7 x5 x7 x5
   movq [eax+96], mm6            ; save y3 y2 y1 y0 (author's note: stall2)
 
 ;   DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7
 ;
 ;   Hand-interleaved expansion of the row macro named above for the eight
 ;   16-bit words at [eax+112]..[eax+112+15], using coefficient table
 ;   tab_i_17_xmm and rounding constant rounder_7.  The stage overlaps with
 ;   its neighbours, so it is not self-contained:
 ;     on entry: mm3/mm5 = x2,x0 products of this row (not yet rounded),
 ;               mm1 = x6 x4 x6 x4, mm2 = x3 x1 x3 x1, mm0 = x7 x5 x7 x5,
 ;               mm7 = previous row's packed y6 y7 y4 y5 (not yet stored);
 ;     on exit:  y3..y0 of this row are stored at [eax+112]; mm7 holds this
 ;               row's packed y6 y7 y4 y5 for the next stage to swap and store;
 ;               the operands of the row at [eax+0] are preloaded (the
 ;               sequence wraps back to the first row), with the roles of
 ;               mm0 and mm5 exchanged (mm3/mm0 = x2,x0 products,
 ;               mm5 = x7 x5 x7 x5).
 ;   "stall" remarks are the original author's scheduling notes.
   movq mm4, [tab_i_17_xmm+8]    ; mm4 = w07 w06 w03 w02
   movq mm6, [tab_i_17_xmm+32]   ; mm6 = w21 w20 w17 w16
   pshufw mm7, mm7, 10110001b    ; previous row: y6 y7 y4 y5 -> y7 y6 y5 y4
   paddd mm3, [rounder_7]        ; +rounder (author's note: stall 6)
   paddd mm5, [rounder_7]        ; +rounder
   movq [eax+96+8], mm7          ; save previous row's y7 y6 y5 y4; mm7 free
   movq mm7, [tab_i_17_xmm+40]   ; mm7 = w23 w22 w19 w18
   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
   pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm0              ; x7*w23+x5*w22 x7*w19+x5*w18
   pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
   paddd mm3, mm4                ; mm4 free; a1=sum(even1) a0=sum(even0)
   paddd mm5, mm1                ; mm1 free; a3=sum(even3) a2=sum(even2)
   pshufw mm1, [eax+0+8],10001000b; next row: x6 x4 x6 x4
   movq mm4, mm3                 ; mm4 = copy of a1 a0
   paddd mm6, mm7                ; mm7 free; b1=sum(odd1) b0=sum(odd0)
   paddd mm2, mm0                ; mm0 free; b3=sum(odd3) b2=sum(odd2)
   pshufw mm0, [eax+0],10001000b ; next row: x2 x0 x2 x0 (held in mm0 for the next stage)
   movq mm7, mm5                 ; mm7 = copy of a3 a2
   psubd mm4, mm6                ; a1-b1 a0-b0 (author's note: stall 5)
   paddd mm6, mm3                ; mm6 = a1+b1 a0+b0; mm3 free
   movq mm3, [tab_i_04_xmm]      ; next row's table: w05 w04 w01 w00
   psubd mm7, mm2                ; a3-b3 a2-b2
   paddd mm5, mm2                ; mm2 free; a3+b3 a2+b2
   pshufw mm2, [eax+0],11011101b ; next row: x3 x1 x3 x1
   pmaddwd mm3, mm0              ; next row: x2*w05+x0*w04 x2*w01+x0*w00
   pmaddwd mm0, [tab_i_04_xmm+16]; next row: x2*w13+x0*w12 x2*w09+x0*w08
   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
   psrad mm5, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
   packssdw mm7, mm4             ; mm7 = y6 y7 y4 y5; mm4 free
   packssdw mm6, mm5             ; mm6 = y3 y2 y1 y0; mm5 free
   pshufw mm5, [eax+0+8],11011101b; next row: x7 x5 x7 x5
   movq [eax+112], mm6           ; save y3 y2 y1 y0
 
 ;   DCT_8_INV_ROW_1_s [eax+0],  0, tab_i_04_xmm, rounder_0
 ;
 ;   Hand-interleaved expansion of the row macro named above for the eight
 ;   16-bit words at [eax+0]..[eax+0+15], using coefficient table
 ;   tab_i_04_xmm and rounding constant rounder_0.  The stage overlaps with
 ;   its neighbours, so it is not self-contained:
 ;     on entry: mm3/mm0 = x2,x0 products of this row (not yet rounded),
 ;               mm1 = x6 x4 x6 x4, mm2 = x3 x1 x3 x1, mm5 = x7 x5 x7 x5,
 ;               mm7 = previous row's packed y6 y7 y4 y5 (not yet stored);
 ;     on exit:  y3..y0 of this row are stored at [eax+0]; mm7 holds this
 ;               row's packed y6 y7 y4 y5 for the next stage to swap and store;
 ;               the operands of the row at [eax+16] are preloaded, with the
 ;               roles of mm0 and mm5 exchanged (mm3/mm5 = x2,x0 products,
 ;               mm0 = x7 x5 x7 x5).
 ;   "stall" remarks are the original author's scheduling notes.
   movq mm4, [tab_i_04_xmm+8]    ; mm4 = w07 w06 w03 w02
   movq mm6, [tab_i_04_xmm+32]   ; mm6 = w21 w20 w17 w16
   pshufw mm7, mm7, 10110001b    ; previous row: y6 y7 y4 y5 -> y7 y6 y5 y4 (author's note: STALL 6)
   paddd mm3, [rounder_0]        ; +rounder
   paddd mm0, [rounder_0]        ; +rounder
   movq [eax+112+8], mm7         ; save previous row's y7 y6 y5 y4; mm7 free
   movq mm7, [tab_i_04_xmm+40]   ; mm7 = w23 w22 w19 w18
   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
   pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm5              ; x7*w23+x5*w22 x7*w19+x5*w18
   pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
   paddd mm3, mm4                ; mm4 free; a1=sum(even1) a0=sum(even0)
   paddd mm0, mm1                ; mm1 free; a3=sum(even3) a2=sum(even2)
   pshufw mm1, [eax+16+8],10001000b  ; next row: x6 x4 x6 x4
   movq mm4, mm3                 ; mm4 = copy of a1 a0
   paddd mm6, mm7                ; mm7 free; b1=sum(odd1) b0=sum(odd0)
   paddd mm2, mm5                ; mm5 free; b3=sum(odd3) b2=sum(odd2)
   pshufw mm5, [eax+16],10001000b; next row: x2 x0 x2 x0 (mm5 & mm0 exchanged for next cycle)
   movq mm7, mm0                 ; mm7 = copy of a3 a2
   psubd mm4, mm6                ; a1-b1 a0-b0
   paddd mm6, mm3                ; mm6 = a1+b1 a0+b0; mm3 free
   movq mm3, [tab_i_17_xmm]      ; next row's table: w05 w04 w01 w00
   psubd mm7, mm2                ; a3-b3 a2-b2
   paddd mm0, mm2                ; mm2 free; a3+b3 a2+b2
   pshufw mm2, [eax+16],11011101b; next row: x3 x1 x3 x1
   pmaddwd mm3, mm5              ; next row: x2*w05+x0*w04 x2*w01+x0*w00
   pmaddwd mm5, [tab_i_17_xmm+16]; next row: x2*w13+x0*w12 x2*w09+x0*w08
   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
   psrad mm0, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
   packssdw mm7, mm4             ; mm7 = y6 y7 y4 y5; mm4 free
   packssdw mm6, mm0             ; mm6 = y3 y2 y1 y0; mm0 free
   pshufw mm0, [eax+16+8],11011101b  ; next row: x7 x5 x7 x5
   movq [eax+0], mm6             ; save y3 y2 y1 y0 (author's note: stall2)
 
 ; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1
 ;
 ;   Hand-interleaved expansion of the row macro named above for the eight
 ;   16-bit words at [eax+16]..[eax+16+15], using coefficient table
 ;   tab_i_17_xmm and rounding constant rounder_1.  The stage overlaps with
 ;   its neighbours, so it is not self-contained:
 ;     on entry: mm3/mm5 = x2,x0 products of this row (not yet rounded),
 ;               mm1 = x6 x4 x6 x4, mm2 = x3 x1 x3 x1, mm0 = x7 x5 x7 x5,
 ;               mm7 = previous row's packed y6 y7 y4 y5 (not yet stored);
 ;     on exit:  y3..y0 of this row are stored at [eax+16]; mm7 holds this
 ;               row's packed y6 y7 y4 y5 for the next stage to swap and store;
 ;               the operands of the row at [eax+32] are preloaded, with the
 ;               roles of mm0 and mm5 exchanged (mm3/mm0 = x2,x0 products,
 ;               mm5 = x7 x5 x7 x5).
 ;   "stall" remarks are the original author's scheduling notes.
   movq mm4, [tab_i_17_xmm+8]    ; mm4 = w07 w06 w03 w02
   movq mm6, [tab_i_17_xmm+32]   ; mm6 = w21 w20 w17 w16
   pshufw mm7, mm7, 10110001b    ; previous row: y6 y7 y4 y5 -> y7 y6 y5 y4
   paddd mm3, [rounder_1]        ; +rounder (author's note: stall 6)
   paddd mm5, [rounder_1]        ; +rounder
   movq [eax+0+8], mm7           ; save previous row's y7 y6 y5 y4; mm7 free
   movq mm7, [tab_i_17_xmm+40]   ; mm7 = w23 w22 w19 w18
   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
   pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm0              ; x7*w23+x5*w22 x7*w19+x5*w18
   pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
   paddd mm3, mm4                ; mm4 free; a1=sum(even1) a0=sum(even0)
   paddd mm5, mm1                ; mm1 free; a3=sum(even3) a2=sum(even2)
   pshufw mm1, [eax+32+8],10001000b  ; next row: x6 x4 x6 x4
   movq mm4, mm3                 ; mm4 = copy of a1 a0
   paddd mm6, mm7                ; mm7 free; b1=sum(odd1) b0=sum(odd0)
   paddd mm2, mm0                ; mm0 free; b3=sum(odd3) b2=sum(odd2)
   pshufw mm0, [eax+32],10001000b; next row: x2 x0 x2 x0 (held in mm0 for the next stage)
   movq mm7, mm5                 ; mm7 = copy of a3 a2
   psubd mm4, mm6                ; a1-b1 a0-b0 (author's note: stall 5)
   paddd mm6, mm3                ; mm6 = a1+b1 a0+b0; mm3 free
   movq mm3, [tab_i_26_xmm]      ; next row's table: w05 w04 w01 w00
   psubd mm7, mm2                ; a3-b3 a2-b2
   paddd mm5, mm2                ; mm2 free; a3+b3 a2+b2
   pshufw mm2, [eax+32],11011101b; next row: x3 x1 x3 x1
   pmaddwd mm3, mm0              ; next row: x2*w05+x0*w04 x2*w01+x0*w00
   pmaddwd mm0, [tab_i_26_xmm+16]; next row: x2*w13+x0*w12 x2*w09+x0*w08
   psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
   psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
   psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
   psrad mm5, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
   packssdw mm7, mm4             ; mm7 = y6 y7 y4 y5; mm4 free
   packssdw mm6, mm5             ; mm6 = y3 y2 y1 y0; mm5 free
   pshufw mm5, [eax+32+8],11011101b  ; next row: x7 x5 x7 x5
   movq [eax+16], mm6            ; save y3 y2 y1 y0
 
 ;   DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2
   movq mm4, [tab_i_26_xmm+8]    ; 4     ; w07 w06 w03 w02
   movq mm6, [tab_i_26_xmm+32]   ; 6     ; w21 w20 w17 w16
   pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
   paddd mm3, [rounder_2]        ; +rounder
   paddd mm0, [rounder_2]        ; +rounder
   movq [eax+16+8], mm7          ; 7     ; save y7 y6
   movq mm7, [tab_i_26_xmm+40]   ; 7     ; w23 w22 w19 w18
   pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
   pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
   pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
   pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
   pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -