📄 vdec_idctmmx.s
字号:
psubsw %mm4, %mm0 /* tm10; free mm4 *//* moved from next block */ pmulhw 8*5(%esi), %mm1 /* V5 */ movq %mm6, 8*4(%esi) /* tm4; free mm6 */ movq %mm0, 8*10(%esi) /* tm10; free mm0 *//* column 1: even part * use V5, V13, V1, V9 to produce V56..V59 *//* moved to prev block: * movq 8*5(%ecx), %mm1 * pmulhw 8*5(%esi), %mm1 V5 */ movq 8*13(%ecx), %mm7 psllw $1, %mm1 /* t128=t130 */ pmulhw 8*13(%esi), %mm7 /* V13 */ movq %mm1, %mm2 /* duplicate t128=t130 */ movq 8(%ecx), %mm3 pmulhw 8(%esi), %mm3 /* V1 */ movq 8*9(%ecx), %mm5 psubsw %mm7, %mm1 /* V50 */ pmulhw 8*9(%esi), %mm5 /* V9 */ paddsw %mm7, %mm2 /* V51 */ pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V52 */ movq %mm2, %mm6 /* duplicate V51 */ psraw $1, %mm2 /* t138=t144 */ movq %mm3, %mm4 /* duplicate V1 */ psraw $2, %mm6 /* t136 */ paddsw %mm5, %mm3 /* V53 */ psubsw %mm5, %mm4 /* V54 ;mm5 free */ movq %mm3, %mm7 /* duplicate V53 *//* moved from next block */ movq 8*11(%ecx), %mm0 psraw $1, %mm4 /* t140=t142 */ psubsw %mm6, %mm1 /* V55 ; mm6 free */ paddsw %mm2, %mm3 /* V56 */ movq %mm4, %mm5 /* duplicate t140=t142 */ paddsw %mm1, %mm4 /* V57 */ movq %mm3, 8*5(%esi) /* V56 */ psubsw %mm1, %mm5 /* V58; mm1 free */ movq %mm4, 8*13(%esi) /* V57 */ psubsw %mm2, %mm7 /* V59; mm2 free */ movq %mm5, 8*9(%esi) /* V58 *//* keep mm7 alive all along the next block * movq %mm7, 8(%esi) V59 * moved above * movq 8*11(%ecx), %mm0 */ pmulhw 8*11(%esi), %mm0 /* V11 */ movq 8*7(%ecx), %mm6 pmulhw 8*7(%esi), %mm6 /* V7 */ movq 8*15(%ecx), %mm4 movq %mm0, %mm3 /* duplicate V11 */ pmulhw 8*15(%esi), %mm4 /* V15 */ movq 8*3(%ecx), %mm5 psllw $1, %mm6 /* t146=t152 */ pmulhw 8*3(%esi), %mm5 /* V3 */ paddsw %mm6, %mm0 /* V63 *//* note that V15 computation has a correction step: * this is a 'magic' constant that rebiases the results to be closer to the * expected result. this magic constant can be refined to reduce the error * even more by doing the correction step in a later stage when the number * is actually multiplied by 16 */ paddw x0005000200010001, %mm4 psubsw %mm6, %mm3 /* V60 ; free mm6 */ psraw $1, %mm0 /* t154=t156 */ movq %mm3, %mm1 /* duplicate V60 */ pmulhw x539f539f539f539f, %mm1 /* V67 */ movq %mm5, %mm6 /* duplicate V3 */ psraw $2, %mm4 /* t148=t150 */ paddsw %mm4, %mm5 /* V61 */ psubsw %mm4, %mm6 /* V62 ; free mm4 */ movq %mm5, %mm4 /* duplicate V61 */ psllw $1, %mm1 /* t169 */ paddsw %mm0, %mm5 /* V65 -> result */ psubsw %mm0, %mm4 /* V64 ; free mm0 */ pmulhw x5a825a825a825a82, %mm4 /* V68 */ psraw $1, %mm3 /* t158 */ psubsw %mm6, %mm3 /* V66 */ movq %mm5, %mm2 /* duplicate V65 */ pmulhw x61f861f861f861f8, %mm3 /* V70 */ psllw $1, %mm6 /* t165 */ pmulhw x4546454645464546, %mm6 /* V69 */ psraw $1, %mm2 /* t172 *//* moved from next block */ movq 8*5(%esi), %mm0 /* V56 */ psllw $1, %mm4 /* t174 *//* moved from next block */ psraw $1, %mm0 /* t177=t188 */ nop psubsw %mm3, %mm6 /* V72 */ psubsw %mm1, %mm3 /* V71 ; free mm1 */ psubsw %mm2, %mm6 /* V73 ; free mm2 *//* moved from next block */ psraw $1, %mm5 /* t178=t189 */ psubsw %mm6, %mm4 /* V74 *//* moved from next block */ movq %mm0, %mm1 /* duplicate t177=t188 */ paddsw %mm4, %mm3 /* V75 *//* moved from next block */ paddsw %mm5, %mm0 /* tm1 *//* location * 5 - V56 * 13 - V57 * 9 - V58 * X - V59, mm7 * X - V65, mm5 * X - V73, mm6 * X - V74, mm4 * X - V75, mm3 * free mm0, mm1 & mm2 * moved above * movq 8*5(%esi), %mm0 V56 * psllw $1, %mm0 t177=t188 ! new !! * psllw $1, %mm5 t178=t189 ! new !! * movq %mm0, %mm1 duplicate t177=t188 * paddsw %mm5, %mm0 tm1 */ movq 8*13(%esi), %mm2 /* V57 */ psubsw %mm5, %mm1 /* tm15; free mm5 */ movq %mm0, 8(%esi) /* tm1; free mm0 */ psraw $1, %mm7 /* t182=t184 ! new !! *//* save the store as used directly in the transpose * movq %mm1, 120(%esi) tm15; free mm1 */ movq %mm7, %mm5 /* duplicate t182=t184 */ psubsw %mm3, %mm7 /* tm7 */ paddsw %mm3, %mm5 /* tm9; free mm3 */ movq 8*9(%esi), %mm0 /* V58 */ movq %mm2, %mm3 /* duplicate V57 */ movq %mm7, 8*7(%esi) /* tm7; free mm7 */ psubsw %mm6, %mm3 /* tm13 */ paddsw %mm6, %mm2 /* tm3 ; free mm6 *//* moved up from the transpose */ movq %mm3, %mm7/* moved up from the transpose */ punpcklwd %mm1, %mm3 movq %mm0, %mm6 /* duplicate V58 */ movq %mm2, 8*3(%esi) /* tm3; free mm2 */ paddsw %mm4, %mm0 /* tm5 */ psubsw %mm4, %mm6 /* tm11; free mm4 *//* moved up from the transpose */ punpckhwd %mm1, %mm7 movq %mm0, 8*5(%esi) /* tm5; free mm0 *//* moved up from the transpose */ movq %mm5, %mm2/* transpose - M4 part * --------- --------- * | M1 | M2 | | M1'| M3'| * --------- --> --------- * | M3 | M4 | | M2'| M4'| * --------- --------- * Two alternatives: use full mmword approach so the following code can be * scheduled before the transpose is done without stores, or use the faster * half mmword stores (when possible) */ movd %mm3, 8*9+4(%esi) /* MS part of tmt9 */ punpcklwd %mm6, %mm5 movd %mm7, 8*13+4(%esi) /* MS part of tmt13 */ punpckhwd %mm6, %mm2 movd %mm5, 8*9(%esi) /* LS part of tmt9 */ punpckhdq %mm3, %mm5 /* free mm3 */ movd %mm2, 8*13(%esi) /* LS part of tmt13 */ punpckhdq %mm7, %mm2 /* free mm7 *//* moved up from the M3 transpose */ movq 8*8(%esi), %mm0/* moved up from the M3 transpose */ movq 8*10(%esi), %mm1/* moved up from the M3 transpose */ movq %mm0, %mm3/* shuffle the rest of the data, and write it with 2 mmword writes */ movq %mm5, 8*11(%esi) /* tmt11 *//* moved up from the M3 transpose */ punpcklwd %mm1, %mm0 movq %mm2, 8*15(%esi) /* tmt15 *//* moved up from the M3 transpose */ punpckhwd %mm1, %mm3/* transpose - M3 part * moved up to previous code section * movq 8*8(%esi), %mm0 * movq 8*10(%esi), %mm1 * movq %mm0, %mm3 * punpcklwd %mm1, %mm0 * punpckhwd %mm1, %mm3 */ movq 8*12(%esi), %mm6 movq 8*14(%esi), %mm4 movq %mm6, %mm2/* shuffle the data and write the lower parts of the transposed in 4 dwords */ punpcklwd %mm4, %mm6 movq %mm0, %mm1 punpckhdq %mm6, %mm1 movq %mm3, %mm7 punpckhwd %mm4, %mm2 /* free mm4 */ punpckldq %mm6, %mm0 /* free mm6 *//* moved from next block */ movq 8*13(%esi), %mm4 /* tmt13 */ punpckldq %mm2, %mm3 punpckhdq %mm2, %mm7 /* free mm2 *//* moved from next block */ movq %mm3, %mm5 /* duplicate tmt5 *//* column 1: even part (after transpose)* moved above* movq %mm3, %mm5 duplicate tmt5* movq 8*13(%esi), %mm4 tmt13*/ psubsw %mm4, %mm3 /* V134 */ pmulhw x5a825a825a825a82, %mm3 /* 23170 ->V136 */ movq 8*9(%esi), %mm6 /* tmt9 */ paddsw %mm4, %mm5 /* V135 ; mm4 free */ movq %mm0, %mm4 /* duplicate tmt1 */ paddsw %mm6, %mm0 /* V137 */ psubsw %mm6, %mm4 /* V138 ; mm6 free */ psllw $2, %mm3 /* t290 */ psubsw %mm5, %mm3 /* V139 */ movq %mm0, %mm6 /* duplicate V137 */ paddsw %mm5, %mm0 /* V140 */ movq %mm4, %mm2 /* duplicate V138 */ paddsw %mm3, %mm2 /* V141 */ psubsw %mm3, %mm4 /* V142 ; mm3 free */ movq %mm0, 8*9(%esi) /* V140 */ psubsw %mm5, %mm6 /* V143 ; mm5 free *//* moved from next block */ movq 8*11(%esi), %mm0 /* tmt11 */ movq %mm2, 8*13(%esi) /* V141 *//* moved from next block */ movq %mm0, %mm2 /* duplicate tmt11 *//* column 1: odd part (after transpose) *//* moved up to the prev block * movq 8*11(%esi), %mm0 tmt11 * movq %mm0, %mm2 duplicate tmt11 */ movq 8*15(%esi), %mm5 /* tmt15 */ psubsw %mm7, %mm0 /* V144 */ movq %mm0, %mm3 /* duplicate V144 */ paddsw %mm7, %mm2 /* V147 ; free mm7 */ pmulhw x539f539f539f539f, %mm0 /* 21407-> V151 */ movq %mm1, %mm7 /* duplicate tmt3 */ paddsw %mm5, %mm7 /* V145 */ psubsw %mm5, %mm1 /* V146 ; free mm5 */ psubsw %mm1, %mm3 /* V150 */ movq %mm7, %mm5 /* duplicate V145 */ pmulhw x4546454645464546, %mm1 /* 17734-> V153 */ psubsw %mm2, %mm5 /* V148 */ pmulhw x61f861f861f861f8, %mm3 /* 25080-> V154 */ psllw $2, %mm0 /* t311 */ pmulhw x5a825a825a825a82, %mm5 /* 23170-> V152 */ paddsw %mm2, %mm7 /* V149 ; free mm2 */ psllw $1, %mm1 /* t313 */ nop /* without the nop - freeze here for one clock */ movq %mm3, %mm2 /* duplicate V154 */ psubsw %mm0, %mm3 /* V155 ; free mm0 */ psubsw %mm2, %mm1 /* V156 ; free mm2 *//* moved from the next block */ movq %mm6, %mm2 /* duplicate V143 *//* moved from the next block */ movq 8*13(%esi), %mm0 /* V141 */ psllw $1, %mm1 /* t315 */ psubsw %mm7, %mm1 /* V157 (keep V149) */ psllw $2, %mm5 /* t317 */ psubsw %mm1, %mm5 /* V158 */ psllw $1, %mm3 /* t319 */ paddsw %mm5, %mm3 /* V159 *//* column 1: output butterfly (after transform) * moved to the prev block * movq %mm6, %mm2 duplicate V143 * movq 8*13(%esi), %mm0 V141 */ psubsw %mm3, %mm2 /* V163 */ paddsw %mm3, %mm6 /* V164 ; free mm3 */ movq %mm4, %mm3 /* duplicate V142 */ psubsw %mm5, %mm4 /* V165 ; free mm5 */ movq %mm2, scratch7 /* out7 */ psraw $4, %mm6 psraw $4, %mm4 paddsw %mm5, %mm3 /* V162 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -