📄 mmxidct_asm.s
字号:
movq %mm7, %mm5 /* duplicate t182=t184 */ psubsw %mm3, %mm7 /* tm7 */ paddsw %mm3, %mm5 /* tm9; free mm3 */ movq 8*9(%esi), %mm0 /* V58 */ movq %mm2, %mm3 /* duplicate V57 */ movq %mm7, 8*7(%esi) /* tm7; free mm7 */ psubsw %mm6, %mm3 /* tm13 */ paddsw %mm6, %mm2 /* tm3 ; free mm6 *//* moved up from the transpose */ movq %mm3, %mm7/* moved up from the transpose */ punpcklwd %mm1, %mm3 movq %mm0, %mm6 /* duplicate V58 */ movq %mm2, 8*3(%esi) /* tm3; free mm2 */ paddsw %mm4, %mm0 /* tm5 */ psubsw %mm4, %mm6 /* tm11; free mm4 *//* moved up from the transpose */ punpckhwd %mm1, %mm7 movq %mm0, 8*5(%esi) /* tm5; free mm0 *//* moved up from the transpose */ movq %mm5, %mm2/* transpose - M4 part * --------- --------- * | M1 | M2 | | M1'| M3'| * --------- --> --------- * | M3 | M4 | | M2'| M4'| * --------- --------- * Two alternatives: use full mmword approach so the following code can be * scheduled before the transpose is done without stores, or use the faster * half mmword stores (when possible) */ movd %mm3, 8*9+4(%esi) /* MS part of tmt9 */ punpcklwd %mm6, %mm5 movd %mm7, 8*13+4(%esi) /* MS part of tmt13 */ punpckhwd %mm6, %mm2 movd %mm5, 8*9(%esi) /* LS part of tmt9 */ punpckhdq %mm3, %mm5 /* free mm3 */ movd %mm2, 8*13(%esi) /* LS part of tmt13 */ punpckhdq %mm7, %mm2 /* free mm7 *//* moved up from the M3 transpose */ movq 8*8(%esi), %mm0/* moved up from the M3 transpose */ movq 8*10(%esi), %mm1/* moved up from the M3 transpose */ movq %mm0, %mm3/* shuffle the rest of the data, and write it with 2 mmword writes */ movq %mm5, 8*11(%esi) /* tmt11 *//* moved up from the M3 transpose */ punpcklwd %mm1, %mm0 movq %mm2, 8*15(%esi) /* tmt15 *//* moved up from the M3 transpose */ punpckhwd %mm1, %mm3/* transpose - M3 part * moved up to previous code section * movq 8*8(%esi), %mm0 * movq 8*10(%esi), %mm1 * movq %mm0, %mm3 * punpcklwd %mm1, %mm0 * punpckhwd %mm1, %mm3 */ movq 8*12(%esi), %mm6 movq 8*14(%esi), %mm4 movq %mm6, %mm2/* shuffle the data and write the lower parts of the transposed in 4 dwords */ punpcklwd %mm4, %mm6 movq %mm0, %mm1 punpckhdq %mm6, %mm1 movq %mm3, %mm7 punpckhwd %mm4, %mm2 /* free mm4 */ punpckldq %mm6, %mm0 /* free mm6 *//* moved from next block */ movq 8*13(%esi), %mm4 /* tmt13 */ punpckldq %mm2, %mm3 punpckhdq %mm2, %mm7 /* free mm2 *//* moved from next block */ movq %mm3, %mm5 /* duplicate tmt5 *//* column 1: even part (after transpose)* moved above* movq %mm3, %mm5 duplicate tmt5* movq 8*13(%esi), %mm4 tmt13*/ psubsw %mm4, %mm3 /* V134 */ pmulhw x5a825a825a825a82, %mm3 /* 23170 ->V136 */ movq 8*9(%esi), %mm6 /* tmt9 */ paddsw %mm4, %mm5 /* V135 ; mm4 free */ movq %mm0, %mm4 /* duplicate tmt1 */ paddsw %mm6, %mm0 /* V137 */ psubsw %mm6, %mm4 /* V138 ; mm6 free */ psllw $2, %mm3 /* t290 */ psubsw %mm5, %mm3 /* V139 */ movq %mm0, %mm6 /* duplicate V137 */ paddsw %mm5, %mm0 /* V140 */ movq %mm4, %mm2 /* duplicate V138 */ paddsw %mm3, %mm2 /* V141 */ psubsw %mm3, %mm4 /* V142 ; mm3 free */ movq %mm0, 8*9(%esi) /* V140 */ psubsw %mm5, %mm6 /* V143 ; mm5 free *//* moved from next block */ movq 8*11(%esi), %mm0 /* tmt11 */ movq %mm2, 8*13(%esi) /* V141 *//* moved from next block */ movq %mm0, %mm2 /* duplicate tmt11 *//* column 1: odd part (after transpose) *//* moved up to the prev block * movq 8*11(%esi), %mm0 tmt11 * movq %mm0, %mm2 duplicate tmt11 */ movq 8*15(%esi), %mm5 /* tmt15 */ psubsw %mm7, %mm0 /* V144 */ movq %mm0, %mm3 /* duplicate V144 */ paddsw %mm7, %mm2 /* V147 ; free mm7 */ pmulhw x539f539f539f539f, %mm0 /* 21407-> V151 */ movq %mm1, %mm7 /* duplicate tmt3 */ paddsw %mm5, %mm7 /* V145 */ psubsw %mm5, %mm1 /* V146 ; free mm5 */ psubsw %mm1, %mm3 /* V150 */ movq %mm7, %mm5 /* duplicate V145 */ pmulhw x4546454645464546, %mm1 /* 17734-> V153 */ psubsw %mm2, %mm5 /* V148 */ pmulhw x61f861f861f861f8, %mm3 /* 25080-> V154 */ psllw $2, %mm0 /* t311 */ pmulhw x5a825a825a825a82, %mm5 /* 23170-> V152 */ paddsw %mm2, %mm7 /* V149 ; free mm2 */ psllw $1, %mm1 /* t313 */ nop /* without the nop - freeze here for one clock */ movq %mm3, %mm2 /* duplicate V154 */ psubsw %mm0, %mm3 /* V155 ; free mm0 */ psubsw %mm2, %mm1 /* V156 ; free mm2 *//* moved from the next block */ movq %mm6, %mm2 /* duplicate V143 *//* moved from the next block */ movq 8*13(%esi), %mm0 /* V141 */ psllw $1, %mm1 /* t315 */ psubsw %mm7, %mm1 /* V157 (keep V149) */ psllw $2, %mm5 /* t317 */ psubsw %mm1, %mm5 /* V158 */ psllw $1, %mm3 /* t319 */ paddsw %mm5, %mm3 /* V159 *//* column 1: output butterfly (after transform) * moved to the prev block * movq %mm6, %mm2 duplicate V143 * movq 8*13(%esi), %mm0 V141 */ psubsw %mm3, %mm2 /* V163 */ paddsw %mm3, %mm6 /* V164 ; free mm3 */ movq %mm4, %mm3 /* duplicate V142 */ psubsw %mm5, %mm4 /* V165 ; free mm5 */ movq %mm2, scratch7 /* out7 */ psraw $4, %mm6 psraw $4, %mm4 paddsw %mm5, %mm3 /* V162 */ movq 8*9(%esi), %mm2 /* V140 */ movq %mm0, %mm5 /* duplicate V141 *//* in order not to perculate this line up, * we read 72(%esi) very near to this location */ movq %mm6, 8*9(%esi) /* out9 */ paddsw %mm1, %mm0 /* V161 */ movq %mm3, scratch5 /* out5 */ psubsw %mm1, %mm5 /* V166 ; free mm1 */ movq %mm4, 8*11(%esi) /* out11 */ psraw $4, %mm5 movq %mm0, scratch3 /* out3 */ movq %mm2, %mm4 /* duplicate V140 */ movq %mm5, 8*13(%esi) /* out13 */ paddsw %mm7, %mm2 /* V160 *//* moved from the next block */ movq 8(%esi), %mm0 psubsw %mm7, %mm4 /* V167 ; free mm7 *//* moved from the next block */ movq 8*3(%esi), %mm7 psraw $4, %mm4 movq %mm2, scratch1 /* out1 *//* moved from the next block */ movq %mm0, %mm1 movq %mm4, 8*15(%esi) /* out15 *//* moved from the next block */ punpcklwd %mm7, %mm0/* transpose - M2 parts * moved up to the prev block * movq 8(%esi), %mm0 * movq 8*3(%esi), %mm7 * movq %mm0, %mm1 * punpcklwd %mm7, %mm0 */ movq 8*5(%esi), %mm5 punpckhwd %mm7, %mm1 movq 8*7(%esi), %mm4 movq %mm5, %mm3/* shuffle the data and write the lower parts of the trasposed in 4 dwords */ movd %mm0, 8*8(%esi) /* LS part of tmt8 */ punpcklwd %mm4, %mm5 movd %mm1, 8*12(%esi) /* LS part of tmt12 */ punpckhwd %mm4, %mm3 movd %mm5, 8*8+4(%esi) /* MS part of tmt8 */ punpckhdq %mm5, %mm0 /* tmt10 */ movd %mm3, 8*12+4(%esi) /* MS part of tmt12 */ punpckhdq %mm3, %mm1 /* tmt14 *//* transpose - M1 parts */ movq (%esi), %mm7 movq 8*2(%esi), %mm2 movq %mm7, %mm6 movq 8*4(%esi), %mm5 punpcklwd %mm2, %mm7 movq 8*6(%esi), %mm4 punpckhwd %mm2, %mm6 /* free mm2 */ movq %mm5, %mm3 punpcklwd %mm4, %mm5 punpckhwd %mm4, %mm3 /* free mm4 */ movq %mm7, %mm2 movq %mm6, %mm4 punpckldq %mm5, %mm7 /* tmt0 */ punpckhdq %mm5, %mm2 /* tmt2 ; free mm5 *//* shuffle the rest of the data, and write it with 2 mmword writes */ punpckldq %mm3, %mm6 /* tmt4 *//* moved from next block */ movq %mm2, %mm5 /* duplicate tmt2 */ punpckhdq %mm3, %mm4 /* tmt6 ; free mm3 *//* moved from next block */ movq %mm0, %mm3 /* duplicate tmt10 *//* column 0: odd part (after transpose) *moved up to prev block * movq %mm0, %mm3 duplicate tmt10 * movq %mm2, %mm5 duplicate tmt2 */ psubsw %mm4, %mm0 /* V110 */ paddsw %mm4, %mm3 /* V113 ; free mm4 */ movq %mm0, %mm4 /* duplicate V110 */ paddsw %mm1, %mm2 /* V111 */ pmulhw x539f539f539f539f, %mm0 /* 21407-> V117 */ psubsw %mm1, %mm5 /* V112 ; free mm1 */ psubsw %mm5, %mm4 /* V116 */ movq %mm2, %mm1 /* duplicate V111 */ pmulhw x4546454645464546, %mm5 /* 17734-> V119 */ psubsw %mm3, %mm2 /* V114 */ pmulhw x61f861f861f861f8, %mm4 /* 25080-> V120 */ paddsw %mm3, %mm1 /* V115 ; free mm3 */ pmulhw x5a825a825a825a82, %mm2 /* 23170-> V118 */ psllw $2, %mm0 /* t266 */ movq %mm1, (%esi) /* save V115 */ psllw $1, %mm5 /* t268 */ psubsw %mm4, %mm5 /* V122 */ psubsw %mm0, %mm4 /* V121 ; free mm0 */ psllw $1, %mm5 /* t270 */ psubsw %mm1, %mm5 /* V123 ; free mm1 */ psllw $2, %mm2 /* t272 */ psubsw %mm5, %mm2 /* V124 (keep V123) */ psllw $1, %mm4 /* t274 */ movq %mm5, 8*2(%esi) /* save V123 ; free mm5 */ paddsw %mm2, %mm4 /* V125 (keep V124) *//* column 0: even part (after transpose) */ movq 8*12(%esi), %mm0 /* tmt12 */ movq %mm6, %mm3 /* duplicate tmt4 */ psubsw %mm0, %mm6 /* V100 */ paddsw %mm0, %mm3 /* V101 ; free mm0 */ pmulhw x5a825a825a825a82, %mm6 /* 23170 ->V102 */ movq %mm7, %mm5 /* duplicate tmt0 */ movq 8*8(%esi), %mm1 /* tmt8 */ paddsw %mm1, %mm7 /* V103 */ psubsw %mm1, %mm5 /* V104 ; free mm1 */ movq %mm7, %mm0 /* duplicate V103 */ psllw $2, %mm6 /* t245 */ paddsw %mm3, %mm7 /* V106 */ movq %mm5, %mm1 /* duplicate V104 */ psubsw %mm3, %mm6 /* V105 */ psubsw %mm3, %mm0 /* V109; free mm3 */ paddsw %mm6, %mm5 /* V107 */ psubsw %mm6, %mm1 /* V108 ; free mm6 *//* column 0: output butterfly (after transform) */ movq %mm1, %mm3 /* duplicate V108 */ paddsw %mm2, %mm1 /* out4 */ psraw $4, %mm1 psubsw %mm2, %mm3 /* out10 ; free mm2 */ psraw $4, %mm3 movq %mm0, %mm6 /* duplicate V109 */ movq %mm1, 8*4(%esi) /* out4 ; free mm1 */ psubsw %mm4, %mm0 /* out6 */ movq %mm3, 8*10(%esi) /* out10 ; free mm3 */ psraw $4, %mm0 paddsw %mm4, %mm6 /* out8 ; free mm4 */ movq %mm7, %mm1 /* duplicate V106 */ movq %mm0, 8*6(%esi) /* out6 ; free mm0 */ psraw $4, %mm6 movq (%esi), %mm4 /* V115 */ movq %mm6, 8*8(%esi) /* out8 ; free mm6 */ movq %mm5, %mm2 /* duplicate V107 */ movq 8*2(%esi), %mm3 /* V123 */ paddsw %mm4, %mm7 /* out0 *//* moved up from next block */ movq scratch3, %mm0 psraw $4, %mm7/* moved up from next block */ movq scratch5, %mm6 psubsw %mm4, %mm1 /* out14 ; free mm4 */ paddsw %mm3, %mm5 /* out2 */ psraw $4, %mm1 movq %mm7, (%esi) /* out0 ; free mm7 */ psraw $4, %mm5 movq %mm1, 8*14(%esi) /* out14 ; free mm1 */ psubsw %mm3, %mm2 /* out12 ; free mm3 */ movq %mm5, 8*2(%esi) /* out2 ; free mm5 */ psraw $4, %mm2/* moved up to the prev block */ movq scratch7, %mm4/* moved up to the prev block */ psraw $4, %mm0 movq %mm2, 8*12(%esi) /* out12 ; free mm2 *//* moved up to the prev block */ psraw $4, %mm6/* move back the data to its correct place* moved up to the prev block * movq scratch3, %mm0 * movq scratch5, %mm6 * movq scratch7, %mm4 * psraw $4, %mm0 * psraw $4, %mm6*/ movq scratch1, %mm1 psraw $4, %mm4 movq %mm0, 8*3(%esi) /* out3 */ psraw $4, %mm1 movq %mm6, 8*5(%esi) /* out5 */ movq %mm4, 8*7(%esi) /* out7 */ movq %mm1, 8(%esi) /* out1 */ popl %edi popl %esi popl %edx popl %ecx popl %ebx movl %ebp,%esp popl %ebp ret.Lfe1: .size IDCT_mmx,.Lfe1-IDCT_mmx#endif /* i386 && USE_MMX */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -