📄 vdec_idctmmx.s
字号:
movq 8*9(%esi), %mm2 /* V140 */ movq %mm0, %mm5 /* duplicate V141 *//* in order not to perculate this line up, * we read 72(%esi) very near to this location */ movq %mm6, 8*9(%esi) /* out9 */ paddsw %mm1, %mm0 /* V161 */ movq %mm3, scratch5 /* out5 */ psubsw %mm1, %mm5 /* V166 ; free mm1 */ movq %mm4, 8*11(%esi) /* out11 */ psraw $4, %mm5 movq %mm0, scratch3 /* out3 */ movq %mm2, %mm4 /* duplicate V140 */ movq %mm5, 8*13(%esi) /* out13 */ paddsw %mm7, %mm2 /* V160 *//* moved from the next block */ movq 8(%esi), %mm0 psubsw %mm7, %mm4 /* V167 ; free mm7 *//* moved from the next block */ movq 8*3(%esi), %mm7 psraw $4, %mm4 movq %mm2, scratch1 /* out1 *//* moved from the next block */ movq %mm0, %mm1 movq %mm4, 8*15(%esi) /* out15 *//* moved from the next block */ punpcklwd %mm7, %mm0/* transpose - M2 parts * moved up to the prev block * movq 8(%esi), %mm0 * movq 8*3(%esi), %mm7 * movq %mm0, %mm1 * punpcklwd %mm7, %mm0 */ movq 8*5(%esi), %mm5 punpckhwd %mm7, %mm1 movq 8*7(%esi), %mm4 movq %mm5, %mm3/* shuffle the data and write the lower parts of the trasposed in 4 dwords */ movd %mm0, 8*8(%esi) /* LS part of tmt8 */ punpcklwd %mm4, %mm5 movd %mm1, 8*12(%esi) /* LS part of tmt12 */ punpckhwd %mm4, %mm3 movd %mm5, 8*8+4(%esi) /* MS part of tmt8 */ punpckhdq %mm5, %mm0 /* tmt10 */ movd %mm3, 8*12+4(%esi) /* MS part of tmt12 */ punpckhdq %mm3, %mm1 /* tmt14 *//* transpose - M1 parts */ movq (%esi), %mm7 movq 8*2(%esi), %mm2 movq %mm7, %mm6 movq 8*4(%esi), %mm5 punpcklwd %mm2, %mm7 movq 8*6(%esi), %mm4 punpckhwd %mm2, %mm6 /* free mm2 */ movq %mm5, %mm3 punpcklwd %mm4, %mm5 punpckhwd %mm4, %mm3 /* free mm4 */ movq %mm7, %mm2 movq %mm6, %mm4 punpckldq %mm5, %mm7 /* tmt0 */ punpckhdq %mm5, %mm2 /* tmt2 ; free mm5 *//* shuffle the rest of the data, and write it with 2 mmword writes */ punpckldq %mm3, %mm6 /* tmt4 *//* moved from next block */ movq %mm2, %mm5 /* duplicate tmt2 */ punpckhdq %mm3, %mm4 /* tmt6 ; free mm3 *//* moved from next block */ movq %mm0, %mm3 /* duplicate tmt10 *//* column 0: odd part (after transpose) *moved up to prev block * movq %mm0, %mm3 duplicate tmt10 * movq %mm2, %mm5 duplicate tmt2 */ psubsw %mm4, %mm0 /* V110 */ paddsw %mm4, %mm3 /* V113 ; free mm4 */ movq %mm0, %mm4 /* duplicate V110 */ paddsw %mm1, %mm2 /* V111 */ pmulhw x539f539f539f539f, %mm0 /* 21407-> V117 */ psubsw %mm1, %mm5 /* V112 ; free mm1 */ psubsw %mm5, %mm4 /* V116 */ movq %mm2, %mm1 /* duplicate V111 */ pmulhw x4546454645464546, %mm5 /* 17734-> V119 */ psubsw %mm3, %mm2 /* V114 */ pmulhw x61f861f861f861f8, %mm4 /* 25080-> V120 */ paddsw %mm3, %mm1 /* V115 ; free mm3 */ pmulhw x5a825a825a825a82, %mm2 /* 23170-> V118 */ psllw $2, %mm0 /* t266 */ movq %mm1, (%esi) /* save V115 */ psllw $1, %mm5 /* t268 */ psubsw %mm4, %mm5 /* V122 */ psubsw %mm0, %mm4 /* V121 ; free mm0 */ psllw $1, %mm5 /* t270 */ psubsw %mm1, %mm5 /* V123 ; free mm1 */ psllw $2, %mm2 /* t272 */ psubsw %mm5, %mm2 /* V124 (keep V123) */ psllw $1, %mm4 /* t274 */ movq %mm5, 8*2(%esi) /* save V123 ; free mm5 */ paddsw %mm2, %mm4 /* V125 (keep V124) *//* column 0: even part (after transpose) */ movq 8*12(%esi), %mm0 /* tmt12 */ movq %mm6, %mm3 /* duplicate tmt4 */ psubsw %mm0, %mm6 /* V100 */ paddsw %mm0, %mm3 /* V101 ; free mm0 */ pmulhw x5a825a825a825a82, %mm6 /* 23170 ->V102 */ movq %mm7, %mm5 /* duplicate tmt0 */ movq 8*8(%esi), %mm1 /* tmt8 */ paddsw %mm1, %mm7 /* V103 */ psubsw %mm1, %mm5 /* V104 ; free mm1 */ movq %mm7, %mm0 /* duplicate V103 */ psllw $2, %mm6 /* t245 */ paddsw %mm3, %mm7 /* V106 */ movq %mm5, %mm1 /* duplicate V104 */ psubsw %mm3, %mm6 /* V105 */ psubsw %mm3, %mm0 /* V109; free mm3 */ paddsw %mm6, %mm5 /* V107 */ psubsw %mm6, %mm1 /* V108 ; free mm6 *//* column 0: output butterfly (after transform) */ movq %mm1, %mm3 /* duplicate V108 */ paddsw %mm2, %mm1 /* out4 */ psraw $4, %mm1 psubsw %mm2, %mm3 /* out10 ; free mm2 */ psraw $4, %mm3 movq %mm0, %mm6 /* duplicate V109 */ movq %mm1, 8*4(%esi) /* out4 ; free mm1 */ psubsw %mm4, %mm0 /* out6 */ movq %mm3, 8*10(%esi) /* out10 ; free mm3 */ psraw $4, %mm0 paddsw %mm4, %mm6 /* out8 ; free mm4 */ movq %mm7, %mm1 /* duplicate V106 */ movq %mm0, 8*6(%esi) /* out6 ; free mm0 */ psraw $4, %mm6 movq (%esi), %mm4 /* V115 */ movq %mm6, 8*8(%esi) /* out8 ; free mm6 */ movq %mm5, %mm2 /* duplicate V107 */ movq 8*2(%esi), %mm3 /* V123 */ paddsw %mm4, %mm7 /* out0 *//* moved up from next block */ movq scratch3, %mm0 psraw $4, %mm7/* moved up from next block */ movq scratch5, %mm6 psubsw %mm4, %mm1 /* out14 ; free mm4 */ paddsw %mm3, %mm5 /* out2 */ psraw $4, %mm1 movq %mm7, (%esi) /* out0 ; free mm7 */ psraw $4, %mm5 movq %mm1, 8*14(%esi) /* out14 ; free mm1 */ psubsw %mm3, %mm2 /* out12 ; free mm3 */ movq %mm5, 8*2(%esi) /* out2 ; free mm5 */ psraw $4, %mm2/* moved up to the prev block */ movq scratch7, %mm4/* moved up to the prev block */ psraw $4, %mm0 movq %mm2, 8*12(%esi) /* out12 ; free mm2 *//* moved up to the prev block */ psraw $4, %mm6/* move back the data to its correct place* moved up to the prev block * movq scratch3, %mm0 * movq scratch5, %mm6 * movq scratch7, %mm4 * psraw $4, %mm0 * psraw $4, %mm6*/ movq scratch1, %mm1 psraw $4, %mm4 movq %mm0, 8*3(%esi) /* out3 */ psraw $4, %mm1 movq %mm6, 8*5(%esi) /* out5 */ movq %mm4, 8*7(%esi) /* out7 */ movq %mm1, 8(%esi) /* out1 *//* transpose matrix */ movl $8, %ebx /* ebx is x_size */ movl %esi, %edi /* pointer to the matrix */ movl %ebx, %ecx sal $2, %ecx movl %ebx, %eax addl %ebx, %ecx subl $4, %eax /* eax is inner loop variable */ addl %ebx, %ecx /* ecx is 6*row size */ movl %eax, %edx /* edx is the outer loop variable */.L1: movq (%esi), %mm0 /* first line */ movq (%esi,%ebx,4), %mm2 /* third line */ movq %mm0, %mm6 /* copy first line */ punpcklwd (%esi,%ebx,2), %mm0 /* interleave fist and second lines */ movq %mm2, %mm7 /* copy third line */ punpcklwd (%esi,%ecx), %mm2 /* interleave third and fourth lines */ movq %mm0, %mm4 /* copy first intermediate result */ movq (%esi,%ebx,2), %mm1 /* second line *//* the next line 'punpcklwd %mm2, %mm0' inverted two pixels. *//* punpckldq make printing cleaner */ punpckldq %mm2, %mm0 /* interleave to produce result 1 */ movq (%esi,%ecx), %mm3 /* fourth line */ punpckhdq %mm2, %mm4 /* interleave to produce result 2 */ movq %mm0, (%esi) /* write result 1 */ punpckhwd %mm1, %mm6 /* interleave first and second lines */ movq %mm4, (%esi,%ebx,2) /* write result 2 */ punpckhwd %mm3, %mm7 /* interleave 3rd and 4th lines */ movq %mm6, %mm5 /* copy first intermediate result */ punpckldq %mm7, %mm6 /* interleave to produce result 3 */ leal (%edi,%ebx,8), %edi /* point to 4x4 set 4 rows down */ punpckhdq %mm7, %mm5 /* interleave to produce result 4 */ movq %mm6, (%esi,%ebx,4) /* write result 3 */ movq %mm5, (%esi,%ecx) /* write result 4 */ /* check to see if number of rows left is zero */ cmpl $0, %edx /* last time through you are done and ready to exit */ je .L3.L2: movq 8(%esi), %mm0 /* first line */ movq 8(%esi,%ebx,4), %mm2 /* third line */ movq %mm0, %mm6 /* copy first line */ punpcklwd 8(%esi,%ebx,2), %mm0 /* interleave first and second lines */ movq %mm2, %mm7 /* copy third line */ punpcklwd 8(%esi,%ecx), %mm2 /* interleave 3rd and 4th lines */ movq %mm0, %mm4 /* copy first intermediate */ movq (%edi), %mm1 /* first line */ punpckldq %mm2, %mm0 /* interleave to produce 1st result */ movq (%edi,%ebx,4), %mm3 /* third line */ punpckhdq %mm2, %mm4 /* interleave to produce 2nd result */ punpckhwd 8(%esi,%ebx,2), %mm6 /* interleave 1st and 2nd lines */ movq %mm1, %mm2 /* copy first line */ punpckhwd 8(%esi,%ecx), %mm7 /* interleave 3rd and 4th lines */ movq %mm6, %mm5 /* copy first intermediate */ movq %mm0, (%edi) /* write result 1 */ punpckhdq %mm7, %mm5 /* produce third result */ punpcklwd (%edi,%ebx,2), %mm1 /* interleave 1st and 2nd lines */ movq %mm3, %mm0 /* copy third line */ punpckhwd (%edi,%ebx,2), %mm2 /* interleave 1st and 2nd lines */ movq %mm4, (%edi,%ebx,2) /* write result 2 */ punpckldq %mm7, %mm6 /* produce fourth result */ punpcklwd (%edi,%ecx), %mm3 /* interleave 3rd and 4th lines */ movq %mm1, %mm4 /* copy first intermediate */ movq %mm6, (%edi,%ebx,4) /* write result 3 */ punpckldq %mm3, %mm1 punpckhwd (%edi,%ecx), %mm0 /* interleave 3rd and 4th lines */ movq %mm2, %mm6 /* copy second intermediate */ movq %mm5, (%edi,%ecx) /* write result 4 */ punpckhdq %mm3, %mm4 /* produce second result */ movq %mm1, 8(%esi) /* write result 5 */ punpckldq %mm0, %mm2 /* produce third result */ movq %mm4, 8(%esi,%ebx,2) /* write result 6 */ punpckhdq %mm0, %mm6 /* produce fourth result */ movq %mm2, 8(%esi,%ebx,4) /* write result 7 */ movq %mm6, 8(%esi,%ecx) /* write result 8 */ /* increment %esi to point to next 4x4 block in same row */ addl $8, %esi /* increment %edi to point to nxt 4x4 block below current */ leal (%edi,%ebx,8), %edi sub $4, %eax /* decrement inner loop var */ jnz .L2 /* %edi points to start of second row in block just finished */ sal $1, %edx leal 8(%esi,%ebx,8), %esi subl %edx, %esi /* subtract the number of bytes in last row */ /* now we point to spot where row=col */ subl $8, %edx /* sub 4 from row number */ sarl $1, %edx mov %esi, %edi mov %edx, %eax /* reset x_size to outer loop variable to start new row */ jmp .L1.L3: emms popl %edi popl %esi popl %edx popl %ecx popl %ebx movl %ebp,%esp popl %ebp ret.Lfe1: .size vdec_IDCT,.Lfe1-vdec_IDCT
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -