📄 video_yuv_mmx.s
字号:
/*
 * Tail of the setup code.  The start of this routine (entry label, register
 * pushes, stack-frame allocation) lies outside this excerpt.
 *
 * %ebx is halved and added to %esi, %esi is added to %edx, and the negated
 * %ebx is stored in the FrameWidth slot.  PrepareChromaLine reloads that
 * slot as the column counter for every pair of rows.
 * NOTE(review): presumably %ebx holds the luma width here, so the slot ends
 * up as -(width/2) and %esi/%edx point just past the end of the current
 * V and U rows -- confirm against the setup code above this excerpt.
 */
	sarl	%ebx
	addl	%ebx,%esi
	addl	%esi,%edx
	negl	%ebx
	movl	%ebx,FrameWidth(%esp)

/*
 * Register Usage (as established by the code below):
 *   %ebx  column counter loaded from FrameWidth; stepped by 4 per block,
 *         the block loop repeats while it is still negative (jl)
 *   %edx  base for the U loads, indexed by %ebx
 *   %esi  base for the V loads, indexed by %ebx
 *   %ebp  luma row cursor (tmpYCursorEven, then tmpYCursorOdd),
 *         indexed by 2*%ebx
 *   %edi  output cursor; the even row is stored at (%edi), the odd row at
 *         (%edi,%eax)
 *   %eax  tmpCCOPitch while storing the odd row; YPitch while advancing
 *         the luma cursors
 *   %mm0-%mm7  scratch
 *
 * Scratch layout at temp_mmx(%esp), filled on the even row and reused on
 * the odd row:
 *   +0   4 words v-128          +24  low chroma R term
 *   +8   4 chroma G terms       +32  high chroma R term
 *   +16  low chroma B term      +40  high chroma B term
 */
PrepareChromaLine:
	movl	AspectCount(%esp),%ebp
	movl	FrameWidth(%esp),%ebx
	subl	$2,%ebp
	movl	CCOPitch(%esp),%eax
	movl	%eax,tmpCCOPitch(%esp)
	ja	continue		/* movl leaves EFLAGS alone: this tests the subl above */
/*
 * AspectCount-2 was not above zero: reload the counter and use an output
 * pitch of 0 for this row pair.  With pitch 0 the odd row is stored at the
 * same address as the even row and %edi is not advanced past it afterwards.
 */
	xorl	%eax,%eax
	addl	AspectAdjustmentCount(%esp),%ebp
	movl	%eax,tmpCCOPitch(%esp)
continue:
	movl	%ebp,AspectCount(%esp)

/*
 * One iteration converts 8 pixels of the even row and 8 pixels of the odd
 * row, sharing 4 U and 4 V samples between them.
 */
do_next_8x2_block:
	movl	tmpYCursorEven(%esp),%ebp
/* here is even line */
	movd	(%edx,%ebx,),%mm1	/* 4 u values */
	pxor	%mm0,%mm0		/* mm0=0 */
	movd	(%esi,%ebx,),%mm2	/* 4 v values */
	punpcklbw %mm0,%mm1		/* get 4 unsigned u as words */
	psubw	Minusg,%mm1		/* get 4 u-128 */
	punpcklbw %mm0,%mm2		/* get 4 unsigned v as words */
	psubw	Minusg,%mm2		/* get 4 v-128 */
	movq	%mm1,%mm3		/* copy of u-128 for the high u,v pairs */
	movq	%mm1,%mm5		/* copy of u-128 for the B term */
	punpcklwd %mm2,%mm1		/* get 2 low u,v pairs */
	pmaddwd	UVtG,%mm1
	punpckhwd %mm2,%mm3		/* create 2 high u,v pairs */
	pmaddwd	UVtG,%mm3
	movq	%mm2,temp_mmx(%esp)	/* save v-128 */
	movq	(%ebp,%ebx,2),%mm6	/* mm6 has 8 y pixels */
	psubusb	Yadd,%mm6		/* mm6 has 8 y-16 pixels */
	packssdw %mm3,%mm1		/* pack the 4 chroma G terms to signed words */
	movq	%mm6,%mm7		/* save the 8 y-16 pixels */
	punpcklbw %mm0,%mm6		/* mm6 has 4 low y-16 as words */
	pmullw	Ymul,%mm6
	punpckhbw %mm0,%mm7		/* mm7 has 4 high y-16 as words */
	pmullw	Ymul,%mm7
	movq	%mm1,%mm4
	movq	%mm1,temp_mmx+8(%esp)	/* save 4 chroma G values */
	punpcklwd %mm1,%mm1		/* chroma G replicate low 2 */
	movq	%mm6,%mm0		/* low y */
	punpckhwd %mm4,%mm4		/* chroma G replicate high 2 */
	movq	%mm7,%mm3		/* high y */
	psubw	%mm1,%mm6		/* 4 low G */
	psraw	GRightShift(%esp),%mm6
	psubw	%mm4,%mm7		/* 4 high G values in signed 16 bit */
	movq	%mm5,%mm2
	punpcklwd %mm5,%mm5		/* replicate the 2 low u pixels */
	pmullw	UtB,%mm5
	punpckhwd %mm2,%mm2		/* replicate the 2 high u pixels */
	psraw	GRightShift(%esp),%mm7
	pmullw	UtB,%mm2
	packuswb %mm7,%mm6		/* mm6: G7 G6 G5 G4 G3 G2 G1 G0 */
	movq	%mm5,temp_mmx+16(%esp)	/* low chroma B */
	paddw	%mm0,%mm5		/* 4 low B values in signed 16 bit */
	movq	%mm2,temp_mmx+40(%esp)	/* high chroma B */
	paddw	%mm3,%mm2		/* 4 high B values in signed 16 bit */
	psraw	BRightShift(%esp),%mm5	/* low B scaled down by 6+(8-5) */
	psraw	BRightShift(%esp),%mm2	/* high B scaled down by 6+(8-5) */
	packuswb %mm2,%mm5		/* mm5: B7 B6 B5 B4 B3 B2 B1 B0 */
	movq	temp_mmx(%esp),%mm2	/* 4 v values (v-128) */
	movq	%mm5,%mm1		/* mm1: copy of B, used from here on */
	movq	%mm2,%mm7
	punpcklwd %mm2,%mm2		/* replicate the 2 low v pixels */
	pmullw	VtR,%mm2
	punpckhwd %mm7,%mm7		/* replicate the 2 high v pixels */
	pmullw	VtR,%mm7
/*
 * paddusb X followed by psubusb X clamps every byte to at most 255-X
 * (saturating add, then subtract the same amount again).
 */
	paddusb	BUpperLimit(%esp),%mm1	/* mm1: saturate B+0FF-15 */
	movq	%mm2,temp_mmx+24(%esp)	/* low chroma R */
	paddw	%mm0,%mm2		/* 4 low R values in signed 16 bit */
	psraw	RRightShift(%esp),%mm2	/* low R scaled down by 6+(8-5) */
	/* NOTE(review): %mm4 is not read again before the odd row overwrites it */
	pxor	%mm4,%mm4		/* mm4=0 for 8->16 conversion */
	movq	%mm7,temp_mmx+32(%esp)	/* high chroma R */
	paddw	%mm3,%mm7		/* 4 high R values in signed 16 bit */
	psraw	RRightShift(%esp),%mm7	/* high R scaled down by 6+(8-5) */
	psubusb	BUpperLimit(%esp),%mm1
	packuswb %mm7,%mm2		/* mm2: R7 R6 R5 R4 R3 R2 R1 R0 */
	paddusb	GUpperLimit(%esp),%mm6	/* G fast patch ih */
	psubusb	GUpperLimit(%esp),%mm6	/* fast patch ih */
	paddusb	RUpperLimit(%esp),%mm2	/* R */
	psubusb	RUpperLimit(%esp),%mm2

/*
 * here we are packing from RGB24 to RGB16
 * input:
 *	mm6: G7 G6 G5 G4 G3 G2 G1 G0
 *	mm1: B7 B6 B5 B4 B3 B2 B1 B0
 *	mm2: R7 R6 R5 R4 R3 R2 R1 R0
 * assuming 8 original pixels in 0-H representation on mm6, mm1, mm2
 * when H=2**xBITS-1 (x is for R G B)
 * output:
 *	mm1- result: 4 low RGB16
 *	mm7- result: 4 high RGB16
 * using:	mm0- zero register
 *		mm3- temporary results
 * algorithm:
 *	for (i=0; i<8; i++) {
 *		RGB[i]=256*(R[i]<<(8-5))+(G[i]<<5)+B[i];
 *	}
 */
	psllq	RLeftShift(%esp),%mm2	/* position R in the most significant part of the byte */
	movq	%mm1,%mm7		/* mm7: copy of B for the high 4 pixels */
/*
 * note: no need for shift to place B on the least significant part of the byte
 * R in left position, B in the right position so they can be combined
 */
	punpcklbw %mm2,%mm1		/* mm1: 4 low 16 bit RB */
	pxor	%mm0,%mm0		/* mm0: 0 */
	punpckhbw %mm2,%mm7		/* mm7: 4 high 16 bit RB */
	movq	%mm6,%mm3		/* mm3: G */
	punpcklbw %mm0,%mm6		/* mm6: low 4 G 16 bit */
	psllw	GLeftShift(%esp),%mm6	/* shift low G 5 positions */
	punpckhbw %mm0,%mm3		/* mm3: high 4 G 16 bit */
	por	%mm6,%mm1		/* mm1: low RBG16 */
	psllw	GLeftShift(%esp),%mm3	/* shift high G 5 positions */
	por	%mm3,%mm7		/* mm7: high RBG16 */
	movl	tmpYCursorOdd(%esp),%ebp	/* moved to here to save cycles before odd line */
	movq	%mm1,(%edi)		/* !! aligned */

/*- start odd line: same chroma terms (from temp_mmx), new luma row */
	movq	(%ebp,%ebx,2),%mm1	/* mm1 has 8 y pixels */
	pxor	%mm2,%mm2
	psubusb	Yadd,%mm1		/* mm1 has 8 pixels y-16 */
	movq	%mm1,%mm5
	punpcklbw %mm2,%mm1		/* get 4 low y-16 pixels as words */
	pmullw	Ymul,%mm1		/* low 4 luminance contribution */
	punpckhbw %mm2,%mm5		/* 4 high y-16 */
	pmullw	Ymul,%mm5		/* high 4 luminance contribution */
	movq	%mm7,8(%edi)		/* !! aligned; high 4 pixels of the even row */
	movq	%mm1,%mm0
	paddw	temp_mmx+24(%esp),%mm0	/* low 4 R */
	movq	%mm5,%mm6
	psraw	RRightShift(%esp),%mm0	/* low R scaled down by 6+(8-5) */
	paddw	temp_mmx+32(%esp),%mm5	/* high 4 R */
	movq	%mm1,%mm2
	psraw	RRightShift(%esp),%mm5	/* high R scaled down by 6+(8-5) */
	paddw	temp_mmx+16(%esp),%mm2	/* low 4 B */
	packuswb %mm5,%mm0		/* mm0: R7 R6 R5 R4 R3 R2 R1 R0 */
	psraw	BRightShift(%esp),%mm2	/* low B scaled down by 6+(8-5) */
	movq	%mm6,%mm5
	paddw	temp_mmx+40(%esp),%mm6	/* high 4 B */
	psraw	BRightShift(%esp),%mm6	/* high B scaled down by 6+(8-5) */
	movq	temp_mmx+8(%esp),%mm3	/* the 4 saved chroma G terms */
	packuswb %mm6,%mm2		/* mm2: B7 B6 B5 B4 B3 B2 B1 B0 */
	movq	%mm3,%mm4
	punpcklwd %mm3,%mm3		/* replicate low 2 */
	punpckhwd %mm4,%mm4		/* replicate high 2 */
	psubw	%mm3,%mm1		/* 4 low G */
	psraw	GRightShift(%esp),%mm1	/* low G scaled down by 6+(8-5) */
	psubw	%mm4,%mm5		/* 4 high G values in signed 16 bit */
	psraw	GRightShift(%esp),%mm5	/* high G scaled down by 6+(8-5) */
	paddusb	BUpperLimit(%esp),%mm2	/* mm2: saturate B+0FF-15 */
	packuswb %mm5,%mm1		/* mm1: G7 G6 G5 G4 G3 G2 G1 G0 */
	psubusb	BUpperLimit(%esp),%mm2
	paddusb	GUpperLimit(%esp),%mm1	/* G */
	psubusb	GUpperLimit(%esp),%mm1
	paddusb	RUpperLimit(%esp),%mm0	/* R */
	movl	tmpCCOPitch(%esp),%eax	/* offset from the even output row to the odd one */
	psubusb	RUpperLimit(%esp),%mm0

/*
 * here we are packing from RGB24 to RGB16
 *	mm1: G7 G6 G5 G4 G3 G2 G1 G0
 *	mm2: B7 B6 B5 B4 B3 B2 B1 B0
 *	mm0: R7 R6 R5 R4 R3 R2 R1 R0
 * output:
 *	mm2- result: 4 low RGB16
 *	mm7- result: 4 high RGB16
 * using:	mm4- zero register
 *		mm3- temporary results
 */
	psllq	RLeftShift(%esp),%mm0	/* position R in the most significant part of the byte */
	movq	%mm2,%mm7		/* mm7: Save B */
/*
 * note: no need for shift to place B on the least significant part of the byte
 * R in left position, B in the right position so they can be combined
 */
	punpcklbw %mm0,%mm2		/* mm2: 4 low 16 bit RB */
	pxor	%mm4,%mm4		/* mm4: 0 */
	movq	%mm1,%mm3		/* mm3: G */
	punpckhbw %mm0,%mm7		/* mm7: 4 high 16 bit RB */
	punpcklbw %mm4,%mm1		/* mm1: low 4 G 16 bit */
	punpckhbw %mm4,%mm3		/* mm3: high 4 G 16 bit */
	psllw	GLeftShift(%esp),%mm1	/* shift low G 5 positions */
	por	%mm1,%mm2		/* mm2: low RBG16 */
	psllw	GLeftShift(%esp),%mm3	/* shift high G 5 positions */
	por	%mm3,%mm7		/* mm7: high RBG16 */

#ifdef DOUBLE
/*
 * DOUBLE: every odd-row pixel is written twice (each register is
 * interleaved with itself), giving 32 output bytes per block.
 * NOTE(review): the even-row stores above still write 16 undoubled bytes at
 * (%edi) while %edi advances by 32 here -- confirm the DOUBLE path is
 * complete.
 */
	movq	%mm2,%mm1
	movq	%mm7,%mm5
	movq	%mm2,%mm0
	movq	%mm7,%mm3
	punpckhwd %mm2,%mm1
	punpckhwd %mm7,%mm5
	punpcklwd %mm2,%mm0
	punpcklwd %mm7,%mm3
	movq	%mm0,(%edi,%eax,)
	movq	%mm1,8(%edi,%eax,)
	movq	%mm3,16(%edi,%eax,)
	movq	%mm5,24(%edi,%eax,)
	addl	$32,%edi
	addl	$4,%ebx			/* must stay last: sets the flags for jl below */
#else
	movq	%mm2,(%edi,%eax,)
	movq	%mm7,8(%edi,%eax,)	/* aligned */
	addl	$16,%edi		/* ih take 16 bytes (8 pixels-16 bit) */
	addl	$4,%ebx			/* ? to take 4 pixels together instead of 2 */
					/* must stay last: sets the flags for jl below */
#endif
	jl	do_next_8x2_block	/* loop while the column counter is still negative */

/* Row pair finished: advance the output, luma and chroma cursors. */
	addl	CCOSkipDistance(%esp),%edi	/* go to begin of next line */
	addl	tmpCCOPitch(%esp),%edi		/* skip odd line (if it is needed) */
/*
 * Earlier Intel-syntax version of the aspect bookkeeping (disabled, kept
 * for reference):
 *	Leax	AspectCount
 *	Lebp	CCOPitch		; skip odd line
 *	sub	eax, 2
 *	jg	@f
 *	Addeax	AspectBaseCount
 *	xor	ebp, ebp
 * @@:
 *	Seax	AspectCount
 *	add	edi, ebp
 */
	movl	YPitch(%esp),%eax
	movl	tmpYCursorOdd(%esp),%ebp
	addl	%eax,%ebp		/* skip one line */
/* disabled:	lea ebp, [ebp+2*eax]	-- skip two lines */
	movl	%ebp,tmpYCursorEven(%esp)	/* new even row = old odd row + YPitch */
/* disabled:	Sebp tmpYCursorOdd */
	addl	%eax,%ebp		/* skip one line */
	movl	%ebp,tmpYCursorOdd(%esp)	/* new odd row = new even row + YPitch */
/*
 * disabled:
 *	Lebp	tmpYCursorEven
 *	lea	ebp, [ebp+2*eax]
 *	Sebp	tmpYCursorEven
 */
	addl	ChromaPitch(%esp),%esi
	addl	ChromaPitch(%esp),%edx
/*
 * disabled end-of-frame test:
 *	Leax	YLimit			-- Done with last line?
 *	cmp	ebp, eax
 *	jbe	PrepareChromaLine
 */
/*
 * Two rows done.  Loop while the remaining height stays above zero
 * (ja: no borrow and non-zero result).
 * NOTE(review): subw updates only the low 16 bits of the FrameHeight slot --
 * confirm the slot is set up accordingly.
 */
	subw	$2,FrameHeight(%esp)
	ja	PrepareChromaLine

/******************************************************************************/

/*
 * Epilogue: leave MMX state, release LocalFrameSize bytes of locals and
 * restore %ebx, %ebp, %edi, %esi.  The matching pushes are outside this
 * excerpt.
 */
finish:
	emms
	addl	$LocalFrameSize,%esp
	popl	%ebx
	popl	%ebp
	popl	%edi
	popl	%esi
	ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -