📄 convert_yv12.h
字号:
srcp[2] = srcV-src_pitch_uv;
typename _mm::__m add_ones=_mm::set1_pi8(1);
for (int y=0;y<height;y+=2,dst+=dst_pitch2,srcp[0]+=src_pitch2,srcp[1]+=src_pitch_uv,srcp[2]+=src_pitch_uv)
{
unsigned char *edi=dst;
const unsigned char *eax=srcp[0];
const unsigned char *ebx=srcp[1];
const unsigned char *ecx=srcp[2];
for (int x=0;x<src_rowsize;x+=_mm::size,edi+=_mm::size*2,eax+=_mm::size,ebx+=_mm::size/2,ecx+=_mm::size/2)
{
typename _mm::__m mm6=add_ones;
//int edx= src_pitch_uv;
typename _mm::__m mm0,mm7=_mm::setzero_si64(),mm2,mm3,mm4,mm1,mm5;
movq (mm0,eax); // mm0 = Y current line
movd (mm2,ebx+src_pitch_uv); // mm2 = U top field
movd (mm3, ecx+src_pitch_uv); // mm3 = V top field
movd (mm4,ebx); // U prev top field
movq (mm1,mm0); // mm1 = Y current line
movd (mm5,ecx); // V prev top field
_mm::pavgb (mm4,mm2); // interpolate chroma U (25/75)
_mm::pavgb (mm5,mm3 ); // interpolate chroma V (25/75)
psubusb (mm4, mm6); // Better rounding (thanks trbarry!)
psubusb (mm5, mm6 );
_mm::pavgb (mm4,mm2 ); // interpolate chroma U
_mm::pavgb (mm5,mm3 ); // interpolate chroma V
punpcklbw (mm0,mm7); // Y low
punpckhbw (mm1,mm7 ); // Y high*
punpcklbw (mm4,mm7); // U 00uu 00uu 00uu 00uu
punpcklbw (mm5,mm7 ); // V 00vv 00vv 00vv 00vv
pxor (mm6,mm6 );
punpcklbw (mm6,mm4 ); // U 0000 uu00 0000 uu00 (low)
punpckhbw (mm7,mm4); // V 0000 uu00 0000 uu00 (high
por (mm0,mm6 );
por (mm1,mm7);
movq (mm6,mm5);
punpcklbw (mm5,mm5); // V 0000 vvvv 0000 vvvv (low)
punpckhbw (mm6,mm6); // V 0000 vvvv 0000 vvvv (high)
pslld (mm5,24);
pslld (mm6,24);
por (mm0,mm5);
por (mm1,mm6);
//mov edx, src_pitch_uv2
movq (edi,mm0);
movq (edi+_mm::size,mm1);
//Next line
movq (mm6,add_ones);
movd (mm4,ebx+src_pitch_uv2); // U next top field
movd (mm5,ecx+src_pitch_uv2); // V prev top field
//mov edx, [src_pitch]
pxor (mm7,mm7);
movq (mm0,eax+src_pitch); // Next U-line
_mm::pavgb (mm4,mm2); // interpolate chroma U
movq (mm1,mm0); // mm1 = Y current line
_mm::pavgb (mm5,mm3); // interpolate chroma V
psubusb (mm4, mm6); // Better rounding (thanks trbarry!)
psubusb (mm5, mm6 );
_mm::pavgb (mm4,mm2 ); // interpolate chroma U
_mm::pavgb (mm5,mm3 ); // interpolate chroma V
punpcklbw (mm0,mm7); // Y low
punpckhbw (mm1,mm7 ); // Y high*
punpcklbw (mm4,mm7); // U 00uu 00uu 00uu 00uu
punpcklbw (mm5,mm7 ); // V 00vv 00vv 00vv 00vv
pxor (mm6,mm6 );
punpcklbw (mm6,mm4 ); // U 0000 uu00 0000 uu00 (low)
punpckhbw (mm7,mm4); // V 0000 uu00 0000 uu00 (high
por (mm0,mm6 );
por (mm1,mm7);
movq (mm6,mm5);
punpcklbw (mm5,mm5); // V 0000 vvvv 0000 vvvv (low)
punpckhbw (mm6,mm6); // V 0000 vvvv 0000 vvvv (high)
pslld (mm5,24);
//mov edx,[dst_pitch]
pslld (mm6,24);
por (mm0,mm5);
por (mm1,mm6);
movq (edi+dst_pitch,mm0);
movq (edi+dst_pitch+_mm::size,mm1);
}
}
_mm::sfence();
_mm::empty();
}
/**
 * Converts an interlaced YV12 picture (separate Y, U and V planes) into a
 * packed destination buffer, two output bytes per luma sample.
 *
 * Going by the shift amounts used below (U << 8, V << 24 inside each 32-bit
 * group, Y in the even bytes) the output byte order is Y0 U Y1 V, i.e. YUY2 --
 * assuming the movq/movd/punpck*/pslld/por helpers behave like the x86
 * instructions they are named after (they are declared outside this view).
 *
 * Processing is done in two stages:
 *  1. The first four and the last four lines are converted with the chroma
 *     row of their own field copied as-is (no vertical interpolation).
 *  2. All remaining lines are converted with chroma blended from two chroma
 *     rows of the same field (rows two apart in the chroma plane).
 *
 * @param srcY          first byte of the luma plane
 * @param srcU          first byte of the U plane
 * @param srcV          first byte of the V plane
 * @param src_rowsize   number of luma bytes to convert per line; the inner
 *                      loops advance in steps of _mm::size, so the last step
 *                      may run past src_rowsize if it is not a multiple of
 *                      _mm::size -- presumably callers pad rows, TODO confirm
 * @param src_pitch     distance in bytes between two luma lines
 * @param src_pitch_uv  distance in bytes between two chroma rows
 * @param dst           first byte of the packed output picture
 * @param dst_pitch     distance in bytes between two output lines
 * @param height        number of picture lines
 *
 * NOTE(review): there is no guard for height < 8; the border pass below
 * addresses line (height-4) unconditionally -- confirm callers guarantee it.
 */
static void yv12_i_to_yuy2(const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, int src_rowsize, stride_t src_pitch, stride_t src_pitch_uv,
BYTE* dst, stride_t dst_pitch,
int height)
{
// When this SIMD flavour requires alignment (_mm::align) and any pointer or
// pitch is not a multiple of 16, hand the whole job to the _mm::T64
// instantiation of the same routine instead.
if (_mm::align && (intptr_t(srcY)&15 || intptr_t(srcU)&15 || intptr_t(srcV)&15 || intptr_t(dst)&15 || src_pitch&15 || src_pitch_uv&15 || dst_pitch&15))
{
TconvertYV12<typename _mm::T64>::yv12_i_to_yuy2(srcY,srcU,srcV,src_rowsize,src_pitch,src_pitch_uv,dst,dst_pitch,height);
return;
}
// Pre-computed pitch multiples used for field-wise stepping
// (x2 = next line of the same field, x4 = next group of four lines).
stride_t src_pitch_uv2 = src_pitch_uv*2;
stride_t src_pitch_uv4 = src_pitch_uv*4;
// State flag of the two-pass main loop: 0 = first field of the current
// four-line group still to do, 1 = second field is being processed.
int skipnext = 0;
stride_t dst_pitch2=dst_pitch*2;
stride_t src_pitch2 = src_pitch*2;
stride_t dst_pitch4 = dst_pitch*4;
stride_t src_pitch4 = src_pitch*4;
/**** Do first and last lines - NO interpolation: *****/
// MMX loop relies on C-code to adjust the lines for it.
// Working copies of the plane pointers; the originals are still needed for
// the main loop further down.
const BYTE* _srcY=srcY;
const BYTE* _srcU=srcU;
const BYTE* _srcV=srcV;
BYTE* _dst=dst;
//
// Eight border lines, one per iteration. The switch moves the working
// pointers relative to the previous iteration, giving the visiting order
//   i=0..3 : lines 0, 2, 1, 3
//   i=4..7 : lines height-4, height-2, height-3, height-1
// Lines 0/2 share chroma row 0 and lines 1/3 share chroma row 1 (same for the
// bottom group starting at chroma row (height>>1)-2).
for (int i=0;i<8;i++)
{
switch (i)
{
case 1:
_srcY+=src_pitch2; // Same chroma as in 0
_dst+=dst_pitch2;
break;
case 2:
_srcY-=src_pitch; // Next field
_dst-=dst_pitch;
_srcU+=src_pitch_uv;
_srcV+=src_pitch_uv;
break;
case 3:
_srcY+=src_pitch2; // Same chroma as in 2
_dst+=dst_pitch2;
break;
case 4: // Now we process the bottom four lines of the picture.
_srcY=srcY+(src_pitch*(height-4));
_srcU=srcU+(src_pitch_uv*((height>>1)-2));
_srcV=srcV+(src_pitch_uv*((height>>1)-2));
_dst = dst+(dst_pitch*(height-4));
break;
case 5: // Same chroma as in 4
_srcY += src_pitch2;
_dst += dst_pitch2;
break;
case 6: // Next field
_srcY -= src_pitch;
_dst -= dst_pitch;
_srcU+=src_pitch_uv;
_srcV+=src_pitch_uv;
break;
case 7: // Same chroma as in 6
_srcY += src_pitch2;
_dst += dst_pitch2;
// no break here: falls through to default, which only breaks
default: // Nothing, case 0
break;
}
// Pointer names mirror the registers of the assembly this was ported from.
unsigned char *edi=_dst; // output position
const unsigned char *eax=_srcY; // luma
const unsigned char *ebx=_srcU; // chroma U
const unsigned char *ecx=_srcV; // chroma V
typename _mm::__m mm7=_mm::setzero_si64(); // all-zero register used for unpacking
// Per step: _mm::size luma bytes and _mm::size/2 bytes of each chroma plane
// are consumed, 2*_mm::size output bytes are produced.
for (int edx=0;edx<src_rowsize;edx+=_mm::size,eax+=_mm::size,ebx+=_mm::size/2,ecx+=_mm::size/2,edi+=_mm::size*2)
{
typename _mm::__m mm0,mm1,mm3,mm2,mm4,mm5;
movq (mm0,eax); //Y
movd (mm1,ebx); //U
movq (mm3,mm0);
movd (mm2,ecx); //V
punpcklbw (mm0,mm7); // Y low
punpckhbw (mm3,mm7); // Y high
punpcklbw (mm1,mm7 ); // 00uu 00uu
punpcklbw (mm2,mm7); // 00vv 00vv
movq (mm4,mm1 );
movq (mm5,mm2);
punpcklbw (mm1,mm7 ); // 0000 00uu low
punpcklbw (mm2,mm7); // 0000 00vv low
punpckhbw (mm4,mm7 ); // 0000 00uu high
punpckhbw (mm5,mm7); // 0000 00vv high
// Move U into byte 1 and V into byte 3 of every 32-bit group.
pslld (mm1,8);
pslld (mm4,8);
pslld (mm2,24);
pslld (mm5,24);
// Merge luma and chroma, then store both halves.
por (mm0, mm1);
por (mm3, mm4);
por (mm0, mm2);
por (mm3, mm5);
movq (edi,mm0);
movq (edi+_mm::size,mm3);
}
}
/****************************************
* Conversion main loop.
* The code properly interpolates UV from
* interlaced material.
* We process two lines in the same field
* in the same loop, to avoid reloading
* chroma each time.
*****************************************/
// Exclude the eight border lines handled above and start at picture line 4.
height-=8;
dst+=dst_pitch4;
srcY+=src_pitch4;
srcU+=src_pitch_uv2;
srcV+=src_pitch_uv2;
// srcp[0] is the current luma line. srcp[1]/srcp[2] are deliberately set two
// chroma rows back, so that relative to them
//   +0             = previous chroma row of this field
//   +src_pitch_uv2 = current chroma row of this field
//   +src_pitch_uv4 = next chroma row of this field
const BYTE *srcp[3];
srcp[0] = srcY;
srcp[1] = srcU-src_pitch_uv2;
srcp[2] = srcV-src_pitch_uv2;
typename _mm::__m add_ones=_mm::set1_pi8(1); // every byte = 1, used for the rounding correction
// y is only advanced (by 4) in the skipnext branch at the bottom, i.e. once
// both fields of a four-line group have been written.
for (int y=0;y<height;)
{
unsigned char *edi=dst;
const unsigned char *eax=srcp[0];
const unsigned char *ebx=srcp[1];
const unsigned char *ecx=srcp[2];
// Re-entry point for the second field: the else-branch below offsets
// edi/eax/ebx/ecx by one line and jumps back here.
yloop:
for (int x=0;x<src_rowsize;x+=_mm::size,edi+=_mm::size*2,eax+=_mm::size,ebx+=_mm::size/2,ecx+=_mm::size/2)
{
//mov edx, src_pitch_uv2
typename _mm::__m mm6=add_ones,mm0,mm7,mm2,mm3,mm4,mm1,mm5;
movq (mm0,eax); // mm0 = Y current line
pxor (mm7,mm7); // mm7 = 0
movd (mm2,ebx+src_pitch_uv2); // mm2 = U, current chroma row of this field
movd (mm3, ecx+src_pitch_uv2); // mm3 = V, current chroma row of this field
movd (mm4,ebx); // U, previous chroma row of this field
movq (mm1,mm0); // mm1 = Y current line
movd (mm5,ecx); // V, previous chroma row of this field
// Blend: average(prev, cur), subtract 1, average with cur again. Assuming
// _mm::pavgb is the rounded byte average (PAVGB), this weights the current
// row roughly 75% and the previous row roughly 25%.
_mm::pavgb (mm4,mm2); // interpolate chroma U
_mm::pavgb (mm5,mm3); // interpolate chroma V
psubusb (mm4, mm6); // Better rounding (thanks trbarry!)
psubusb (mm5, mm6);
_mm::pavgb (mm4,mm2); // interpolate chroma U
_mm::pavgb (mm5,mm3); // interpolate chroma V
punpcklbw (mm0,mm7); // Y low
punpckhbw (mm1,mm7); // Y high*
punpcklbw (mm4,mm7); // U 00uu 00uu 00uu 00uu
punpcklbw (mm5,mm7); // V 00vv 00vv 00vv 00vv
pxor (mm6,mm6); // mm6 = 0 (the rounding constant is reloaded later)
punpcklbw (mm6,mm4); // U 0000 uu00 0000 uu00 (low)
punpckhbw (mm7,mm4); // U 0000 uu00 0000 uu00 (high); mm7 is no longer zero after this
por (mm0,mm6);
por (mm1,mm7);
movq (mm6,mm5);
punpcklbw (mm5,mm5); // V 0000 vvvv 0000 vvvv (low)
punpckhbw (mm6,mm6); // V 0000 vvvv 0000 vvvv (high)
pslld (mm5,24); // keep V only in the top byte of each 32-bit group
pslld (mm6,24);
por (mm0,mm5);
por (mm1,mm6);
//mov edx, src_pitch_uv4
// Store the first converted line of this field.
movq (edi,mm0);
movq (edi+_mm::size,mm1);
//Next line in same field
// mm2/mm3 still hold the current chroma row; it is now blended with the
// next row of the same field instead of the previous one.
movq (mm6, add_ones);
movd (mm4,ebx+src_pitch_uv4); // U, next chroma row of this field
movd (mm5,ecx+src_pitch_uv4); // V, next chroma row of this field
//mov edx, [src_pitch2]
movq( mm0,eax+src_pitch2); // Next Y-line
_mm::pavgb (mm4,mm2); // interpolate chroma U
_mm::pavgb (mm5,mm3); // interpolate chroma V
psubusb (mm4, mm6); // Better rounding (thanks trbarry!)
psubusb (mm5, mm6);
_mm::pavgb (mm4,mm2); // interpolate chroma U
_mm::pavgb (mm5,mm3); // interpolate chroma V
pxor (mm7,mm7); // mm7 = 0 again
movq (mm1,mm0); // mm1 = Y current line
punpcklbw (mm0,mm7); // Y low
punpckhbw (mm1,mm7 ); // Y high*
punpcklbw (mm4,mm7); // U 00uu 00uu 00uu 00uu
punpcklbw (mm5,mm7 ); // V 00vv 00vv 00vv 00vv
pxor (mm6,mm6);
punpcklbw (mm6,mm4); // U 0000 uu00 0000 uu00 (low)
punpckhbw (mm7,mm4); // U 0000 uu00 0000 uu00 (high)
por (mm0,mm6 );
por( mm1,mm7);
movq (mm6,mm5);
punpcklbw (mm5,mm5); // V 0000 vvvv 0000 vvvv (low)
punpckhbw (mm6,mm6); // V 0000 vvvv 0000 vvvv (high)
pslld (mm5,24);
//mov edx,[dst_pitch2]
pslld (mm6,24);
por (mm0,mm5);
por (mm1,mm6);
// Store the second converted line of this field, two output lines below.
movq (edi+dst_pitch2,mm0);
movq (edi+dst_pitch2+_mm::size,mm1);
}
if (skipnext)
{
// Both fields of this four-line group are done: advance four luma/output
// lines and two chroma rows, then start the next group.
dst+=dst_pitch4;
srcp[0]+=src_pitch4;
srcp[1]+=src_pitch_uv2;
srcp[2]+=src_pitch_uv2;
skipnext=0;
y+=4;
}
else
{
// First field done: rewind to the start of the group, step one line down
// in every plane to reach the other field, and run the x loop again.
edi=dst;
eax=srcp[0];
ebx=srcp[1];
ecx=srcp[2];
edi+=dst_pitch;
eax+=src_pitch;
ebx+=src_pitch_uv;
ecx+=src_pitch_uv;
skipnext=1;
// NOTE(review): if fewer than 4 lines remain the jump is not taken while
// skipnext stays 1; the outer loop then resets the pointers to the first
// field and converts it a second time before advancing. Looks like this
// only matters when (height-8) is not a multiple of 4 -- confirm intent.
if(y+4<=height)
goto yloop;
}
}
// Finish the SIMD section via the _mm helpers (presumably a store fence and
// MMX state reset, going by their names -- definitions are outside this view).
_mm::sfence();
_mm::empty();
}
};
#pragma warning(pop)
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -