📄 yuv2rgb_mmx.c
字号:
mov edi, horiz_count
horiz_loop:
movd mm2, [ecx] ; mm2 = ________u3u2u1u0
movd mm3, [edx] ; mm3 = ________v3v2v1v0
movq mm0, [ebx] ; mm0 = y7y6y5y4y3y2y1y0
pxor mm7, mm7 ; zero mm7
punpcklbw mm2, mm7 ; mm2 = __u3__u2__u1__u0
punpcklbw mm3, mm7 ; mm3 = __v3__v2__v1__v0
psubw mm2, mmw_0x0080 ; mm2 -= 128
psubw mm3, mmw_0x0080 ; mm3 -= 128
psllw mm2, 3 ; mm2 *= 8
psllw mm3, 3 ; mm3 *= 8
movq mm4, mm2 ; mm4 = mm2 = u
movq mm5, mm3 ; mm5 = mm3 = v
pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff
pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff
pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma
pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma
paddsw mm2, mm3 ; mm2 = green chroma
psubusb mm0, mmb_0x10 ; mm0 -= 16
movq mm1, mmw_0x00ff ; mm1 = 00ff00ff00ff00ff
psrlw mm0, 8 ; mm0 = __y7__y5__y3__y1 luma odd
pand mm1, mm0 ; mm1 = __y6__y4__y2__y0 luma even
psllw mm0, 3 ; mm0 *= 8
psllw mm1, 3 ; mm1 *= 8
pmulhw mm0, mmw_mult_Y ; mm0 luma odd *= luma coeff
pmulhw mm1, mmw_mult_Y ; mm1 luma even *= luma coeff
movq mm3, mm4 ; copy blue chroma
movq mm6, mm5 ; copy red chroma
movq mm7, mm2 ; copy green chroma
paddsw mm3, mm0 ; mm3 = luma odd + blue chroma
paddsw mm4, mm1 ; mm4 = luma even + blue chroma
paddsw mm6, mm0 ; mm6 = luma odd + red chroma
paddsw mm5, mm1 ; mm5 = luma even + red chroma
paddsw mm7, mm0 ; mm7 = luma odd + green chroma
paddsw mm2, mm1 ; mm2 = luma even + green chroma
packuswb mm3, mm3
packuswb mm4, mm4
packuswb mm6, mm6
packuswb mm5, mm5
packuswb mm7, mm7
packuswb mm2, mm2
punpcklbw mm4, mm3 ; mm4 = b7b6b5b4b3b2b1b0 blue
punpcklbw mm5, mm6 ; mm5 = r7r6r5r4r3r2r1r0 red
punpcklbw mm2, mm7 ; mm2 = g7g6g5g4g3g2g1g0 green
pand mm4, mask_5
pand mm5, mask_5
pand mm2, mask_5
psrlw mm4, 3 ; mm4 = blue shifted
pand mm4, mask_blue ; mask the blue again
pxor mm7, mm7 ; zero mm7
movq mm1, mm4 ; mm1 = copy blue
movq mm3, mm5 ; mm3 = copy red
movq mm6, mm2 ; mm6 = copy green
punpckhbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm6, mm7
psllw mm6, 2 ; shift green
psllw mm3, 7 ; shift red
por mm6, mm3
por mm6, mm1
movq 8[eax], mm6
punpcklbw mm2, mm7 ; mm2 = __g3__g2__g1__g0 already masked
punpcklbw mm5, mm7
punpcklbw mm4, mm7
psllw mm2, 2 ; shift green
psllw mm5, 7 ; shift red
por mm2, mm5
por mm2, mm4
movq [eax], mm2
add ebx, 8 ; puc_y += 8;
add ecx, 4 ; puc_u += 4;
add edx, 4 ; puc_v += 4;
add eax, 16 ; puc_out += 16 // wrote 16 bytes
inc edi
jne horiz_loop
pop edi
pop edx
pop ecx
pop ebx
pop eax
emms
}
puc_y += stride_y;
if (y%2) {
puc_u += stride_uv;
puc_v += stride_uv;
}
puc_out += stride_out;
}
}
/***/
#define _S(a) (a)>255 ? 255 : (a)<0 ? 0 : (a)
#define _R(y,u,v) (0x2568*(y) + 0x3343*(u)) /0x2000
#define _G(y,u,v) (0x2568*(y) - 0x0c92*(v) - 0x1a1e*(u)) /0x2000
#define _B(y,u,v) (0x2568*(y) + 0x40cf*(v)) /0x2000
#define _mR 0x7c00
#define _mG 0x03e0
#define _mB 0x001f
#define _Ps565(r,g,b) ( ((r & 0xF8) >> 3) | (((g & 0xF8) << 3)) | (((b & 0xF8) << 8)) )
void yuv2rgb_565(uint8_t *puc_y, int stride_y,
uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
uint8_t *puc_out, int width_y, int height_y,
unsigned int _stride_out)
{
int y, horiz_count;
unsigned short * pus_out;
int stride_out = width_y * 2;
if (height_y < 0) {
height_y = -height_y;
puc_y += (height_y - 1) * stride_y ;
puc_u += (height_y/2 - 1) * stride_uv;
puc_v += (height_y/2 - 1) * stride_uv;
stride_y = -stride_y;
stride_uv = -stride_uv;
}
pus_out = (unsigned short *) puc_out;
horiz_count = -(width_y >> 3);
for (y=0; y<height_y; y++)
{
_asm {
push eax
push ebx
push ecx
push edx
push edi
mov eax, puc_out
mov ebx, puc_y
mov ecx, puc_u
mov edx, puc_v
mov edi, horiz_count
horiz_loop:
movd mm2, [ecx] ; mm2 = ________u3u2u1u0
movd mm3, [edx] ; mm3 = ________v3v2v1v0
movq mm0, [ebx] ; mm0 = y7y6y5y4y3y2y1y0
pxor mm7, mm7 ; zero mm7
punpcklbw mm2, mm7 ; mm2 = __u3__u2__u1__u0
punpcklbw mm3, mm7 ; mm3 = __v3__v2__v1__v0
psubw mm2, mmw_0x0080 ; mm2 -= 128
psubw mm3, mmw_0x0080 ; mm3 -= 128
psllw mm2, 3 ; mm2 *= 8
psllw mm3, 3 ; mm3 *= 8
movq mm4, mm2 ; mm4 = mm2 = u
movq mm5, mm3 ; mm5 = mm3 = v
pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff
pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff
pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma
pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma
paddsw mm2, mm3 ; mm2 = green chroma
psubusb mm0, mmb_0x10 ; mm0 -= 16
movq mm1, mmw_0x00ff ; mm1 = 00ff00ff00ff00ff
psrlw mm0, 8 ; mm0 = __y7__y5__y3__y1 luma odd
pand mm1, mm0 ; mm1 = __y6__y4__y2__y0 luma even
psllw mm0, 3 ; mm0 *= 8
psllw mm1, 3 ; mm1 *= 8
pmulhw mm0, mmw_mult_Y ; mm0 luma odd *= luma coeff
pmulhw mm1, mmw_mult_Y ; mm1 luma even *= luma coeff
movq mm3, mm4 ; copy blue chroma
movq mm6, mm5 ; copy red chroma
movq mm7, mm2 ; copy green chroma
paddsw mm3, mm0 ; mm3 = luma odd + blue chroma
paddsw mm4, mm1 ; mm4 = luma even + blue chroma
paddsw mm6, mm0 ; mm6 = luma odd + red chroma
paddsw mm5, mm1 ; mm5 = luma even + red chroma
paddsw mm7, mm0 ; mm7 = luma odd + green chroma
paddsw mm2, mm1 ; mm2 = luma even + green chroma
packuswb mm3, mm3
packuswb mm4, mm4
packuswb mm6, mm6
packuswb mm5, mm5
packuswb mm7, mm7
packuswb mm2, mm2
punpcklbw mm4, mm3 ; mm4 = b7b6b5b4b3b2b1b0 blue
punpcklbw mm5, mm6 ; mm5 = r7r6r5r4r3r2r1r0 red
punpcklbw mm2, mm7 ; mm2 = g7g6g5g4g3g2g1g0 green
pand mm4, mask_5
pand mm5, mask_5
pand mm2, mask_5
psrlw mm4, 3 ; mm4 = red shifted
pand mm4, mask_blue ; mask the blue again
pxor mm7, mm7 ; zero mm7
movq mm1, mm5 ; mm1 = copy blue
movq mm3, mm4 ; mm3 = copy red
movq mm6, mm2 ; mm6 = copy green
punpckhbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm6, mm7
psllw mm6, 3 ; shift green
psllw mm1, 8 ; shift blue
por mm6, mm3
por mm6, mm1
movq 8[eax], mm6
punpcklbw mm2, mm7 ; mm2 = __g3__g2__g1__g0 already masked
punpcklbw mm4, mm7
punpcklbw mm5, mm7
psllw mm2, 3 ; shift green
psllw mm5, 8 ; shift blue
por mm2, mm4
por mm2, mm5
movq [eax], mm2
add ebx, 8 ; puc_y += 8;
add ecx, 4 ; puc_u += 4;
add edx, 4 ; puc_v += 4;
add eax, 16 ; puc_out += 16 // wrote 16 bytes
inc edi
jne horiz_loop
pop edi
pop edx
pop ecx
pop ebx
pop eax
emms
}
/***/
puc_y += stride_y;
if (y%2) {
puc_u += stride_uv;
puc_v += stride_uv;
}
puc_out += stride_out;
}
}
/***/
void yuy2_out(uint8_t *puc_y, int stride_y,
uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
uint8_t *puc_out, int width_y, int height_y,
unsigned int stride_out)
{
int y;
uint8_t* puc_out2;
unsigned int stride_diff = 4 * stride_out - 2 * width_y;
if (height_y < 0) {
height_y = -height_y;
puc_y += (height_y - 1) * stride_y ;
puc_u += (height_y/2 - 1) * stride_uv;
puc_v += (height_y/2 - 1) * stride_uv;
stride_y = -stride_y;
stride_uv = -stride_uv;
}
puc_out2 = puc_out + 2 * stride_out;
for (y=height_y/2; y; y--) {
register uint8_t *py, *py2, *pu, *pv;
register int x;
uint32_t tmp;
py = puc_y;
py2 = puc_y + stride_y;
pu = puc_u;
pv = puc_v;
for (x=width_y/2; x; x--) {
tmp = *(py++);
tmp |= *(pu++) << 8;
tmp |= *(py++) << 16;
tmp |= *(pv++) << 24;
*(uint32_t*)puc_out=tmp;
puc_out += 4;
tmp &= 0xFF00FF00;
tmp |= *(py2++);
tmp |= *(py2++) << 16;
*(uint32_t*)puc_out2=tmp;
puc_out2 += 4;
}
puc_y += 2*stride_y;
puc_u += stride_uv;
puc_v += stride_uv;
puc_out += stride_diff;
puc_out2 += stride_diff;
}
}
void uyvy_out(uint8_t *puc_y, int stride_y,
uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
uint8_t *puc_out, int width_y, int height_y,
unsigned int stride_out)
{
int y;
uint8_t* puc_out2;
unsigned int stride_diff = 4 * stride_out - 2 * width_y;
if (height_y < 0) {
height_y = -height_y;
puc_y += (height_y - 1) * stride_y ;
puc_u += (height_y/2 - 1) * stride_uv;
puc_v += (height_y/2 - 1) * stride_uv;
stride_y = -stride_y;
stride_uv = -stride_uv;
}
puc_out2 = puc_out + 2 * stride_out;
for (y=height_y/2; y; y--) {
register uint8_t *py, *py2, *pu, *pv;
register int x;
uint32_t tmp;
py = puc_y;
py2 = puc_y + stride_y;
pu = puc_u;
pv = puc_v;
for (x=width_y/2; x; x--) {
tmp = *(pu++);
tmp |= *(py++) << 8;
tmp |= *(pv++) << 16;
tmp |= *(py++) << 24;
*(uint32_t*)puc_out=tmp;
puc_out += 4;
tmp &= 0x00FF00FF;
tmp |= *(py2++) << 8;
tmp |= *(py2++) << 24;
*(uint32_t*)puc_out2=tmp;
puc_out2 += 4;
}
puc_y += 2*stride_y;
puc_u += stride_uv;
puc_v += stride_uv;
puc_out += stride_diff;
puc_out2 += stride_diff;
}
}
void yuv12_out(uint8_t *puc_y, int stride_y,
uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
uint8_t *puc_out, int width_y, int height_y,
unsigned int stride_out)
{
int i;
unsigned char * pauc_out[3];
if (height_y < 0) {
height_y = -height_y;
puc_y += (height_y - 1) * stride_y ;
puc_u += (height_y/2 - 1) * stride_uv;
puc_v += (height_y/2 - 1) * stride_uv;
stride_y = -stride_y;
stride_uv = -stride_uv;
}
pauc_out[0] = puc_out;
pauc_out[1] = puc_out + stride_out * height_y;
pauc_out[2] = puc_out + stride_out * height_y * 5 / 4;
for (i=0; i<height_y; i++) {
memcpy(pauc_out[0], puc_y, width_y);
pauc_out[0] += stride_out;
puc_y += stride_y;
}
for (i=0; i<height_y/2; i++) {
memcpy(pauc_out[2], puc_u, width_y/2);
memcpy(pauc_out[1], puc_v, width_y/2);
pauc_out[1] += stride_out/2;
pauc_out[2] += stride_out/2;
puc_u += stride_uv;
puc_v += stride_uv;
}
}
/***/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -