📄 swdec_idct.c
字号:
tmp = (data[0]+4)>>3;
tmp |= tmp<<8;
tmp |= tmp<<16;
tmp |= tmp<<24;
for (j = 4; j; j--)
{
pOut32[0] = tmp;
pOut32[1] = tmp;
pOut32 += width>>2;
pOut32[0] = tmp;
pOut32[1] = tmp;
pOut32 += width>>2;
}
} else if (pDecContainer->StrmStorage.numIdctRows == 1) {
SwDec_IdctIntraBlock(data, 1, pOut, width); /* 1-row IDCT */
} else if (pDecContainer->StrmStorage.numIdctRows <= 3) {
SwDec_IdctIntraBlock(data, 3, pOut, width); /* 3-row IDCT */
} else if (pDecContainer->StrmStorage.numIdctRows <= 5) {
SwDec_IdctIntraBlock(data, 5, pOut, width); /* 5-row IDCT */
} else {
SwDec_IdctIntraBlock(data, 8, pOut, width); /* full 8-row IDCT */
}
return;
}
/*------------------------------------------------------------------------------
5.4 Function name: SwDec_IdctIntraBlock
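Functional description:
Function computes the 2-D IDCT using Loeffler's fast algorithm.
Improved version with single-row and zero-row shortcuts.
For an Intra block the transformed coefficients are written
directly to the output picture.
Inputs:
i32 *block Block data before IDCT, length 64 (overwritten in place)
u32 rows Number of leading rows of the block to transform horizontally
u8 *pOut Pointer to the top-left pixel of the 8x8 block in the output picture
u32 width Width of the picture in pels (used as the output row stride)
Outputs:
u8 *pOut 8x8 block of pixel values written at pOut[row*width + col]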
------------------------------------------------------------------------------*/
void SwDec_IdctIntraBlock(i32 *block, u32 rows, u8 *pOut, u32 width)
{
/* Two-pass 8x8 inverse DCT for an intra block.
 *
 * Pass 1 (horizontal) transforms the first `rows` rows of `block` in place.
 * Pass 2 (vertical) transforms each of the 8 columns and stores the 8
 * results of column c at pOut[r*width + c], r = 0..7, as bytes.
 *
 * `block` is overwritten with the intermediate row results.
 * L0..L9 and L7B..L9B are multipliers defined outside this view of the file.
 * x8 and t are scratch values; x0..x7 are reused as soon as their previous
 * value is no longer needed, so the statement order below is significant.
 *
 * NOTE(review): several left shifts are applied to i32 values that may be
 * negative (e.g. block[i] << 6, x7 << 7). ISO C leaves that undefined and
 * makes right shifts of negative values implementation-defined, so this
 * code relies on the compiler emitting plain two's-complement arithmetic
 * shifts.
 */
/* Variables */
i32 i, t, x0, x1, x2, x3, x4, x5, x6, x7, x8;
#ifndef MP4DEC_ARM11
/* clp points 512 entries into clipIntra, presumably so that negative
 * indices are valid. The table is defined elsewhere; it presumably maps an
 * index v to v clamped to [0,255] -- TODO confirm against its definition. */
const u8* const clp = clipIntra+512;
#endif
/* Code */
/* Horizontal transform */
/* i is the index of the first coefficient of the current row; rows are
 * visited from row (rows-1) down to row 0, and nothing is done when
 * rows == 0. Rows at index >= `rows` are not touched here but are still
 * read by the vertical pass -- presumably the caller guarantees that they
 * are zero (TODO confirm). */
for (i = (i32)(rows<<3)-8; i >=0; i-=8)
{
/* If all coefficients x1..x7 == 0, the first coefficient x0
defines the idct. This 1-D shortcut minimizes the counting.
*/
/* Bitwise | (not ||) is used on purpose: every assignment x1..x7 is
 * executed, so the full-IDCT branch can use all of them without
 * reloading. */
if (!((x1 = block[i+1]) | (x2 = block[i+2]) | (x3 = block[i+3]) |
(x4 = block[i+4]) | (x5 = block[i+5]) | (x6 = block[i+6]) |
(x7 = block[i+7])))
{
/* Constant row: all eight outputs equal the first coefficient scaled by
 * 2^6. This matches the full path below when x1..x7 are zero, where
 * ((x0 << 18) + 2048) >> 12 also yields x0 * 64. */
block[i+1]=block[i+2]=block[i+3]=block[i+4]=
block[i+5]=block[i+6]=block[i+7]=
block[i]=block[i] << 6;
}
else /* Full idct for this row */
{
x0 = block[i];
/* Butterfly on x1/x7; x8 holds the sum until x7 has been replaced. */
x8 = x1 + x7;
x7 = x1 - x7;
x1 = x8;
x3 = L0 * x3;
x5 = L0 * x5;
/* Butterfly on x0/x4. */
x8 = x0 + x4;
x4 = x0 - x4;
x0 = x8;
t = L9 * (x2 + x6); /* Scale-rotate operation sqrt(2)*C6 */
x8 = L7 * x2 + t;
x2 = -L8 * x6 + t;
x6 = x8;
/* x7 and x1 are shifted left by 7 before being combined with the
 * L0-scaled x5 and x3. */
x8 = (x7 << 7) + x5;
x5 = (x7 << 7) - x5;
x7 = x8;
x8 = (x1 << 7) + x3;
x3 = (x1 << 7) - x3;
x1 = x8;
/* 2^11 added for proper rounding at the last stage */
/* 2048 is half of the 2^12 divisor applied by the final >> 12. */
x0 = (x0 << 18) + 2048;
x4 = (x4 << 18) + 2048;
x8 = x0 + x6;
x6 = x0 - x6;
x0 = x8;
x8 = x4 + x2;
x2 = x4 - x2;
x4 = x8;
t = L3 * (x3 + x5); /* Scale-rotate operation C1 */
x8 = -L1 * x3 + t;
x3 = -L2 * x5 + t;
x5 = x8;
t = L6 * (x7 + x1); /* Scale-rotate operation C3 */
x8 = -L4 * x7 + t;
x7 = -L5 * x1 + t;
x1 = x8;
/* Final butterflies; the row is written back over its own input. */
block[i+0] = (x0 + x1) >> 12;
block[i+1] = (x4 + x5) >> 12;
block[i+2] = (x2 + x3) >> 12;
block[i+3] = (x6 + x7) >> 12;
block[i+4] = (x6 - x7) >> 12;
block[i+5] = (x2 - x3) >> 12;
block[i+6] = (x4 - x5) >> 12;
block[i+7] = (x0 - x1) >> 12;
}
} /* End of horizontal transform */
/* Vertical transform */
/* i is the column index, visited from 7 down to 0. Element k of column i
 * is block[8*k + i]; output row k of the column goes to pOut[k*width + i]. */
for (i = 7; i >= 0; i--)
{
/* 2-stage zero-1D shortcut check separates three column idct cases:
7 zero coeffs, 5 zero coeffs and less than 5 zero coeffs.
The more zero coeffs are found the less counting is needed.
*/
/* As in the horizontal pass, bitwise | ensures x3..x7 are always loaded;
 * the full-column branch below depends on that. */
if (!((x3 = block[24+i]) | (x4 = block[32+i]) | (x5 = block[40+i]) |
(x6 = block[48+i]) | (x7 = block[56+i])))
{ /* at least 5 last coeffs are 0 */
if (!((x1 = block[8+i]) | (x2 = block[16+i])))
{ /* at least 7 last coeffs are 0 */
/* Constant column: round (add 2^8), drop 9 bits and clamp to [0,255]
 * by hand, then replicate the value down all 8 output rows. This is what
 * the general path's ((x0 + 256) << 13) >> 22 reduces to when every other
 * term is zero. */
x0 = (block[i] + 256) >> 9;
if (x0 > 255) x0 = 255;
if (x0 < 0) x0 = 0;
/*lint -e737*/
pOut[i]=pOut[width+i]=pOut[2*width+i]=pOut[3*width+i]=
pOut[4*width+i]=pOut[5*width+i]=pOut[6*width+i]=
pOut[7*width+i]=(u8)x0;
/*lint +e737*/
}
else /* 5 last coeffs (x7, x6, x5, x4 and x3) are 0 */
{
/* Specialisation of the full column path further below for
 * x3..x7 == 0: x1 - x7 becomes x1, x0 - x4 becomes x0, the L8B term
 * vanishes, and ((x << 7) +/- 0) >> 5 is written as x << 2.
 * x1 and x2 were loaded by the inner condition above. */
x0 = block[i];
x7 = x1;
x0 += 256; /* 2^8 added for proper rounding */
x4 = x0;
t = L9B * x2; /* Scale-rotate operation sqrt(2)*C6 */
x6 = L7B * x2 + t;
x2 = t;
x7 = x7 << 2;
x5 = x7;
x1 = x1 << 2;
x3 = x1;
x8 = (x0 << 13) + x6;
x6 = (x0 << 13) - x6;
x0 = x8;
x8 = (x4 << 13) + x2;
x2 = (x4 << 13) - x2;
x4 = x8;
t = L3 * (x3 + x5); /* Scale-rotate operation C1 */
x8 = -L1 * x3 + t;
x3 = -L2 * x5 + t;
x5 = x8;
t = L6 * (x7 + x1); /* Scale-rotate operation C3 */
x8 = -L4 * x7 + t;
x7 = -L5 * x1 + t;
x1 = x8;
#ifndef MP4DEC_ARM11
/* Portable path: drop 22 bits and convert to a byte through the clp
 * table, one output row per statement (rows 0..7 top to bottom). */
/*lint -e737*/
pOut[i] = (u8)clp[(x0 + x1) >> 22];
pOut[width+i] = (u8)clp[(x4 + x5) >> 22];
pOut[2*width+i] = (u8)clp[(x2 + x3) >> 22];
pOut[3*width+i] = (u8)clp[(x6 + x7) >> 22];
pOut[4*width+i] = (u8)clp[(x6 - x7) >> 22];
pOut[5*width+i] = (u8)clp[(x2 - x3) >> 22];
pOut[6*width+i] = (u8)clp[(x4 - x5) >> 22];
pOut[7*width+i] = (u8)clp[(x0 - x1) >> 22];
/*lint +e737*/
#else
/* Scale and write saturated coefficients
* to the output picture. Saturation is done for
* two pixels at a time by using ARM11 SIMD
* instructions. */
/* After these shifts x8, x0, x5, x3 hold output rows 0..3 and
 * x6, x2, x4, x1 hold output rows 4..7. Each difference is computed
 * before either of its operands is overwritten, so the order of these
 * eight statements must not change. */
x8 = (x0 + x1) >> 22;
x1 = (x0 - x1) >> 22;
x0 = (x4 + x5) >> 22;
x4 = (x4 - x5) >> 22;
x5 = (x2 + x3) >> 22;
x2 = (x2 - x3) >> 22;
x3 = (x6 + x7) >> 22;
x6 = (x6 - x7) >> 22;
/* The asm packs two rows per register (x7 and x8 are reused as the packed
 * registers, x0 as the running byte offset starting at i and advancing by
 * width) and stores rows 0..7 in order, matching the portable path. */
__asm
{
PKHBT x7, x8, x0, LSL #16; /* pack two coefficients */
PKHBT x8, x5, x3, LSL #16;
USAT16 x7, #8, x7; /* saturate two coefficients.. */
USAT16 x8, #8, x8; /* at a time to range [0,255] */
MOV x0, i; /* get address */
STRB x7, [pOut, x0]; /* store coefficient */
ADD x0, x0, width; /* update address */
MOV x7, x7, ASR #16; /* get upper 16 bits */
STRB x7, [pOut, x0]; /* store coefficient */
ADD x0, x0, width; /* and so on ... */
STRB x8, [pOut, x0];
ADD x0, x0, width;
MOV x8, x8, ASR #16;
STRB x8, [pOut, x0];
PKHBT x7, x6, x2, LSL #16;
PKHBT x8, x4, x1, LSL #16;
USAT16 x7, #8, x7;
USAT16 x8, #8, x8;
ADD x0, x0, width;
STRB x7, [pOut, x0];
ADD x0, x0, width;
MOV x7, x7, ASR #16;
STRB x7, [pOut, x0];
ADD x0, x0, width;
STRB x8, [pOut, x0];
ADD x0, x0, width;
MOV x8, x8, ASR #16;
STRB x8, [pOut, x0];
}
#endif
}
}
else /* Full idct for this column */
{
/* x3..x7 were loaded by the outer condition. x0, x1 and x2 are loaded
 * here because the inner test that assigns x1/x2 is not evaluated on
 * this path. */
x0 = block[i];
x1 = block[8+i];
x2 = block[16+i];
x8 = x1 + x7;
x7 = x1 - x7;
x1 = x8;
x3 = L0 * x3;
x5 = L0 * x5;
x0 += 256; /* 2^8 added for proper rounding at the last stage */
x8 = x0 + x4;
x4 = x0 - x4;
x0 = x8;
t = L9B * (x2 + x6); /* Scale-rotate operation sqrt(2)*C6 */
x8 = L7B * x2 + t;
x2 = -L8B * x6 + t;
x6 = x8;
/* Unlike the horizontal pass, the odd-part terms are shifted right by 5
 * here before the C1/C3 rotations. */
x8 = (x7 << 7) + x5;
x5 = ((x7 << 7) - x5) >> 5;
x7 = x8 >> 5;
x8 = (x1 << 7) + x3;
x3 = ((x1 << 7) - x3) >> 5;
x1 = x8 >> 5;
x8 = (x0 << 13) + x6;
x6 = (x0 << 13) - x6;
x0 = x8;
x8 = (x4 << 13) + x2;
x2 = (x4 << 13) - x2;
x4 = x8;
t = L3 * (x3 + x5); /* Scale-rotate operation C1 */
x8 = -L1 * x3 + t;
x3 = -L2 * x5 + t;
x5 = x8;
t = L6 * (x7 + x1); /* Scale-rotate operation C3 */
x8 = -L4 * x7 + t;
x7 = -L5 * x1 + t;
x1 = x8;
#ifndef MP4DEC_ARM11
/* Portable path: drop 22 bits and convert to a byte through the clp
 * table, one output row per statement (rows 0..7 top to bottom). */
/*lint -e737*/
pOut[i] = (u8)clp[(x0 + x1) >> 22];
pOut[width+i] = (u8)clp[(x4 + x5) >> 22];
pOut[2*width+i] = (u8)clp[(x2 + x3) >> 22];
pOut[3*width+i] = (u8)clp[(x6 + x7) >> 22];
pOut[4*width+i] = (u8)clp[(x6 - x7) >> 22];
pOut[5*width+i] = (u8)clp[(x2 - x3) >> 22];
pOut[6*width+i] = (u8)clp[(x4 - x5) >> 22];
pOut[7*width+i] = (u8)clp[(x0 - x1) >> 22];
/*lint +e737*/
#else
/* Scale and write saturated coefficients
* to the output picture. Saturation is done for
* two pixels at a time by using ARM11 SIMD
* instructions. */
/* After these shifts x8, x0, x5, x3 hold output rows 0..3 and
 * x6, x2, x4, x1 hold output rows 4..7. Each difference is computed
 * before either of its operands is overwritten, so the order of these
 * eight statements must not change. */
x8 = (x0 + x1) >> 22;
x1 = (x0 - x1) >> 22;
x0 = (x4 + x5) >> 22;
x4 = (x4 - x5) >> 22;
x5 = (x2 + x3) >> 22;
x2 = (x2 - x3) >> 22;
x3 = (x6 + x7) >> 22;
x6 = (x6 - x7) >> 22;
/* The asm packs two rows per register (x7 and x8 are reused as the packed
 * registers, x0 as the running byte offset starting at i and advancing by
 * width) and stores rows 0..7 in order, matching the portable path. */
__asm
{
PKHBT x7, x8, x0, LSL #16; /* pack two coefficients */
PKHBT x8, x5, x3, LSL #16;
USAT16 x7, #8, x7; /* saturate two coefficients.. */
USAT16 x8, #8, x8; /* at a time to range [0,255] */
MOV x0, i; /* get address */
STRB x7, [pOut, x0]; /* store coefficient */
ADD x0, x0, width; /* update address */
MOV x7, x7, ASR #16; /* get upper 16 bits */
STRB x7, [pOut, x0]; /* store coefficient */
ADD x0, x0, width; /* and so on ... */
STRB x8, [pOut, x0];
ADD x0, x0, width;
MOV x8, x8, ASR #16;
STRB x8, [pOut, x0];
PKHBT x7, x6, x2, LSL #16;
PKHBT x8, x4, x1, LSL #16;
USAT16 x7, #8, x7;
USAT16 x8, #8, x8;
ADD x0, x0, width;
STRB x7, [pOut, x0];
ADD x0, x0, width;
MOV x7, x7, ASR #16;
STRB x7, [pOut, x0];
ADD x0, x0, width;
STRB x8, [pOut, x0];
ADD x0, x0, width;
MOV x8, x8, ASR #16;
STRB x8, [pOut, x0];
}
#endif
}
}
return;
}
/*lint +e701 +e702 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -