📄 mpegvideo_altivec.c
字号:
q1 = vec_ctf(qmat[3], QMAT_SHIFT);
q2 = vec_ctf(qmat[5], QMAT_SHIFT);
q3 = vec_ctf(qmat[7], QMAT_SHIFT);
q4 = vec_ctf(qmat[9], QMAT_SHIFT);
q5 = vec_ctf(qmat[11], QMAT_SHIFT);
q6 = vec_ctf(qmat[13], QMAT_SHIFT);
q7 = vec_ctf(qmat[15], QMAT_SHIFT);
alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
vec_cmpgt(alt0, zero));
alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
vec_cmpgt(alt1, zero));
alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
vec_cmpgt(alt2, zero));
alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
vec_cmpgt(alt3, zero));
alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
vec_cmpgt(alt4, zero));
alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
vec_cmpgt(alt5, zero));
alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
vec_cmpgt(alt6, zero));
alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
vec_cmpgt(alt7, zero));
}
}
// Store the data back into the original block
{
vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));
{
// Clamp for overflow
vector signed int max_q_int, min_q_int;
vector signed short max_q, min_q;
LOAD4(max_q_int, &(s->max_qcoeff));
LOAD4(min_q_int, &(s->min_qcoeff));
max_q = vec_pack(max_q_int, max_q_int);
min_q = vec_pack(min_q_int, min_q_int);
data0 = vec_max(vec_min(data0, max_q), min_q);
data1 = vec_max(vec_min(data1, max_q), min_q);
data2 = vec_max(vec_min(data2, max_q), min_q);
data4 = vec_max(vec_min(data4, max_q), min_q);
data5 = vec_max(vec_min(data5, max_q), min_q);
data6 = vec_max(vec_min(data6, max_q), min_q);
data7 = vec_max(vec_min(data7, max_q), min_q);
}
{
vector bool char zero_01, zero_23, zero_45, zero_67;
vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67;
vector signed char negOne = vec_splat_s8(-1);
vector signed char* scanPtr =
(vector signed char*)(s->intra_scantable.inverse);
signed char lastNonZeroChar;
// Determine the largest non-zero index.
zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
vec_cmpeq(data1, (vector signed short)zero));
zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
vec_cmpeq(data3, (vector signed short)zero));
zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
vec_cmpeq(data5, (vector signed short)zero));
zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
vec_cmpeq(data7, (vector signed short)zero));
// 64 biggest values
scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23);
scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45);
scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67);
// 32 largest values
scanIndices_01 = vec_max(scanIndices_01, scanIndices_23);
scanIndices_45 = vec_max(scanIndices_45, scanIndices_67);
// 16 largest values
scanIndices_01 = vec_max(scanIndices_01, scanIndices_45);
// 8 largest values
scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
vec_mergel(scanIndices_01, negOne));
// 4 largest values
scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
vec_mergel(scanIndices_01, negOne));
// 2 largest values
scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
vec_mergel(scanIndices_01, negOne));
// largest value
scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
vec_mergel(scanIndices_01, negOne));
scanIndices_01 = vec_splat(scanIndices_01, 0);
vec_ste(scanIndices_01, 0, &lastNonZeroChar);
lastNonZero = lastNonZeroChar;
// While the data is still in vectors we check for the transpose IDCT permute
// and handle it using the vector unit if we can. This is the permute used
// by the altivec idct, so it is common when using the altivec dct.
if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
{
TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
}
vec_st(data0, 0, data);
vec_st(data1, 16, data);
vec_st(data2, 32, data);
vec_st(data3, 48, data);
vec_st(data4, 64, data);
vec_st(data5, 80, data);
vec_st(data6, 96, data);
vec_st(data7, 112, data);
}
}
// special handling of block[0]
if (s->mb_intra)
{
if (!s->h263_aic)
{
if (n < 4)
oldBaseValue /= s->y_dc_scale;
else
oldBaseValue /= s->c_dc_scale;
}
// Divide by 8, rounding the result
data[0] = (oldBaseValue + 4) >> 3;
}
// We handled the transpose permutation above and we don't
// need to permute the "no" permutation case.
if ((lastNonZero > 0) &&
(s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
(s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
{
ff_block_permute(data, s->dsp.idct_permutation,
s->intra_scantable.scantable, lastNonZero);
}
return lastNonZero;
}
#undef FOUROF
/*
 * AltiVec implementation of dct_unquantize_h263.
 *
 * With qmul = qscale << 1 and qadd = (qscale - 1) | 1, every non-zero
 * coefficient c in block[0..nCoeffs] is replaced by
 *     c * qmul + qadd   when c > 0
 *     c * qmul - qadd   when c < 0
 * and zero coefficients stay zero.
 *
 * Intra macroblocks (s->mb_intra) always process all 64 coefficients.
 * Their block[0] is handled separately: unless s->h263_aic is set it is
 * multiplied by s->y_dc_scale (n < 4) or s->c_dc_scale (otherwise); when
 * s->h263_aic is set, qadd is forced to 0 instead and block[0] is left
 * as it was.  Inter blocks process coefficients 0 up to
 * s->intra_scantable.raster_end[s->block_last_index[n]].
 *
 * `block' is accessed with vec_ld/vec_st, so this code assumes it is
 * 16-byte aligned.
 */
void dct_unquantize_h263_altivec(MpegEncContext *s,
                                 DCTELEM *block, int n, int qscale)
{
    POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;
    int last_coeff;   /* index of the last coefficient to process */
    int keep_dc;      /* non-zero when block[0] must survive the main loops */

    assert(s->block_last_index[n]>=0);

    POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);

    if (!s->mb_intra) {
        keep_dc = 0;
        last_coeff = s->intra_scantable.raster_end[ s->block_last_index[n] ];
    } else {
        if (s->h263_aic) {
            qadd = 0;
        } else if (n < 4) {
            block[0] = block[0] * s->y_dc_scale;
        } else {
            block[0] = block[0] * s->c_dc_scale;
        }
        keep_dc = 1;
        last_coeff = 63; /* does not always use zigzag table */
    }

    {
        const vector signed short zerov = (const vector signed short)vec_splat_s16(0);
        /* qmul, +qadd and -qadd replicated across all eight 16-bit lanes */
        DECLARE_ALIGNED_16(short, qmul8[]) =
            { qmul, qmul, qmul, qmul, qmul, qmul, qmul, qmul };
        DECLARE_ALIGNED_16(short, qadd8[]) =
            { qadd, qadd, qadd, qadd, qadd, qadd, qadd, qadd };
        DECLARE_ALIGNED_16(short, nqadd8[]) =
            { -qadd, -qadd, -qadd, -qadd, -qadd, -qadd, -qadd, -qadd };
        vector signed short qmulv  = vec_ld(0, qmul8);
        vector signed short qaddv  = vec_ld(0, qadd8);
        vector signed short nqaddv = vec_ld(0, nqadd8);
        /* Taken after the DC scaling above, so restoring it at the end
         * undoes only what the generic loops do to block[0]. */
        short dc_backup = block[0];
        int pos;

        /* No scalar alignment prologue: block is taken to be 16-byte
         * aligned already, so the vector loop starts at element 0. */

        /* Full groups of 8 coefficients go through the vector unit. */
        for (pos = 0; pos + 7 <= last_coeff; pos += 8) {
            const int byte_off = pos << 1;
            vector signed short coeffs, bias, scaled;
            vector bool short is_neg, is_zero;

            coeffs  = vec_ld(byte_off, block);
            is_neg  = vec_cmplt(coeffs, zerov);
            is_zero = vec_cmpeq(coeffs, zerov);
            /* per lane: -qadd for negative coefficients, +qadd otherwise */
            bias    = vec_sel(qaddv, nqaddv, is_neg);
            /* coeffs * qmul + bias */
            scaled  = vec_mladd(coeffs, qmulv, bias);
            /* lanes that were zero keep their original (zero) value */
            vec_st(vec_sel(scaled, coeffs, is_zero), byte_off, block);
        }

        /* Whatever does not fill a whole vector is finished in scalar code. */
        for (; pos <= last_coeff; pos++) {
            int coeff = block[pos];

            if (coeff == 0)
                continue;
            block[pos] = (coeff < 0) ? coeff * qmul - qadd
                                     : coeff * qmul + qadd;
        }

        /* The loops above also rewrote block[0]; for intra blocks put the
         * separately handled DC value back instead of special-casing the
         * first iteration. */
        if (keep_dc)
            block[0] = dc_backup;
    }

    POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, last_coeff == 63);
}
/* AltiVec IDCT entry points, defined in another translation unit. */
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/*
 * Install the AltiVec routines into an MpegEncContext.
 *
 * - When lowres is 0 and idct_algo is FF_IDCT_AUTO or FF_IDCT_ALTIVEC, the
 *   AltiVec IDCT put/add functions are installed and the IDCT permutation
 *   type is set to FF_TRANSPOSE_IDCT_PERM.
 * - The quantization matrices and the inverse scan table must be 16-byte
 *   aligned; if either check fails a message is logged and the function
 *   returns without installing any of the DCT hooks below (the IDCT
 *   setup above is kept).
 * - When dct_algo is FF_DCT_AUTO or FF_DCT_ALTIVEC, both H.263 unquantize
 *   hooks are pointed at dct_unquantize_h263_altivec.  The AltiVec
 *   dct_quantize hook is currently compiled out.
 */
void MPV_common_init_altivec(MpegEncContext *s)
{
    int qmat_misaligned;
    int scan_misaligned;

    if (s->avctx->lowres == 0 &&
        (s->avctx->idct_algo == FF_IDCT_AUTO ||
         s->avctx->idct_algo == FF_IDCT_ALTIVEC)) {
        s->dsp.idct_put = idct_put_altivec;
        s->dsp.idct_add = idct_add_altivec;
        s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    }

    /* Test to make sure that the dct required alignments are met. */
    qmat_misaligned = (((long)(s->q_intra_matrix) & 0x0f) != 0) ||
                      (((long)(s->q_inter_matrix) & 0x0f) != 0);
    if (qmat_misaligned) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }

    scan_misaligned = ((long)(s->intra_scantable.inverse) & 0x0f) != 0;
    if (scan_misaligned) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }

    /* Only take over the DCT hooks when the caller did not ask for a
     * different DCT implementation. */
    if (s->avctx->dct_algo != FF_DCT_AUTO &&
        s->avctx->dct_algo != FF_DCT_ALTIVEC)
        return;

#if 0 /* seems to cause trouble under some circumstances */
    s->dct_quantize = dct_quantize_altivec;
#endif
    s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
    s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -