📄 vp3dsp_mmx.c
字号:
pmullw_m2r(*(dequant_matrix + 12), r3); psrlq_i2r(16, r4); movq_r2m(r7, *(output_data + 8)); movq_r2r(r4, r5); movq_r2r(r0, r7); psrlq_i2r(16, r4); psrlq_i2r(48, r7); movq_r2r(r2, r6); pand_r2r(r2, r5); pand_r2r(r4, r6); movq_r2m(r7, *(output_data + 40)); pxor_r2r(r6, r4); psrlq_i2r(32, r1); por_r2r(r5, r4); movq_m2r(*M(3), r7); pand_r2r(r2, r1); movq_m2r(*(input_data + 24), r5); psllq_i2r(16, r0); pmullw_m2r(*(dequant_matrix + 24), r5); pand_r2r(r0, r7); movq_r2m(r1, *(output_data + 32)); por_r2r(r4, r7); movq_r2r(r3, r4); pand_r2r(r2, r3); movq_m2r(*M(2), r1); psllq_i2r(32, r3); por_r2r(r3, r7); movq_r2r(r5, r3); psllq_i2r(48, r3); pand_r2r(r0, r1); movq_r2m(r7, *(output_data + 16)); por_r2r(r3, r6); movq_m2r(*M(1), r7); por_r2r(r1, r6); movq_m2r(*(input_data + 28), r1); pand_r2r(r4, r7); pmullw_m2r(*(dequant_matrix + 28), r1); por_r2r(r6, r7); pand_m2r(*M(1), r0); psrlq_i2r(32, r4); movq_r2m(r7, *(output_data + 24)); movq_r2r(r4, r6); movq_m2r(*M(3), r7); pand_r2r(r2, r4); movq_m2r(*M(1), r3); pand_r2r(r1, r7); pand_r2r(r5, r3); por_r2r(r4, r0); psllq_i2r(16, r3); por_r2r(r0, r7); movq_m2r(*M(2), r4); por_r2r(r3, r7); movq_m2r(*(input_data + 40), r0); movq_r2r(r4, r3); pmullw_m2r(*(dequant_matrix + 40), r0); pand_r2r(r5, r4); movq_r2m(r7, *(output_data + 4)); por_r2r(r4, r6); movq_r2r(r3, r4); psrlq_i2r(16, r6); movq_r2r(r0, r7); pand_r2r(r1, r4); psllq_i2r(48, r7); por_r2r(r4, r6); movq_m2r(*(input_data + 44), r4); por_r2r(r6, r7); pmullw_m2r(*(dequant_matrix + 44), r4); psrlq_i2r(16, r3); movq_r2m(r7, *(output_data + 12)); pand_r2r(r1, r3); psrlq_i2r(48, r5); pand_r2r(r2, r1); movq_m2r(*(input_data + 52), r6); por_r2r(r3, r5); pmullw_m2r(*(input_data + 52), r6); psrlq_i2r(16, r0); movq_r2r(r4, r7); movq_r2r(r2, r3); psllq_i2r(48, r7); pand_r2r(r0, r3); pxor_r2r(r3, r0); psllq_i2r(32, r3); por_r2r(r5, r7); movq_r2r(r6, r5); pand_m2r(*M(1), r6); por_r2r(r3, r7); psllq_i2r(32, r6); por_r2r(r1, r0); movq_r2m(r7, *(output_data + 20)); por_r2r(r6, r0); movq_m2r(*(input_data + 60), r7); movq_r2r(r5, r6); pmullw_m2r(*(input_data + 60), r7); psrlq_i2r(32, r5); pand_r2r(r2, r6); movq_r2r(r5, r1); movq_r2m(r0, *(output_data + 28)); pand_r2r(r2, r1); movq_m2r(*(input_data + 56), r0); movq_r2r(r7, r3); pmullw_m2r(*(dequant_matrix + 56), r0); psllq_i2r(16, r3); pand_m2r(*M(3), r7); pxor_r2r(r1, r5); por_r2r(r5, r6); movq_r2r(r3, r5); pand_m2r(*M(3), r5); por_r2r(r1, r7); movq_m2r(*(input_data + 48), r1); pxor_r2r(r5, r3); pmullw_m2r(*(dequant_matrix + 48), r1); por_r2r(r3, r7); por_r2r(r5, r6); movq_r2r(r0, r5); movq_r2m(r7, *(output_data + 60)); psrlq_i2r(16, r5); pand_m2r(*M(2), r5); movq_r2r(r0, r7); por_r2r(r5, r6); pand_r2r(r2, r0); pxor_r2r(r0, r7); psllq_i2r(32, r0); movq_r2m(r6, *(output_data + 52)); psrlq_i2r(16, r4); movq_m2r(*(input_data + 36), r5); psllq_i2r(16, r7); pmullw_m2r(*(dequant_matrix + 36), r5); movq_r2r(r7, r6); movq_m2r(*M(2), r3); psllq_i2r(16, r6); pand_m2r(*M(3), r7); pand_r2r(r1, r3); por_r2r(r0, r7); movq_r2r(r1, r0); pand_m2r(*M(3), r1); por_r2r(r3, r6); movq_r2r(r4, r3); psrlq_i2r(32, r1); pand_r2r(r2, r3); por_r2r(r1, r7); por_r2r(r3, r7); movq_r2r(r4, r3); pand_m2r(*M(1), r3); movq_r2r(r5, r1); movq_r2m(r7, *(output_data + 44)); psrlq_i2r(48, r5); movq_m2r(*(input_data + 32), r7); por_r2r(r3, r6); pmullw_m2r(*(dequant_matrix + 32), r7); por_r2r(r5, r6); pand_m2r(*M(2), r4); psllq_i2r(32, r0); movq_r2m(r6, *(output_data + 36)); movq_r2r(r0, r6); pand_m2r(*M(3), r0); psllq_i2r(16, r6); movq_m2r(*(input_data + 20), r5); movq_r2r(r1, r3); pmullw_m2r(*(dequant_matrix + 40), r5); psrlq_i2r(16, r1); pand_m2r(*M(1), r1); por_r2r(r4, r0); pand_r2r(r7, r2); por_r2r(r1, r0); por_r2r(r2, r0); psllq_i2r(16, r3); movq_r2r(r3, r4); movq_r2r(r5, r2); movq_r2m(r0, *(output_data + 56)); psrlq_i2r(48, r2); pand_m2r(*M(2), r4); por_r2r(r2, r6); movq_m2r(*M(1), r2); por_r2r(r4, r6); pand_r2r(r7, r2); psllq_i2r(32, r3); por_m2r(*(output_data + 40), r3); por_r2r(r2, r6); movq_m2r(*M(3), r2); psllq_i2r(16, r5); movq_r2m(r6, *(output_data + 48)); pand_r2r(r5, r2); movq_m2r(*M(2), r6); pxor_r2r(r2, r5); pand_r2r(r7, r6); psrlq_i2r(32, r2); pand_m2r(*M(3), r7); por_r2r(r2, r3); por_m2r(*(output_data + 32), r7); por_r2r(r3, r6); por_r2r(r5, r7); movq_r2m(r6, *(output_data + 40)); movq_r2m(r7, *(output_data + 32));#undef M /* at this point, function has completed dequantization + dezigzag + * partial transposition; now do the idct itself */#define I(K) (output_data + K * 8)#define J(K) (output_data + ((K - 4) * 8) + 4) RowIDCT(); Transpose();#undef I#undef J#define I(K) (output_data + (K * 8) + 32)#define J(K) (output_data + ((K - 4) * 8) + 36) RowIDCT(); Transpose();#undef I#undef J#define I(K) (output_data + K * 8)#define J(K) (output_data + K * 8) ColumnIDCT();#undef I#undef J#define I(K) (output_data + (K * 8) + 4)#define J(K) (output_data + (K * 8) + 4) ColumnIDCT();#undef I#undef J}void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, int coeff_count, uint8_t *dest, int stride){ int16_t transformed_data[64]; int16_t *op; int i, j; uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; vp3_idct_mmx(input_data, dequant_matrix, transformed_data); /* place in final output */ op = transformed_data; movq_m2r(*vector128, mm0); for (i = 0; i < 8; i++) {#if 1 for (j = 0; j < 8; j++) { if (*op < -128) *dest = 0; else if (*op > 127) *dest = 255; else *dest = (uint8_t)(*op + 128); op++; dest++; } dest += (stride - 8);#else/* prototype optimization */ pxor_r2r(mm1, mm1); packsswb_m2r(*(op + 4), mm1); movq_r2r(mm1, mm2); psrlq_i2r(32, mm2); packsswb_m2r(*(op + 0), mm1); op += 8; por_r2r(mm2, mm1); paddb_r2r(mm0, mm1); movq_r2m(mm1, *dest); dest += stride;#endif } /* be a good MMX citizen */ emms();}void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, int coeff_count, uint8_t *dest, int stride){ int16_t transformed_data[64]; int16_t *op; int i, j; int16_t sample; vp3_idct_mmx(input_data, dequant_matrix, transformed_data); /* place in final output */ op = transformed_data; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { sample = *dest + *op; if (sample < 0) *dest = 0; else if (sample > 255) *dest = 255; else *dest = (uint8_t)(sample & 0xFF); op++; dest++; } dest += (stride - 8); } /* be a good MMX citizen */ emms();}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -