📄 rtjpeg.c
字号:
psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // shift tmp10 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 psllw_i2r(2, mm1); // shift (tmp10-tmp12) pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm5); // prepare for multiply pmulhw_r2r(mm0, mm4); // multiply by converted real /* stage 5 */ pmulhw_m2r(RTjpeg_C4, mm5); // z3 psllw_i2r(2, mm2); // prepare for multiply pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 paddw_r2r(mm1, mm2); // z4 movq_r2r(mm0, mm6); // copy z11 psubw_r2r(mm4, mm5); // y3 paddw_r2r(mm2, mm6); // y1 paddw_r2r(mm4, mm3); // y5 movq_r2m(mm5, *(dataptr+7)); //save y3 movq_r2m(mm6, *(dataptr+3)); //save y1 psubw_r2r(mm2, mm0); // y7 /************************************************************************************************ Start of Transpose************************************************************************************************/ movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 movq_r2r(mm7, mm5); // copy first line punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm6, mm2); // copy third line punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm7, mm1); // copy first intermediate result punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 movq_r2m(mm7, *(dataptr+9)); // write result 1 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines movq_r2m(mm1, *(dataptr+11)); // write result 2 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines movq_r2r(mm5, mm1); // copy first intermediate result punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 movq_r2m(mm5, *(dataptr+13)); // write result 3 /****** last 4x4 done */ movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line movq_r2r(mm0, mm6); // copy first line punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm2, mm7); // copy third line punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm0, mm4); // copy first intermediate result movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines movq_r2r(mm1, mm2); // copy first line punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines movq_r2r(mm6, mm5); // copy first intermediate result movq_r2m(mm0, *(dataptr+8)); // write result 1 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines movq_r2r(mm3, mm0); // copy third line punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines movq_r2m(mm4, *(dataptr+10)); // write result 2 out punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines movq_r2r(mm1, mm4); // copy second intermediate result movq_r2m(mm6, *(dataptr+12)); // write result 3 out punpckldq_r2r(mm3, mm1); // punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines movq_r2r(mm2, mm6); // copy second intermediate result movq_r2m(mm5, *(dataptr+14)); // write result 4 out punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result movq_r2m(mm4, *(dataptr+3)); // write result 6 out punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result movq_r2m(mm2, *(dataptr+5)); // write result 7 out movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 movq_r2m(mm6, *(dataptr+7)); // write result 8 out// Do first 4x4 quadrant, which is used in the beginning of the DCT: movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line movq_r2r(mm0, mm2); // copy first line punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm7, mm4); // copy third line punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm0, mm1); // copy first intermediate result movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 movq_r2r(mm0, mm7); // write result 1 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ movq_r2r(mm1, mm6); // write result 2 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ movq_r2r(mm2, mm3); // copy first intermediate result psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 movq_r2m(mm7, tmp7); // save tmp07 movq_r2r(mm2, mm5); // write result 3 movq_r2m(mm6, tmp6); // save tmp06 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */ movq_r2r(mm3, mm4); // write result 4/************************************************************************************************ End of Transpose 2************************************************************************************************/ paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ movq_r2r(mm0, mm7); psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ movq_r2r(mm1, mm6); paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ paddw_r2r(mm7, mm6); // tmp12 + tmp13 /* stage 3 */ movq_m2r(tmp6, mm2); movq_r2r(mm0, mm3); psllw_i2r(2, mm6); // m8 * 2^2 paddw_r2r(mm1, mm0); pmulhw_m2r(RTjpeg_C4, mm6); // z1 psubw_r2r(mm1, mm3); movq_r2m(mm0, *dataptr); movq_r2r(mm7, mm0); /* Odd part */ movq_r2m(mm3, *(dataptr+8)); paddw_r2r(mm5, mm4); // tmp10 movq_m2r(tmp7, mm3); paddw_r2r(mm6, mm0); // tmp32 paddw_r2r(mm2, mm5); // tmp11 psubw_r2r(mm6, mm7); // tmp33 movq_r2m(mm0, *(dataptr+4)); paddw_r2r(mm3, mm2); // tmp12 /* stage 4 */ movq_r2m(mm7, *(dataptr+12)); movq_r2r(mm4, mm1); // copy of tmp10 psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // m8 * 2^2 movq_m2r(RTjpeg_C2mC6, mm0); psllw_i2r(2, mm1); pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm2); pmulhw_r2r(mm0, mm4); // z5 /* stage 5 */ pmulhw_m2r(RTjpeg_C2pC6, mm2); psllw_i2r(2, mm5); pmulhw_m2r(RTjpeg_C4, mm5); // z3 movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+1), mm7); paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm1, mm2); // z4 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 psubw_r2r(mm4, mm3); // y3=z13 - z2 paddw_r2r(mm4, mm5); // y5=z13 + z2 movq_r2r(mm0, mm6); // copy z11 movq_r2m(mm3, *(dataptr+6)); //save y3 psubw_r2r(mm2, mm0); // y7=z11 - z4 movq_r2m(mm5, *(dataptr+10)); //save y5 paddw_r2r(mm2, mm6); // y1=z11 + z4 movq_r2m(mm0, *(dataptr+14)); //save y7 /************************************************ * End of 1st 4 rows ************************************************/ movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ movq_r2r(mm7, mm0); // copy x0 movq_r2m(mm6, *(dataptr+2)); //save y1 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ movq_r2r(mm1, mm6); // copy x1 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ movq_r2r(mm2, mm5); // copy x2 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 movq_r2r(mm3, mm4); // copy x3 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 movq_r2m(mm7, tmp7); // save tmp07 movq_r2r(mm0, mm7); // copy tmp00 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 /* stage 2, Even Part */ paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 movq_r2m(mm6, tmp6); // save tmp07 movq_r2r(mm1, mm6); // copy tmp01 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 paddw_r2r(mm7, mm6); // tmp12 + tmp13 /* stage 3, Even and stage 4 & 5 even */ movq_m2r(tmp6, mm2); // load tmp6 movq_r2r(mm0, mm3); // copy tmp10 psllw_i2r(2, mm6); // shift z1 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 pmulhw_m2r(RTjpeg_C4, mm6); // z1 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 movq_r2m(mm0, *(dataptr+1)); //save y0 movq_r2r(mm7, mm0); // copy tmp13 /* odd part */ movq_r2m(mm3, *(dataptr+9)); //save y4 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 movq_m2r(tmp7, mm3); // load tmp7 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 movq_r2m(mm0, *(dataptr+5)); //save y2 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 /* stage 4 */ movq_r2m(mm7, *(dataptr+13)); //save y6 movq_r2r(mm4, mm1); // copy tmp10 psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // shift tmp10 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 psllw_i2r(2, mm1); // shift (tmp10-tmp12) pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm5); // prepare for multiply pmulhw_r2r(mm0, mm4); // multiply by converted real /* stage 5 */ pmulhw_m2r(RTjpeg_C4, mm5); // z3 psllw_i2r(2, mm2); // prepare for multiply pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 paddw_r2r(mm1, mm2); // z4 movq_r2r(mm0, mm6); // copy z11 psubw_r2r(mm4, mm5); // y3 paddw_r2r(mm2, mm6); // y1 paddw_r2r(mm4, mm3); // y5 movq_r2m(mm5, *(dataptr+7)); //save y3 psubw_r2r(mm2, mm0); // y
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -