📄 dsputil_mmx.c
字号:
: "+g"(h), "+r" (pixels), "+r" (block) : "r"(line_size) : "%eax", "memory" );}static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){ __asm __volatile( "lea (%3, %3), %%eax \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "addl %%eax, %1 \n\t" "addl %%eax, %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "addl %%eax, %1 \n\t" "addl %%eax, %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"(line_size) : "%eax", "memory" );}static void clear_blocks_mmx(DCTELEM *blocks){ __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "movl $-128*6, %%eax \n\t" "1: \n\t" "movq %%mm7, (%0, %%eax) \n\t" "movq %%mm7, 8(%0, %%eax) \n\t" "movq %%mm7, 16(%0, %%eax) \n\t" "movq %%mm7, 24(%0, %%eax) \n\t" "addl $32, %%eax \n\t" " js 1b \n\t" : : "r" (((int)blocks)+128*6) : "%eax" );}#ifdef CONFIG_ENCODERSstatic int pix_sum16_mmx(uint8_t * pix, int line_size){ const int h=16; int sum; int index= -line_size*h; __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq (%2, %1), %%mm0 \n\t" "movq (%2, %1), %%mm1 \n\t" "movq 8(%2, %1), %%mm2 \n\t" "movq 8(%2, %1), %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" "paddw %%mm0, %%mm1 \n\t" "paddw %%mm2, %%mm3 \n\t" "paddw %%mm1, %%mm3 \n\t" "paddw %%mm3, %%mm6 \n\t" "addl %3, %1 \n\t" " js 1b \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $32, %%mm6 \n\t" "paddw %%mm5, %%mm6 \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $16, %%mm6 \n\t" "paddw %%mm5, %%mm6 \n\t" "movd %%mm6, %0 \n\t" "andl $0xFFFF, %0 \n\t" : "=&r" (sum), "+r" (index) : "r" (pix - index), "r" (line_size) ); return sum;}#endif //CONFIG_ENCODERSstatic void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ int i=0; asm volatile( "1: \n\t" "movq (%1, %0), %%mm0 \n\t" "movq (%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, (%2, %0) \n\t" "movq 8(%1, %0), %%mm0 \n\t" "movq 8(%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%2, %0) \n\t" "addl $16, %0 \n\t" "cmpl %3, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src), "r"(dst), "r"(w-15) ); for(; i<w; i++) dst[i+0] += src[i+0];}#ifdef CONFIG_ENCODERSstatic int pix_norm1_mmx(uint8_t *pix, int line_size) { int tmp; asm volatile ( "movl $16,%%ecx\n" "pxor %%mm0,%%mm0\n" "pxor %%mm7,%%mm7\n" "1:\n" "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ "pmaddwd %%mm3,%%mm3\n" "pmaddwd %%mm4,%%mm4\n" "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, pix2^2+pix3^2+pix6^2+pix7^2) */ "paddd %%mm3,%%mm4\n" "paddd %%mm2,%%mm7\n" "addl %2, %0\n" "paddd %%mm4,%%mm7\n" "dec %%ecx\n" "jnz 1b\n" "movq %%mm7,%%mm1\n" "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%1\n" : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" ); return tmp;}static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) { int tmp; asm volatile ( "movl $16,%%ecx\n" "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ "1:\n" "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ /* todo: mm1-mm2, mm3-mm4 */ /* algo: substract mm1 from mm2 with saturation and vice versa */ /* OR the results to get absolute difference */ "movq %%mm1,%%mm5\n" "movq %%mm3,%%mm6\n" "psubusb %%mm2,%%mm1\n" "psubusb %%mm4,%%mm3\n" "psubusb %%mm5,%%mm2\n" "psubusb %%mm6,%%mm4\n" "por %%mm1,%%mm2\n" "por %%mm3,%%mm4\n" /* now convert to 16-bit vectors so we can square them */ "movq %%mm2,%%mm1\n" "movq %%mm4,%%mm3\n" "punpckhbw %%mm0,%%mm2\n" "punpckhbw %%mm0,%%mm4\n" "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ "pmaddwd %%mm2,%%mm2\n" "pmaddwd %%mm4,%%mm4\n" "pmaddwd %%mm1,%%mm1\n" "pmaddwd %%mm3,%%mm3\n" "addl %3,%0\n" "addl %3,%1\n" "paddd %%mm2,%%mm1\n" "paddd %%mm4,%%mm3\n" "paddd %%mm1,%%mm7\n" "paddd %%mm3,%%mm7\n" "decl %%ecx\n" "jnz 1b\n" "movq %%mm7,%%mm1\n" "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%2\n" : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "%ecx"); return tmp;}static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ int i=0; asm volatile( "1: \n\t" "movq (%2, %0), %%mm0 \n\t" "movq (%1, %0), %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t" "movq %%mm1, (%3, %0) \n\t" "movq 8(%2, %0), %%mm0 \n\t" "movq 8(%1, %0), %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%3, %0) \n\t" "addl $16, %0 \n\t" "cmpl %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src1), "r"(src2), "r"(dst), "r"(w-15) ); for(; i<w; i++) dst[i+0] = src1[i+0]-src2[i+0];}static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ int i=0; uint8_t l, lt; asm volatile( "1: \n\t" "movq -1(%1, %0), %%mm0 \n\t" // LT "movq (%1, %0), %%mm1 \n\t" // T "movq -1(%2, %0), %%mm2 \n\t" // L "movq (%2, %0), %%mm3 \n\t" // X "movq %%mm2, %%mm4 \n\t" // L "psubb %%mm0, %%mm2 \n\t" "paddb %%mm1, %%mm2 \n\t" // L + T - LT "movq %%mm4, %%mm5 \n\t" // L "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) "pminub %%mm5, %%mm1 \n\t" // min(T, L) "pminub %%mm2, %%mm4 \n\t" "pmaxub %%mm1, %%mm4 \n\t" "psubb %%mm4, %%mm3 \n\t" // dst - pred "movq %%mm3, (%3, %0) \n\t" "addl $8, %0 \n\t" "cmpl %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src1), "r"(src2), "r"(dst), "r"(w) ); l= *left; lt= *left_top; dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); *left_top= src1[w-1]; *left = src2[w-1];}#define LBUTTERFLY2(a1,b1,a2,b2)\ "paddw " #b1 ", " #a1 " \n\t"\ "paddw " #b2 ", " #a2 " \n\t"\ "paddw " #b1 ", " #b1 " \n\t"\ "paddw " #b2 ", " #b2 " \n\t"\ "psubw " #a1 ", " #b1 " \n\t"\ "psubw " #a2 ", " #b2 " \n\t"#define HADAMARD48\ LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\ LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\ LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\ LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\ LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\ LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\#define MMABS(a,z)\ "pxor " #z ", " #z " \n\t"\ "pcmpgtw " #a ", " #z " \n\t"\ "pxor " #z ", " #a " \n\t"\ "psubw " #z ", " #a " \n\t"#define MMABS_SUM(a,z, sum)\ "pxor " #z ", " #z " \n\t"\ "pcmpgtw " #a ", " #z " \n\t"\ "pxor " #z ", " #a " \n\t"\ "psubw " #z ", " #a " \n\t"\ "paddusw " #a ", " #sum " \n\t"#define MMABS_MMX2(a,z)\ "pxor " #z ", " #z " \n\t"\ "psubw " #a ", " #z " \n\t"\ "pmaxsw " #z ", " #a " \n\t"#define MMABS_SUM_MMX2(a,z, sum)\ "pxor " #z ", " #z " \n\t"\ "psubw " #a ", " #z " \n\t"\ "pmaxsw " #z ", " #a " \n\t"\ "paddusw " #a ", " #sum " \n\t" #define SBUTTERFLY(a,b,t,n)\ "movq " #a ", " #t " \n\t" /* abcd */\ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\#define TRANSPOSE4(a,b,c,d,t)\ SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */#define LOAD4(o, a, b, c, d)\ "movq "#o"(%1), " #a " \n\t"\ "movq "#o"+16(%1), " #b " \n\t"\ "movq "#o"+32(%1), " #c " \n\t"\ "movq "#o"+48(%1), " #d " \n\t"#define STORE4(o, a, b, c, d)\ "movq "#a", "#o"(%1) \n\t"\ "movq "#b", "#o"+16(%1) \n\t"\ "movq "#c", "#o"+32(%1) \n\t"\ "movq "#d", "#o"+48(%1) \n\t"\static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){ uint64_t temp[16] __align8; int sum=0; diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -