📄 dsputil_mmx.c
字号:
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
"1:\n"
"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
/* todo: mm1-mm2, mm3-mm4 */
/* algo: subtract mm1 from mm2 with saturation and vice versa */
/* OR the results to get absolute difference */
"movq %%mm1,%%mm5\n"
"movq %%mm3,%%mm6\n"
"psubusb %%mm2,%%mm1\n"
"psubusb %%mm4,%%mm3\n"
"psubusb %%mm5,%%mm2\n"
"psubusb %%mm6,%%mm4\n"
"por %%mm1,%%mm2\n"
"por %%mm3,%%mm4\n"
/* now convert to 16-bit vectors so we can square them */
"movq %%mm2,%%mm1\n"
"movq %%mm4,%%mm3\n"
"punpckhbw %%mm0,%%mm2\n"
"punpckhbw %%mm0,%%mm4\n"
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
"pmaddwd %%mm2,%%mm2\n"
"pmaddwd %%mm4,%%mm4\n"
"pmaddwd %%mm1,%%mm1\n"
"pmaddwd %%mm3,%%mm3\n"
"add %3,%0\n"
"add %3,%1\n"
"paddd %%mm2,%%mm1\n"
"paddd %%mm4,%%mm3\n"
"paddd %%mm1,%%mm7\n"
"paddd %%mm3,%%mm7\n"
"decl %%ecx\n"
"jnz 1b\n"
"movq %%mm7,%%mm1\n"
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" ((long)line_size) , "m" (h)
: "%ecx");
return tmp;
}
/*
 * hf_noise8_mmx: sum of absolute second-order (horizontal x vertical) pixel
 * differences over a block that is 8 bytes wide and h rows high.
 *
 * For each row y and each column x = 0..6 the horizontal gradient
 *     g[y][x] = pix1[y][x] - pix1[y][x+1]
 * is formed, and the function returns the sum over all vertically adjacent
 * row pairs of |g[y][x] - g[y+1][x]|.
 *
 * pix1      - pointer to the first row; advanced inside the asm ("+r").
 * line_size - byte distance between consecutive rows.
 * h         - number of rows. Two rows are handled in the prologue and the
 *             loop then consumes two rows per pass, testing the counter only
 *             after the body has run, so h-2 has to be a positive even number
 *             (h even, h >= 4) for the loop to terminate as intended.
 *
 * Asm operands: %0 = pix1, %1 = tmp (result), %2 = (long)line_size, %3 = h-2.
 * ecx is used as the loop counter and declared clobbered. MMX registers
 * mm0-mm7 are overwritten and no emms is issued in this function.
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
int tmp;
asm volatile (
"movl %3,%%ecx\n" /* ecx = h-2: rows left after the two prologue rows */
"pxor %%mm7,%%mm7\n" /* mm7 = 0, zero source for byte->word unpacking */
"pxor %%mm6,%%mm6\n" /* mm6 = four 16-bit running sums */
/* ---- prologue, first row: gradient into (mm0 = x 0..3, mm2 = x 4..6) ---- */
"movq (%0),%%mm0\n"
"movq %%mm0, %%mm1\n"
/* shift left then right by one byte: clears the highest byte, keeps bytes 0..6 */
"psllq $8, %%mm0\n"
/* mm1 = bytes 1..7 moved down one lane, i.e. the right-hand neighbours */
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm0\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n" /* pix[x] - pix[x+1], low four lanes */
"psubw %%mm3, %%mm2\n" /* same for the high lanes (top lane is 0 - 0) */
"add %2,%0\n" /* step to the next row */
/* ---- prologue, second row: gradient into (mm4, mm5) ---- */
"movq (%0),%%mm4\n"
"movq %%mm4, %%mm1\n"
"psllq $8, %%mm4\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm4\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
/* (mm0, mm2) = gradient(first row) - gradient(second row) */
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
/* absolute value: build a sign mask (0 > v), then xor and subtract the mask */
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n" /* accumulate; (mm4, mm5) still hold the second row's gradient */
"add %2,%0\n"
/* ---- main loop: two rows per pass; the register pairs swap roles so the
   previous row's gradient never has to be copied ---- */
"1:\n"
/* new row gradient into (mm0, mm2) */
"movq (%0),%%mm0\n"
"movq %%mm0, %%mm1\n"
"psllq $8, %%mm0\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm0\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
/* (mm4, mm5) = previous gradient - new gradient, then absolute value */
"psubw %%mm0, %%mm4\n"
"psubw %%mm2, %%mm5\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm4, %%mm3\n\t"
"pcmpgtw %%mm5, %%mm1\n\t"
"pxor %%mm3, %%mm4\n"
"pxor %%mm1, %%mm5\n"
"psubw %%mm3, %%mm4\n"
"psubw %%mm1, %%mm5\n"
"paddw %%mm4, %%mm5\n"
"paddw %%mm5, %%mm6\n"
"add %2,%0\n"
/* next row gradient into (mm4, mm5); (mm0, mm2) is now the previous one */
"movq (%0),%%mm4\n"
"movq %%mm4, %%mm1\n"
"psllq $8, %%mm4\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm4\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n"
"add %2,%0\n"
"subl $2, %%ecx\n" /* two rows consumed */
" jnz 1b\n"
/* ---- reduction: widen the four 16-bit sums to 32 bits and add them up ---- */
"movq %%mm6, %%mm0\n"
"punpcklwd %%mm7,%%mm0\n"
"punpckhwd %%mm7,%%mm6\n"
"paddd %%mm0, %%mm6\n"
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n" /* bring the high dword down */
"paddd %%mm6,%%mm0\n"
"movd %%mm0,%1\n" /* low dword = total */
: "+r" (pix1), "=r"(tmp)
: "r" ((long)line_size) , "g" (h-2)
: "%ecx");
return tmp;
}
/*
 * hf_noise16_mmx: 16-pixel-wide counterpart of hf_noise8_mmx.
 *
 * The inline asm covers the column pairs (x, x+1) for x = 0..7: each row is
 * loaded twice, once at offset 0 and once at offset 1 (so bytes 0..8 of the
 * row are read), the two are subtracted to get the horizontal gradient, and
 * the absolute differences between the gradients of vertically adjacent rows
 * are summed. The remaining columns are handled by calling
 * hf_noise8_mmx(pix+8, ...) and adding its result.
 *
 * pix1      - pointer to the first row; advanced inside the asm ("+r").
 * line_size - byte distance between consecutive rows.
 * h         - number of rows. As in hf_noise8_mmx, two rows are handled before
 *             the loop and two per loop pass with the counter tested after the
 *             body, so h-2 has to be a positive even number.
 *
 * Asm operands: %0 = pix1, %1 = tmp (result), %2 = (long)line_size, %3 = h-2.
 * ecx is the loop counter (clobbered). mm0-mm7 are overwritten; no emms here.
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
int tmp;
/* keep the original start address: the asm below modifies pix1 */
uint8_t * pix= pix1;
asm volatile (
"movl %3,%%ecx\n" /* ecx = h-2: rows left after the two prologue rows */
"pxor %%mm7,%%mm7\n" /* mm7 = 0, zero source for unpacking */
"pxor %%mm6,%%mm6\n" /* mm6 = four 16-bit running sums */
/* ---- prologue, first row: gradient into (mm0 = x 0..3, mm2 = x 4..7) ---- */
"movq (%0),%%mm0\n" /* pixels x = 0..7 */
"movq 1(%0),%%mm1\n" /* right-hand neighbours x = 1..8 */
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
"add %2,%0\n" /* step to the next row */
/* ---- prologue, second row: gradient into (mm4, mm5) ---- */
"movq (%0),%%mm4\n"
"movq 1(%0),%%mm1\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
/* (mm0, mm2) = gradient(first row) - gradient(second row) */
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
/* absolute value: sign mask (0 > v), then xor and subtract the mask */
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n" /* accumulate; (mm4, mm5) still hold the second row's gradient */
"add %2,%0\n"
/* ---- main loop: two rows per pass, register pairs alternate roles ---- */
"1:\n"
/* new row gradient into (mm0, mm2) */
"movq (%0),%%mm0\n"
"movq 1(%0),%%mm1\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
/* (mm4, mm5) = previous gradient - new gradient, then absolute value */
"psubw %%mm0, %%mm4\n"
"psubw %%mm2, %%mm5\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm4, %%mm3\n\t"
"pcmpgtw %%mm5, %%mm1\n\t"
"pxor %%mm3, %%mm4\n"
"pxor %%mm1, %%mm5\n"
"psubw %%mm3, %%mm4\n"
"psubw %%mm1, %%mm5\n"
"paddw %%mm4, %%mm5\n"
"paddw %%mm5, %%mm6\n"
"add %2,%0\n"
/* next row gradient into (mm4, mm5); (mm0, mm2) is now the previous one */
"movq (%0),%%mm4\n"
"movq 1(%0),%%mm1\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n"
"add %2,%0\n"
"subl $2, %%ecx\n" /* two rows consumed */
" jnz 1b\n"
/* ---- reduction: widen the four 16-bit sums to 32 bits and add them up ---- */
"movq %%mm6, %%mm0\n"
"punpcklwd %%mm7,%%mm0\n"
"punpckhwd %%mm7,%%mm6\n"
"paddd %%mm0, %%mm6\n"
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n" /* bring the high dword down */
"paddd %%mm6,%%mm0\n"
"movd %%mm0,%1\n" /* low dword = total for column pairs 0..7 */
: "+r" (pix1), "=r"(tmp)
: "r" ((long)line_size) , "g" (h-2)
: "%ecx");
/* add the contribution of the right half, computed from the saved start pointer */
return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
/*
 * nsse16_mmx: compare two 16-pixel-wide blocks.
 *
 * Returns sse16_mmx() of the two blocks plus the absolute difference of their
 * hf_noise16_mmx() values multiplied by a weight. The weight is
 * avctx->nsse_weight when a context is supplied through p, and 8 when p is
 * NULL.
 */
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *ctx = p;
    int sse = sse16_mmx(ctx, pix1, pix2, line_size, h);
    int noise_delta = hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    /* no context available: fall back to the fixed weight */
    if (!ctx)
        return sse + ABS(noise_delta)*8;

    return sse + ABS(noise_delta)*ctx->avctx->nsse_weight;
}
/*
 * nsse8_mmx: compare two 8-pixel-wide blocks.
 *
 * Returns sse8_mmx() of the two blocks plus the absolute difference of their
 * hf_noise8_mmx() values multiplied by a weight. The weight is
 * avctx->nsse_weight when a context is supplied through p, and 8 when p is
 * NULL.
 */
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *ctx = p;
    int sse = sse8_mmx(ctx, pix1, pix2, line_size, h);
    int noise_delta = hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    /* no context available: fall back to the fixed weight */
    if (!ctx)
        return sse + ABS(noise_delta)*8;

    return sse + ABS(noise_delta)*ctx->avctx->nsse_weight;
}
/*
 * vsad_intra16_mmx: sum of absolute differences between vertically adjacent
 * rows of a single block that is 16 bytes wide and h rows high.
 *
 * v         - not used by this function.
 * pix       - pointer to the first row; asserted to have its low three address
 *             bits clear. Advanced inside the asm ("+r").
 * dummy     - not used by this function.
 * line_size - byte distance between consecutive rows; asserted to be a
 *             multiple of 8.
 * h         - number of rows. One row pair is handled before the loop and two
 *             per loop pass, with the counter tested after the body, so h-2
 *             has to be a positive even number for h-1 row pairs to be summed.
 *
 * The total is reduced with 16-bit adds, so only the low 16 bits of the
 * result are meaningful; the return value is masked accordingly.
 *
 * Asm operands: %0 = pix, %1 = tmp (result), %2 = (long)line_size, %3 = h.
 * ecx is the loop counter (clobbered). MMX registers are overwritten and no
 * emms is issued in this function.
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
int tmp;
assert( (((int)pix) & 7) == 0);
assert((line_size &7) ==0);
/*
 * SUM(in0, in1, out0, out1): process one row.
 *   in0/in1   - previous row (low and high 8 bytes); destroyed.
 *   out0/out1 - receive an untouched copy of the row just loaded, to serve
 *               as in0/in1 of the next invocation.
 * Steps: load 16 bytes at (%0) into mm2/mm3 and advance %0 by the stride;
 * form |new - previous| per byte by OR-ing the two saturating subtractions;
 * zero-extend the bytes to words using mm7 (must be 0); add the four word
 * vectors together and accumulate them into mm6.
 * mm2 and mm3 are scratch.
 */
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n"\
"movq 8(%0), %%mm3\n"\
"add %2,%0\n"\
"movq %%mm2, " #out0 "\n"\
"movq %%mm3, " #out1 "\n"\
"psubusb " #in0 ", %%mm2\n"\
"psubusb " #in1 ", %%mm3\n"\
"psubusb " #out0 ", " #in0 "\n"\
"psubusb " #out1 ", " #in1 "\n"\
"por %%mm2, " #in0 "\n"\
"por %%mm3, " #in1 "\n"\
"movq " #in0 ", %%mm2\n"\
"movq " #in1 ", %%mm3\n"\
"punpcklbw %%mm7, " #in0 "\n"\
"punpcklbw %%mm7, " #in1 "\n"\
"punpckhbw %%mm7, %%mm2\n"\
"punpckhbw %%mm7, %%mm3\n"\
"paddw " #in1 ", " #in0 "\n"\
"paddw %%mm3, %%mm2\n"\
"paddw %%mm2, " #in0 "\n"\
"paddw " #in0 ", %%mm6\n"
asm volatile (
"movl %3,%%ecx\n" /* ecx = h */
"pxor %%mm6,%%mm6\n" /* mm6 = four 16-bit running sums */
"pxor %%mm7,%%mm7\n" /* mm7 = 0, zero source for unpacking */
/* first row goes straight into (mm0, mm1) as the initial "previous row" */
"movq (%0),%%mm0\n"
"movq 8(%0),%%mm1\n"
"add %2,%0\n"
/* account for the first row and the row consumed by the SUM just below */
"subl $2, %%ecx\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* loop: two rows per pass; (mm0, mm1) and (mm4, mm5) alternate as previous/current */
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
/* fold the four word sums of mm6 into the low word of mm0 */
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6,%%mm0\n" /* words 0,1 = w0+w2, w1+w3 */
"movq %%mm0,%%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6,%%mm0\n" /* word 0 = w0+w1+w2+w3 (mod 2^16) */
"movd %%mm0,%1\n"
: "+r" (pix), "=r"(tmp)
: "r" ((long)line_size) , "m" (h)
: "%ecx");
/* only the low word holds the total; the word above it is a partial sum */
return tmp & 0xFFFF;
}
#undef SUM
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
int tmp;
assert( (((int)pix) & 7) == 0);
assert((line_size &7) ==0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n"\
"movq 8(%0), " #out1 "\n"\
"add %2,%0\n"\
"psadbw " #out0 ", " #in0 "\n"\
"psadbw " #out1 ", " #in1 "\n"\
"paddw " #in1 ", " #in0 "\n"\
"paddw " #in0 ", %%mm6\n"
asm volatile (
"movl %3,%%ecx\n"
"pxor %%mm6,%%mm6\n"
"pxor %%mm7,%%mm7\n"
"movq (%0),%%mm0\n"
"movq 8(%0),%%mm1\n"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -