⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dsputil_mmx.c

📁 君正早期ucos系统(只有早期版本才没有打包成库),包含MPLAYER、文件系统、图片解码、浏览、电子书、录音;想学ucos、识货的人就下吧 russblock fmradio explore set
💻 C
📖 第 1 页 / 共 5 页
字号:
         "lea (%3, %3), %%"REG_a"       \n\t"         ASMALIGN(3)         "1:                            \n\t"         "movd (%1), %%mm0              \n\t"         "movd (%1, %3), %%mm1          \n\t"         "movd %%mm0, (%2)              \n\t"         "movd %%mm1, (%2, %3)          \n\t"         "add %%"REG_a", %1             \n\t"         "add %%"REG_a", %2             \n\t"         "movd (%1), %%mm0              \n\t"         "movd (%1, %3), %%mm1          \n\t"         "movd %%mm0, (%2)              \n\t"         "movd %%mm1, (%2, %3)          \n\t"         "add %%"REG_a", %1             \n\t"         "add %%"REG_a", %2             \n\t"         "subl $4, %0                   \n\t"         "jnz 1b                        \n\t"         : "+g"(h), "+r" (pixels),  "+r" (block)         : "r"((long)line_size)         : "%"REG_a, "memory"        );}static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){    __asm __volatile(         "lea (%3, %3), %%"REG_a"       \n\t"         ASMALIGN(3)         "1:                            \n\t"         "movq (%1), %%mm0              \n\t"         "movq (%1, %3), %%mm1          \n\t"         "movq %%mm0, (%2)              \n\t"         "movq %%mm1, (%2, %3)          \n\t"         "add %%"REG_a", %1             \n\t"         "add %%"REG_a", %2             \n\t"         "movq (%1), %%mm0              \n\t"         "movq (%1, %3), %%mm1          \n\t"         "movq %%mm0, (%2)              \n\t"         "movq %%mm1, (%2, %3)          \n\t"         "add %%"REG_a", %1             \n\t"         "add %%"REG_a", %2             \n\t"         "subl $4, %0                   \n\t"         "jnz 1b                        \n\t"         : "+g"(h), "+r" (pixels),  "+r" (block)         : "r"((long)line_size)         : "%"REG_a, "memory"        );}static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){    __asm __volatile(         "lea (%3, %3), %%"REG_a"       \n\t"         
ASMALIGN(3)         "1:                            \n\t"         "movq (%1), %%mm0              \n\t"         "movq 8(%1), %%mm4             \n\t"         "movq (%1, %3), %%mm1          \n\t"         "movq 8(%1, %3), %%mm5         \n\t"         "movq %%mm0, (%2)              \n\t"         "movq %%mm4, 8(%2)             \n\t"         "movq %%mm1, (%2, %3)          \n\t"         "movq %%mm5, 8(%2, %3)         \n\t"         "add %%"REG_a", %1             \n\t"         "add %%"REG_a", %2             \n\t"         "movq (%1), %%mm0              \n\t"         "movq 8(%1), %%mm4             \n\t"         "movq (%1, %3), %%mm1          \n\t"         "movq 8(%1, %3), %%mm5         \n\t"         "movq %%mm0, (%2)              \n\t"         "movq %%mm4, 8(%2)             \n\t"         "movq %%mm1, (%2, %3)          \n\t"         "movq %%mm5, 8(%2, %3)         \n\t"         "add %%"REG_a", %1             \n\t"         "add %%"REG_a", %2             \n\t"         "subl $4, %0                   \n\t"         "jnz 1b                        \n\t"         : "+g"(h), "+r" (pixels),  "+r" (block)         : "r"((long)line_size)         : "%"REG_a, "memory"        );}static void clear_blocks_mmx(DCTELEM *blocks){    __asm __volatile(                "pxor %%mm7, %%mm7              \n\t"                "mov $-128*6, %%"REG_a"         \n\t"                "1:                             \n\t"                "movq %%mm7, (%0, %%"REG_a")    \n\t"                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"                "add $32, %%"REG_a"             \n\t"                " js 1b                         \n\t"                : : "r" (((uint8_t *)blocks)+128*6)                : "%"REG_a        );}#ifdef CONFIG_ENCODERSstatic int pix_sum16_mmx(uint8_t * pix, int line_size){    const int h=16;    int sum;    long index= -line_size*h;    __asm __volatile(                "pxor %%mm7, %%mm7           
   \n\t"                "pxor %%mm6, %%mm6              \n\t"                "1:                             \n\t"                "movq (%2, %1), %%mm0           \n\t"                "movq (%2, %1), %%mm1           \n\t"                "movq 8(%2, %1), %%mm2          \n\t"                "movq 8(%2, %1), %%mm3          \n\t"                "punpcklbw %%mm7, %%mm0         \n\t"                "punpckhbw %%mm7, %%mm1         \n\t"                "punpcklbw %%mm7, %%mm2         \n\t"                "punpckhbw %%mm7, %%mm3         \n\t"                "paddw %%mm0, %%mm1             \n\t"                "paddw %%mm2, %%mm3             \n\t"                "paddw %%mm1, %%mm3             \n\t"                "paddw %%mm3, %%mm6             \n\t"                "add %3, %1                     \n\t"                " js 1b                         \n\t"                "movq %%mm6, %%mm5              \n\t"                "psrlq $32, %%mm6               \n\t"                "paddw %%mm5, %%mm6             \n\t"                "movq %%mm6, %%mm5              \n\t"                "psrlq $16, %%mm6               \n\t"                "paddw %%mm5, %%mm6             \n\t"                "movd %%mm6, %0                 \n\t"                "andl $0xFFFF, %0               \n\t"                : "=&r" (sum), "+r" (index)                : "r" (pix - index), "r" ((long)line_size)        );        return sum;}#endif //CONFIG_ENCODERSstatic void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){    long i=0;    asm volatile(        "1:                             \n\t"        "movq  (%1, %0), %%mm0          \n\t"        "movq  (%2, %0), %%mm1          \n\t"        "paddb %%mm0, %%mm1             \n\t"        "movq %%mm1, (%2, %0)           \n\t"        "movq 8(%1, %0), %%mm0          \n\t"        "movq 8(%2, %0), %%mm1          \n\t"        "paddb %%mm0, %%mm1             \n\t"        "movq %%mm1, 8(%2, %0)          \n\t"        "add $16, %0                    \n\t"        "cmp %3, %0     
                \n\t"        " jb 1b                         \n\t"        : "+r" (i)        : "r"(src), "r"(dst), "r"((long)w-15)    );    for(; i<w; i++)        dst[i+0] += src[i+0];}#define H263_LOOP_FILTER \        "pxor %%mm7, %%mm7              \n\t"\        "movq  %0, %%mm0                \n\t"\        "movq  %0, %%mm1                \n\t"\        "movq  %3, %%mm2                \n\t"\        "movq  %3, %%mm3                \n\t"\        "punpcklbw %%mm7, %%mm0         \n\t"\        "punpckhbw %%mm7, %%mm1         \n\t"\        "punpcklbw %%mm7, %%mm2         \n\t"\        "punpckhbw %%mm7, %%mm3         \n\t"\        "psubw %%mm2, %%mm0             \n\t"\        "psubw %%mm3, %%mm1             \n\t"\        "movq  %1, %%mm2                \n\t"\        "movq  %1, %%mm3                \n\t"\        "movq  %2, %%mm4                \n\t"\        "movq  %2, %%mm5                \n\t"\        "punpcklbw %%mm7, %%mm2         \n\t"\        "punpckhbw %%mm7, %%mm3         \n\t"\        "punpcklbw %%mm7, %%mm4         \n\t"\        "punpckhbw %%mm7, %%mm5         \n\t"\        "psubw %%mm2, %%mm4             \n\t"\        "psubw %%mm3, %%mm5             \n\t"\        "psllw $2, %%mm4                \n\t"\        "psllw $2, %%mm5                \n\t"\        "paddw %%mm0, %%mm4             \n\t"\        "paddw %%mm1, %%mm5             \n\t"\        "pxor %%mm6, %%mm6              \n\t"\        "pcmpgtw %%mm4, %%mm6           \n\t"\        "pcmpgtw %%mm5, %%mm7           \n\t"\        "pxor %%mm6, %%mm4              \n\t"\        "pxor %%mm7, %%mm5              \n\t"\        "psubw %%mm6, %%mm4             \n\t"\        "psubw %%mm7, %%mm5             \n\t"\        "psrlw $3, %%mm4                \n\t"\        "psrlw $3, %%mm5                \n\t"\        "packuswb %%mm5, %%mm4          \n\t"\        "packsswb %%mm7, %%mm6          \n\t"\        "pxor %%mm7, %%mm7              \n\t"\        "movd %4, %%mm2                 \n\t"\        "punpcklbw %%mm2, %%mm2         
\n\t"\        "punpcklbw %%mm2, %%mm2         \n\t"\        "punpcklbw %%mm2, %%mm2         \n\t"\        "psubusb %%mm4, %%mm2           \n\t"\        "movq %%mm2, %%mm3              \n\t"\        "psubusb %%mm4, %%mm3           \n\t"\        "psubb %%mm3, %%mm2             \n\t"\        "movq %1, %%mm3                 \n\t"\        "movq %2, %%mm4                 \n\t"\        "pxor %%mm6, %%mm3              \n\t"\        "pxor %%mm6, %%mm4              \n\t"\        "paddusb %%mm2, %%mm3           \n\t"\        "psubusb %%mm2, %%mm4           \n\t"\        "pxor %%mm6, %%mm3              \n\t"\        "pxor %%mm6, %%mm4              \n\t"\        "paddusb %%mm2, %%mm2           \n\t"\        "packsswb %%mm1, %%mm0          \n\t"\        "pcmpgtb %%mm0, %%mm7           \n\t"\        "pxor %%mm7, %%mm0              \n\t"\        "psubb %%mm7, %%mm0             \n\t"\        "movq %%mm0, %%mm1              \n\t"\        "psubusb %%mm2, %%mm0           \n\t"\        "psubb %%mm0, %%mm1             \n\t"\        "pand %5, %%mm1                 \n\t"\        "psrlw $2, %%mm1                \n\t"\        "pxor %%mm7, %%mm1              \n\t"\        "psubb %%mm7, %%mm1             \n\t"\        "movq %0, %%mm5                 \n\t"\        "movq %3, %%mm6                 \n\t"\        "psubb %%mm1, %%mm5             \n\t"\        "paddb %%mm1, %%mm6             \n\t"static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){    if(ENABLE_ANY_H263) {    const int strength= ff_h263_loop_filter_strength[qscale];    asm volatile(        H263_LOOP_FILTER        "movq %%mm3, %1                 \n\t"        "movq %%mm4, %2                 \n\t"        "movq %%mm5, %0                 \n\t"        "movq %%mm6, %3                 \n\t"        : "+m" (*(uint64_t*)(src - 2*stride)),          "+m" (*(uint64_t*)(src - 1*stride)),          "+m" (*(uint64_t*)(src + 0*stride)),          "+m" (*(uint64_t*)(src + 1*stride))        : "g" (2*strength), "m"(ff_pb_FC)    );    
}}static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...        "movd  %4, %%mm0                \n\t"        "movd  %5, %%mm1                \n\t"        "movd  %6, %%mm2                \n\t"        "movd  %7, %%mm3                \n\t"        "punpcklbw %%mm1, %%mm0         \n\t"        "punpcklbw %%mm3, %%mm2         \n\t"        "movq %%mm0, %%mm1              \n\t"        "punpcklwd %%mm2, %%mm0         \n\t"        "punpckhwd %%mm2, %%mm1         \n\t"        "movd  %%mm0, %0                \n\t"        "punpckhdq %%mm0, %%mm0         \n\t"        "movd  %%mm0, %1                \n\t"        "movd  %%mm1, %2                \n\t"        "punpckhdq %%mm1, %%mm1         \n\t"        "movd  %%mm1, %3                \n\t"        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),          "=m" (*(uint32_t*)(dst + 1*dst_stride)),          "=m" (*(uint32_t*)(dst + 2*dst_stride)),          "=m" (*(uint32_t*)(dst + 3*dst_stride))        :  "m" (*(uint32_t*)(src + 0*src_stride)),           "m" (*(uint32_t*)(src + 1*src_stride)),           "m" (*(uint32_t*)(src + 2*src_stride)),           "m" (*(uint32_t*)(src + 3*src_stride))    );}static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){    if(ENABLE_ANY_H263) {    const int strength= ff_h263_loop_filter_strength[qscale];    uint64_t temp[4] __attribute__ ((aligned(8)));    uint8_t *btemp= (uint8_t*)temp;    src -= 2;    transpose4x4(btemp  , src           , 8, stride);    transpose4x4(btemp+4, src + 4*stride, 8, stride);    asm volatile(        H263_LOOP_FILTER // 5 3 4 6        : "+m" (temp[0]),          "+m" (temp[1]),          "+m" (temp[2]),          "+m" (temp[3])        : "g" (2*strength), "m"(ff_pb_FC)    );    asm volatile(        "movq %%mm5, %%mm1              \n\t"        "movq %%mm4, %%mm0              \n\t"        "punpcklbw %%mm3, %%mm5         \n\t"        "punpcklbw %%mm6, %%mm4         
\n\t"        "punpckhbw %%mm3, %%mm1         \n\t"        "punpckhbw %%mm6, %%mm0         \n\t"        "movq %%mm5, %%mm3              \n\t"        "movq %%mm1, %%mm6              \n\t"        "punpcklwd %%mm4, %%mm5         \n\t"        "punpcklwd %%mm0, %%mm1         \n\t"        "punpckhwd %%mm4, %%mm3         \n\t"        "punpckhwd %%mm0, %%mm6         \n\t"        "movd %%mm5, (%0)               \n\t"        "punpckhdq %%mm5, %%mm5         \n\t"        "movd %%mm5, (%0,%2)            \n\t"        "movd %%mm3, (%0,%2,2)          \n\t"        "punpckhdq %%mm3, %%mm3         \n\t"        "movd %%mm3, (%0,%3)            \n\t"        "movd %%mm1, (%1)               \n\t"        "punpckhdq %%mm1, %%mm1         \n\t"        "movd %%mm1, (%1,%2)            \n\t"        "movd %%mm6, (%1,%2,2)          \n\t"        "punpckhdq %%mm6, %%mm6         \n\t"        "movd %%mm6, (%1,%3)            \n\t"        :: "r" (src),           "r" (src + 4*stride),           "r" ((long)   stride ),           "r" ((long)(3*stride))    );    }}#ifdef CONFIG_ENCODERSstatic int pix_norm1_mmx(uint8_t *pix, int line_size) {    int tmp;  asm volatile (      "movl $16,%%ecx\n"      "pxor %%mm0,%%mm0\n"      "pxor %%mm7,%%mm7\n"      "1:\n"      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */      "pmaddwd %%mm3,%%mm3\n"      "pmaddwd %%mm4,%%mm4\n"      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,                                          
pix2^2+pix3^2+pix6^2+pix7^2) */      "paddd %%mm3,%%mm4\n"      "paddd %%mm2,%%mm7\n"      "add %2, %0\n"      "paddd %%mm4,%%mm7\n"      "dec %%ecx\n"      "jnz 1b\n"      "movq %%mm7,%%mm1\n"      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */      "paddd %%mm7,%%mm1\n"      "movd %%mm1,%1\n"      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );    return tmp;}static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {    int tmp;  asm volatile (      "movl %4,%%ecx\n"      "shr $1,%%ecx\n"      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */      "1:\n"      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -