📄 op_neon.h.svn-base
字号:
/* * ARM NEON vector operations. * * Copyright (c) 2007 CodeSourcery. * Written by Paul Brook * * This code is licenced under the GPL. *//* Note that for NEON an "l" prefix means it is a wide operation, unlike scalar arm ops where it means a word size operation. *//* ??? NEON ops should probably have their own float status. */#define NFS &env->vfp.fp_status#define NEON_OP(name) void OPPROTO op_neon_##name (void)NEON_OP(getreg_T0){ T0 = *(uint32_t *)((char *) env + PARAM1);}NEON_OP(getreg_T1){ T1 = *(uint32_t *)((char *) env + PARAM1);}NEON_OP(getreg_T2){ T2 = *(uint32_t *)((char *) env + PARAM1);}NEON_OP(setreg_T0){ *(uint32_t *)((char *) env + PARAM1) = T0;}NEON_OP(setreg_T1){ *(uint32_t *)((char *) env + PARAM1) = T1;}NEON_OP(setreg_T2){ *(uint32_t *)((char *) env + PARAM1) = T2;}#define NEON_TYPE1(name, type) \typedef struct \{ \ type v1; \} neon_##name;#ifdef WORDS_BIGENDIAN#define NEON_TYPE2(name, type) \typedef struct \{ \ type v2; \ type v1; \} neon_##name;#define NEON_TYPE4(name, type) \typedef struct \{ \ type v4; \ type v3; \ type v2; \ type v1; \} neon_##name;#else#define NEON_TYPE2(name, type) \typedef struct \{ \ type v1; \ type v2; \} neon_##name;#define NEON_TYPE4(name, type) \typedef struct \{ \ type v1; \ type v2; \ type v3; \ type v4; \} neon_##name;#endifNEON_TYPE4(s8, int8_t)NEON_TYPE4(u8, uint8_t)NEON_TYPE2(s16, int16_t)NEON_TYPE2(u16, uint16_t)NEON_TYPE1(s32, int32_t)NEON_TYPE1(u32, uint32_t)#undef NEON_TYPE4#undef NEON_TYPE2#undef NEON_TYPE1/* Copy from a uint32_t to a vector structure type. */#define NEON_UNPACK(vtype, dest, val) do { \ union { \ vtype v; \ uint32_t i; \ } conv_u; \ conv_u.i = (val); \ dest = conv_u.v; \ } while(0)/* Copy from a vector structure type to a uint32_t. */#define NEON_PACK(vtype, dest, val) do { \ union { \ vtype v; \ uint32_t i; \ } conv_u; \ conv_u.v = (val); \ dest = conv_u.i; \ } while(0)#define NEON_DO1 \ NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);#define NEON_DO2 \ NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);#define NEON_DO4 \ NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);#define NEON_VOP(name, vtype, n) \NEON_OP(name) \{ \ vtype vsrc1; \ vtype vsrc2; \ vtype vdest; \ NEON_UNPACK(vtype, vsrc1, T0); \ NEON_UNPACK(vtype, vsrc2, T1); \ NEON_DO##n; \ NEON_PACK(vtype, T0, vdest); \ FORCE_RET(); \}#define NEON_VOP1(name, vtype, n) \NEON_OP(name) \{ \ vtype vsrc1; \ vtype vdest; \ NEON_UNPACK(vtype, vsrc1, T0); \ NEON_DO##n; \ NEON_PACK(vtype, T0, vdest); \ FORCE_RET(); \}/* Pairwise operations. *//* For 32-bit elements each segment only contains a single element, so the elementwise and pairwise operations are the same. */#define NEON_PDO2 \ NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);#define NEON_PDO4 \ NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \#define NEON_POP(name, vtype, n) \NEON_OP(name) \{ \ vtype vsrc1; \ vtype vsrc2; \ vtype vdest; \ NEON_UNPACK(vtype, vsrc1, T0); \ NEON_UNPACK(vtype, vsrc2, T1); \ NEON_PDO##n; \ NEON_PACK(vtype, T0, vdest); \ FORCE_RET(); \}#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1NEON_VOP(hadd_s8, neon_s8, 4)NEON_VOP(hadd_u8, neon_u8, 4)NEON_VOP(hadd_s16, neon_s16, 2)NEON_VOP(hadd_u16, neon_u16, 2)#undef NEON_FNNEON_OP(hadd_s32){ int32_t src1 = T0; int32_t src2 = T1; int32_t dest; dest = (src1 >> 1) + (src2 >> 1); if (src1 & src2 & 1) dest++; T0 = dest; FORCE_RET();}NEON_OP(hadd_u32){ uint32_t src1 = T0; uint32_t src2 = T1; uint32_t dest; dest = (src1 >> 1) + (src2 >> 1); if (src1 & src2 & 1) dest++; T0 = dest; FORCE_RET();}#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1NEON_VOP(rhadd_s8, neon_s8, 4)NEON_VOP(rhadd_u8, neon_u8, 4)NEON_VOP(rhadd_s16, neon_s16, 2)NEON_VOP(rhadd_u16, neon_u16, 2)#undef NEON_FNNEON_OP(rhadd_s32){ int32_t src1 = T0; int32_t src2 = T1; int32_t dest; dest = (src1 >> 1) + (src2 >> 1); if ((src1 | src2) & 1) dest++; T0 = dest; FORCE_RET();}NEON_OP(rhadd_u32){ uint32_t src1 = T0; uint32_t src2 = T1; uint32_t dest; dest = (src1 >> 1) + (src2 >> 1); if ((src1 | src2) & 1) dest++; T0 = dest; FORCE_RET();}#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1NEON_VOP(hsub_s8, neon_s8, 4)NEON_VOP(hsub_u8, neon_u8, 4)NEON_VOP(hsub_s16, neon_s16, 2)NEON_VOP(hsub_u16, neon_u16, 2)#undef NEON_FNNEON_OP(hsub_s32){ int32_t src1 = T0; int32_t src2 = T1; int32_t dest; dest = (src1 >> 1) - (src2 >> 1); if ((~src1) & src2 & 1) dest--; T0 = dest; FORCE_RET();}NEON_OP(hsub_u32){ uint32_t src1 = T0; uint32_t src2 = T1; uint32_t dest; dest = (src1 >> 1) - (src2 >> 1); if ((~src1) & src2 & 1) dest--; T0 = dest; FORCE_RET();}/* ??? bsl, bif and bit are all the same op, just with the oparands in a differnet order. It's currently easier to have 3 differnt ops than rearange the operands. *//* Bitwise Select. */NEON_OP(bsl){ T0 = (T0 & T2) | (T1 & ~T2);}/* Bitwise Insert If True. */NEON_OP(bit){ T0 = (T0 & T1) | (T2 & ~T1);}/* Bitwise Insert If False. */NEON_OP(bif){ T0 = (T2 & T1) | (T0 & ~T1);}#define NEON_USAT(dest, src1, src2, type) do { \ uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ if (tmp != (type)tmp) { \ env->QF = 1; \ dest = ~0; \ } else { \ dest = tmp; \ }} while(0)#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)NEON_VOP(qadd_u8, neon_u8, 4)#undef NEON_FN#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)NEON_VOP(qadd_u16, neon_u16, 2)#undef NEON_FN#undef NEON_USAT#define NEON_SSAT(dest, src1, src2, type) do { \ int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ if (tmp != (type)tmp) { \ env->QF = 1; \ if (src2 > 0) { \ tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ } else { \ tmp = 1 << (sizeof(type) * 8 - 1); \ } \ } \ dest = tmp; \ } while(0)#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)NEON_VOP(qadd_s8, neon_s8, 4)#undef NEON_FN#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)NEON_VOP(qadd_s16, neon_s16, 2)#undef NEON_FN#undef NEON_SSAT#define NEON_USAT(dest, src1, src2, type) do { \ uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ if (tmp != (type)tmp) { \ env->QF = 1; \ dest = 0; \ } else { \ dest = tmp; \ }} while(0)#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)NEON_VOP(qsub_u8, neon_u8, 4)#undef NEON_FN#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)NEON_VOP(qsub_u16, neon_u16, 2)#undef NEON_FN#undef NEON_USAT#define NEON_SSAT(dest, src1, src2, type) do { \ int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ if (tmp != (type)tmp) { \ env->QF = 1; \ if (src2 < 0) { \ tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ } else { \ tmp = 1 << (sizeof(type) * 8 - 1); \ } \ } \ dest = tmp; \ } while(0)#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)NEON_VOP(qsub_s8, neon_s8, 4)#undef NEON_FN#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)NEON_VOP(qsub_s16, neon_s16, 2)#undef NEON_FN#undef NEON_SSAT#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0NEON_VOP(cgt_s8, neon_s8, 4)NEON_VOP(cgt_u8, neon_u8, 4)NEON_VOP(cgt_s16, neon_s16, 2)NEON_VOP(cgt_u16, neon_u16, 2)NEON_VOP(cgt_s32, neon_s32, 1)NEON_VOP(cgt_u32, neon_u32, 1)#undef NEON_FN#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0NEON_VOP(cge_s8, neon_s8, 4)NEON_VOP(cge_u8, neon_u8, 4)NEON_VOP(cge_s16, neon_s16, 2)NEON_VOP(cge_u16, neon_u16, 2)NEON_VOP(cge_s32, neon_s32, 1)NEON_VOP(cge_u32, neon_u32, 1)#undef NEON_FN#define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src2; \ if (tmp < 0) { \ dest = src1 >> -tmp; \ } else { \ dest = src1 << tmp; \ }} while (0)NEON_VOP(shl_s8, neon_s8, 4)NEON_VOP(shl_u8, neon_u8, 4)NEON_VOP(shl_s16, neon_s16, 2)NEON_VOP(shl_u16, neon_u16, 2)NEON_VOP(shl_s32, neon_s32, 1)NEON_VOP(shl_u32, neon_u32, 1)#undef NEON_FNNEON_OP(shl_u64){ int8_t shift = T2; uint64_t val = T0 | ((uint64_t)T1 << 32); if (shift < 0) { val >>= -shift; } else { val <<= shift; } T0 = val; T1 = val >> 32; FORCE_RET();}NEON_OP(shl_s64){ int8_t shift = T2; int64_t val = T0 | ((uint64_t)T1 << 32); if (shift < 0) { val >>= -shift; } else { val <<= shift; } T0 = val; T1 = val >> 32; FORCE_RET();}#define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src1; \ if (tmp < 0) { \ dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ } else { \ dest = src2 << tmp; \ }} while (0)NEON_VOP(rshl_s8, neon_s8, 4)NEON_VOP(rshl_u8, neon_u8, 4)NEON_VOP(rshl_s16, neon_s16, 2)NEON_VOP(rshl_u16, neon_u16, 2)NEON_VOP(rshl_s32, neon_s32, 1)NEON_VOP(rshl_u32, neon_u32, 1)#undef NEON_FNNEON_OP(rshl_u64){ int8_t shift = T2; uint64_t val = T0 | ((uint64_t)T1 << 32); if (shift < 0) { val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift; val >>= -shift; } else { val <<= shift; } T0 = val; T1 = val >> 32; FORCE_RET();}NEON_OP(rshl_s64){ int8_t shift = T2; int64_t val = T0 | ((uint64_t)T1 << 32); if (shift < 0) { val = (val + ((int64_t)1 << (-1 - shift))) >> -shift; } else { val <<= shift; } T0 = val; T1 = val >> 32; FORCE_RET();}#define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src1; \ if (tmp < 0) { \ dest = src2 >> -tmp; \ } else { \ dest = src2 << tmp; \ if ((dest >> tmp) != src2) { \ env->QF = 1; \ dest = ~0; \ } \ }} while (0)NEON_VOP(qshl_s8, neon_s8, 4)NEON_VOP(qshl_s16, neon_s16, 2)NEON_VOP(qshl_s32, neon_s32, 1)#undef NEON_FNNEON_OP(qshl_s64){ int8_t shift = T2; int64_t val = T0 | ((uint64_t)T1 << 32); if (shift < 0) { val >>= -shift; } else { int64_t tmp = val; val <<= shift; if ((val >> shift) != tmp) { env->QF = 1; val = (tmp >> 63) ^ 0x7fffffffffffffffULL; } } T0 = val; T1 = val >> 32; FORCE_RET();}#define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src1; \ if (tmp < 0) { \ dest = src2 >> -tmp; \ } else { \ dest = src2 << tmp; \ if ((dest >> tmp) != src2) { \ env->QF = 1; \ dest = src2 >> 31; \ } \ }} while (0)NEON_VOP(qshl_u8, neon_u8, 4)NEON_VOP(qshl_u16, neon_u16, 2)NEON_VOP(qshl_u32, neon_u32, 1)#undef NEON_FNNEON_OP(qshl_u64){ int8_t shift = T2; uint64_t val = T0 | ((uint64_t)T1 << 32); if (shift < 0) { val >>= -shift; } else { uint64_t tmp = val; val <<= shift; if ((val >> shift) != tmp) { env->QF = 1; val = ~(uint64_t)0; } } T0 = val; T1 = val >> 32; FORCE_RET();}#define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src1; \ if (tmp < 0) { \ dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ } else { \ dest = src2 << tmp; \ if ((dest >> tmp) != src2) { \ dest = ~0; \ } \ }} while (0)NEON_VOP(qrshl_s8, neon_s8, 4)NEON_VOP(qrshl_s16, neon_s16, 2)NEON_VOP(qrshl_s32, neon_s32, 1)#undef NEON_FN#define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src1; \ if (tmp < 0) { \ dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ } else { \ dest = src2 << tmp; \ if ((dest >> tmp) != src2) { \ env->QF = 1; \ dest = src2 >> 31; \ } \ }} while (0)NEON_VOP(qrshl_u8, neon_u8, 4)NEON_VOP(qrshl_u16, neon_u16, 2)NEON_VOP(qrshl_u32, neon_u32, 1)#undef NEON_FN#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2NEON_VOP(max_s8, neon_s8, 4)NEON_VOP(max_u8, neon_u8, 4)NEON_VOP(max_s16, neon_s16, 2)NEON_VOP(max_u16, neon_u16, 2)NEON_VOP(max_s32, neon_s32, 1)NEON_VOP(max_u32, neon_u32, 1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -