op_neon.h.svn-base
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(max_f32)
{
    float32 f0 = vfp_itos(T0);
    float32 f1 = vfp_itos(T1);
    T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
    FORCE_RET();
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(min_f32)
{
    float32 f0 = vfp_itos(T0);
    float32 f1 = vfp_itos(T1);
    T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
    FORCE_RET();
}

#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN

NEON_OP(abd_f32)
{
    float32 f0 = vfp_itos(T0);
    float32 f1 = vfp_itos(T1);
    T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
                  ? float32_sub(f0, f1, NFS)
                  : float32_sub(f1, f0, NFS));
    FORCE_RET();
}

#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_VOP(add_u8, neon_u8, 4)
NEON_VOP(add_u16, neon_u16, 2)
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(add_f32)
{
    T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
    FORCE_RET();
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(sub_f32)
{
    T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
    FORCE_RET();
}

#define NEON_FN(dest, src1, src2) dest = src2 - src1
NEON_VOP(rsb_u8, neon_u8, 4)
NEON_VOP(rsb_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(rsb_f32)
{
    T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
    FORCE_RET();
}

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(mul_f32)
{
    T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
    FORCE_RET();
}

NEON_OP(mul_p8)
{
    T0 = helper_neon_mul_p8(T0, T1);
}

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
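/*
 * Illustrative note (editorial addition, not part of the original file):
 * each NEON_VOP(name, vtype, n) above is assumed to apply the currently
 * defined NEON_FN to the n lanes packed into T0 and T1 and repack the
 * results into T0.  As a worked example of the absolute-difference
 * NEON_FN used for the abd_* ops,
 *     dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
 * abd_u8 on lanes {1, 200, 7, 9} and {5, 10, 7, 3} yields {4, 190, 0, 6},
 * while abd_s8 reads the 200 bit pattern (0xc8) as -56 and yields
 * 10 - (-56) = 66 in that lane.
 */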
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        env->QF = 1; \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            env->QF = 1; \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define SIGNBIT64 ((uint64_t)1 << 63)
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        env->QF = 1; \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            env->QF = 1; \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
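/*
 * Worked example (editorial addition, not part of the original file):
 * NEON_QDMULH16 computes the signed doubling multiply and keeps the high
 * half, saturating and setting env->QF on overflow.  For qdmulh_s16 with
 * lanes 0x4000 * 0x4000: tmp = 0x10000000, doubled to 0x20000000, so the
 * result lane is 0x2000.  The only overflowing input is 0x8000 * 0x8000
 * (tmp = 0x40000000), which saturates the lane to 0x7fff and sets QF.
 * The rounding variant (qrdmulh_s16) additionally adds 1 << 15 before
 * taking the high half.
 */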
NEON_OP(recps_f32)
{
    T0 = vfp_stoi(helper_recps_f32(vfp_itos(T0), vfp_itos(T1)));
    FORCE_RET();
}

NEON_OP(rsqrts_f32)
{
    T0 = vfp_stoi(helper_rsqrts_f32(vfp_itos(T0), vfp_itos(T1)));
    FORCE_RET();
}

/* Floating point comparisons produce an integer result.  */
#define NEON_VOP_FCMP(name, cmp) \
NEON_OP(name) \
{ \
    if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
        T0 = -1; \
    else \
        T0 = 0; \
    FORCE_RET(); \
}

NEON_VOP_FCMP(ceq_f32, ==)
NEON_VOP_FCMP(cge_f32, >=)
NEON_VOP_FCMP(cgt_f32, >)

NEON_OP(acge_f32)
{
    float32 f0 = float32_abs(vfp_itos(T0));
    float32 f1 = float32_abs(vfp_itos(T1));
    T0 = (float32_compare_quiet(f0, f1, NFS) >= 0) ? -1 : 0;
    FORCE_RET();
}

NEON_OP(acgt_f32)
{
    float32 f0 = float32_abs(vfp_itos(T0));
    float32 f1 = float32_abs(vfp_itos(T1));
    T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
    FORCE_RET();
}

/* Narrowing instructions.  The named type is the destination type.  */
NEON_OP(narrow_u8)
{
    T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
         | ((T1 << 16) & 0xff0000) | (T1 << 24);
    FORCE_RET();
}

NEON_OP(narrow_sat_u8)
{
    neon_u16 src;
    neon_u8 dest;
#define SAT8(d, s) \
    if (s > 0xff) { \
        d = 0xff; \
        env->QF = 1; \
    } else { \
        d = s; \
    }

    NEON_UNPACK(neon_u16, src, T0);
    SAT8(dest.v1, src.v1);
    SAT8(dest.v2, src.v2);
    NEON_UNPACK(neon_u16, src, T1);
    SAT8(dest.v3, src.v1);
    SAT8(dest.v4, src.v2);
    NEON_PACK(neon_u8, T0, dest);
    FORCE_RET();
#undef SAT8
}

NEON_OP(narrow_sat_s8)
{
    neon_s16 src;
    neon_s8 dest;
#define SAT8(d, s) \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        env->QF = 1; \
    } else { \
        d = s; \
    }

    NEON_UNPACK(neon_s16, src, T0);
    SAT8(dest.v1, src.v1);
    SAT8(dest.v2, src.v2);
    NEON_UNPACK(neon_s16, src, T1);
    SAT8(dest.v3, src.v1);
    SAT8(dest.v4, src.v2);
    NEON_PACK(neon_s8, T0, dest);
    FORCE_RET();
#undef SAT8
}

NEON_OP(narrow_u16)
{
    T0 = (T0 & 0xffff) | (T1 << 16);
}

NEON_OP(narrow_sat_u16)
{
    if (T0 > 0xffff) {
        T0 = 0xffff;
        env->QF = 1;
    }
    if (T1 > 0xffff) {
        T1 = 0xffff;
        env->QF = 1;
    }
    T0 |= T1 << 16;
    FORCE_RET();
}

NEON_OP(narrow_sat_s16)
{
    if ((int32_t)T0 != (int16_t)T0) {
        T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
        env->QF = 1;
    }
    if ((int32_t)T1 != (int16_t)T1) {
        T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
        env->QF = 1;
    }
    T0 = (uint16_t)T0 | (T1 << 16);
    FORCE_RET();
}

NEON_OP(narrow_sat_u32)
{
    if (T1) {
        T0 = 0xffffffffu;
        env->QF = 1;
    }
    FORCE_RET();
}

NEON_OP(narrow_sat_s32)
{
    int32_t sign = (int32_t)T1 >> 31;
    if ((int32_t)T1 != sign) {
        T0 = sign ^ 0x7fffffff;
        env->QF = 1;
    }
    FORCE_RET();
}
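/*
 * Worked example (editorial addition, not part of the original file):
 * the narrow_sat_* ops narrow each wider lane with saturation and set
 * env->QF when a value does not fit.  For narrow_sat_s16 above, a 32-bit
 * lane of 0x00012345 (74565) is out of int16_t range, so it becomes
 * ((int32_t)74565 >> 31) ^ 0x7fff = 0x7fff and QF is set, while
 * 0xffff8000 (-32768) fits and narrows to 0x8000 unchanged.
 */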
/* Narrowing instructions.  Named type is the narrow type.  */
NEON_OP(narrow_high_u8)
{
    T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
         | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
    FORCE_RET();
}

NEON_OP(narrow_high_u16)
{
    T0 = (T0 >> 16) | (T1 & 0xffff0000);
    FORCE_RET();
}

NEON_OP(narrow_high_round_u8)
{
    T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
         | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
    FORCE_RET();
}

NEON_OP(narrow_high_round_u16)
{
    T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
    FORCE_RET();
}

NEON_OP(narrow_high_round_u32)
{
    if (T0 >= 0x80000000u)
        T0 = T1 + 1;
    else
        T0 = T1;
    FORCE_RET();
}

/* Widening instructions.  Named type is source type.  */
NEON_OP(widen_s8)
{
    uint32_t src;

    src = T0;
    T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
    T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
}

NEON_OP(widen_u8)
{
    T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
    T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
}

NEON_OP(widen_s16)
{
    int32_t src;

    src = T0;
    T0 = (int16_t)src;
    T1 = src >> 16;
}

NEON_OP(widen_u16)
{
    T1 = T0 >> 16;
    T0 &= 0xffff;
}

NEON_OP(widen_s32)
{
    T1 = (int32_t)T0 >> 31;
    FORCE_RET();
}

NEON_OP(widen_high_u8)
{
    T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
    T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
}

NEON_OP(widen_high_u16)
{
    T1 = T0 & 0xffff0000;
    T0 <<= 16;
}

/* Long operations.  The type is the wide type.  */
NEON_OP(shll_u16)
{
    int shift = PARAM1;
    uint32_t mask;

    mask = 0xffff >> (16 - shift);
    mask |= mask << 16;
    mask = ~mask;

    T0 = (T0 << shift) & mask;
    T1 = (T1 << shift) & mask;
    FORCE_RET();
}

NEON_OP(shll_u64)
{
    int shift = PARAM1;

    T1 <<= shift;
    T1 |= T0 >> (32 - shift);
    T0 <<= shift;
    FORCE_RET();
}

NEON_OP(addl_u16)
{
    uint32_t tmp;
    uint32_t high;

    tmp = env->vfp.scratch[0];
    high = (T0 >> 16) + (tmp >> 16);
    T0 = (uint16_t)(T0 + tmp);
    T0 |= (high << 16);
    tmp = env->vfp.scratch[1];
    high = (T1 >> 16) + (tmp >> 16);
    T1 = (uint16_t)(T1 + tmp);
    T1 |= (high << 16);
    FORCE_RET();
}

NEON_OP(addl_u32)
{
    T0 += env->vfp.scratch[0];
    T1 += env->vfp.scratch[1];
    FORCE_RET();
}

NEON_OP(addl_u64)
{
    uint64_t tmp;

    tmp = T0 | ((uint64_t)T1 << 32);
    tmp += env->vfp.scratch[0];
    tmp += (uint64_t)env->vfp.scratch[1] << 32;
    T0 = tmp;
    T1 = tmp >> 32;
    FORCE_RET();
}

NEON_OP(subl_u16)
{
    uint32_t tmp;
    uint32_t high;

    tmp = env->vfp.scratch[0];
    high = (T0 >> 16) - (tmp >> 16);
    T0 = (uint16_t)(T0 - tmp);
    T0 |= (high << 16);
    tmp = env->vfp.scratch[1];
    high = (T1 >> 16) - (tmp >> 16);
    T1 = (uint16_t)(T1 - tmp);
    T1 |= (high << 16);
    FORCE_RET();
}

NEON_OP(subl_u32)
{
    T0 -= env->vfp.scratch[0];
    T1 -= env->vfp.scratch[1];
    FORCE_RET();
}

NEON_OP(subl_u64)
{
    uint64_t tmp;

    tmp = T0 | ((uint64_t)T1 << 32);
    tmp -= env->vfp.scratch[0];
    tmp -= (uint64_t)env->vfp.scratch[1] << 32;
    T0 = tmp;
    T1 = tmp >> 32;
    FORCE_RET();
}

#define DO_ABD(dest, x, y, type) do { \
    type tmp_x = x; \
    type tmp_y = y; \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

NEON_OP(abdl_u16)
{
    uint32_t tmp;
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, uint8_t);
    DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
    low |= tmp << 16;
    DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
    DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
    high |= tmp << 16;
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_s16)
{
    uint32_t tmp;
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, int8_t);
    DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
    low |= tmp << 16;
    DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
    DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
    high |= tmp << 16;
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_u32)
{
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, uint16_t);
    DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_s32)
{
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, int16_t);
    DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_u64)
{
    DO_ABD(T0, T0, T1, uint32_t);
    T1 = 0;
}

NEON_OP(abdl_s64)
{
    DO_ABD(T0, T0, T1, int32_t);
    T1 = 0;
}
#undef DO_ABD

/* Widening multiply.  Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

NEON_OP(mull_u8)
{
    uint32_t tmp;