    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));
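    /*
     * This reads like the usual NEON fallback for a 64 x 64 -> 128-bit
     * carry-less (polynomial) multiply: with only the 8-bit vmull_p8
     * available, the product is assembled from partial products of 'a'
     * against byte-rotated copies of 'b' (and of rotated 'a' against 'b'),
     * which are then XOR-combined and byte-shifted into place. The two mask
     * constants above are applied further below, when the interleaved
     * partial sums are split ahead of the shifts.
     */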
    // Partial products of 'a' against byte-rotated copies of 'b', and of
    // byte-rotated 'a' against 'b'
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));
    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));
    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));
    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));
    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));
    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));
    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));
    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));
    // Sum the rotated partial products pairwise
    uint8x16_t l = veorq_u8(e, f);
    uint8x16_t m = veorq_u8(g, h);
    uint8x16_t n = veorq_u8(i, j);
#if defined(__aarch64__)
    // Interleave the 64-bit halves of (l, m) and (n, k)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
#if defined(__aarch64__)
    // De-interleave back into the four terms t0..t3
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Rotate each term left by one more byte than the last; the earlier
    // masking leaves the wrap-around bytes zero, so these act as byte shifts
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);

    // XOR everything together to form the 128-bit carry-less product
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
#if defined(__ARM_FEATURE_FRINT)
#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
#else
    float *f = (float *) &a;
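    /*
     * The fallback below emulates the default SSE rounding mode
     * (round-to-nearest, ties-to-even) without directed-rounding
     * conversions: each lane is biased by +/-0.5 (taking the lane's own
     * sign) and converted, a separate truncation is kept, and when the value
     * sits exactly halfway between two integers the nearest even integer is
     * selected instead of the biased result.
     */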
    uint32x4_t signmask = vdupq_n_u32(0x80000000);
    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                 vdupq_n_f32(0.5f)); /* +/- 0.5, sign of a */
    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
        vreinterpretq_f32_m128(a), half)); /* round by adding +/- 0.5 */
    int32x4_t r_trunc = vcvtq_s32_f32(
        vreinterpretq_f32_m128(a)); /* truncate toward zero */
    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 if r_trunc > 0 */
    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                 vdupq_n_s32(1)); /* nearest even integer */
    float32x4_t delta = vsubq_f32(
        vreinterpretq_f32_m128(a),
        vcvtq_f32_s32(r_trunc)); /* fractional part: a - trunc(a) */
    uint32x4_t is_delta_half =
        vceqq_f32(delta, half); /* exactly halfway between two integers? */
    return vreinterpretq_m128i_s32(
        vbslq_s32(is_delta_half, r_even, r_normal));

    // _MM_ROUND_DOWN: round toward negative infinity
    return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
                         floorf(f[0]));

    // _MM_ROUND_TOWARD_ZERO: truncate
    return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
                         (int32_t) f[0]);
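/*
 * The next block looks like the unsigned-by-signed byte multiply-add
 * (pmaddubsw semantics): each unsigned byte of the first operand is
 * multiplied by the corresponding signed byte of the second, and adjacent
 * 16-bit products are added with signed saturation. The AArch64 path widens
 * both halves and de-interleaves with vuzp1q/vuzp2q; the fallback separates
 * even and odd bytes by shifting and masking instead.
 */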
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // Zero-extend the unsigned bytes of a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign-extend the signed bytes of b by shifting left, then right
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // Multiply the even and odd byte lanes separately
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
    float32x4_t elementwise_prod = _mm_mul_ps(a, b);

#if defined(__aarch64__) || defined(_M_ARM64)
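    /*
     * As the code reads, bits 4-7 of imm select which lanes of the
     * element-wise product contribute to the dot product, and bits 0-3
     * select which output lanes receive the sum. The block below is an
     * AArch64 shortcut for the case where all four output lanes are
     * requested: lanes whose contribution bit is clear are zeroed first,
     * e.g. imm == 0x7f would leave lanes 0-2 and zero lane 3 before summing.
     */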
    if ((imm & 0x0F) == 0x0F) {
        if (!(imm & (1 << 4)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
        if (!(imm & (1 << 5)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
        if (!(imm & (1 << 6)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
        if (!(imm & (1 << 7)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
    if (imm & (1 << 4))
        s += vgetq_lane_f32(elementwise_prod, 0);
    if (imm & (1 << 5))
        s += vgetq_lane_f32(elementwise_prod, 1);
    if (imm & (1 << 6))
        s += vgetq_lane_f32(elementwise_prod, 2);
    if (imm & (1 << 7))
        s += vgetq_lane_f32(elementwise_prod, 3);
    // Broadcast the sum only to the output lanes selected by the low nibble
    const float32_t res[4] = {
        (imm & 0x1) ? s : 0.0f,
        (imm & 0x2) ? s : 0.0f,
        (imm & 0x4) ? s : 0.0f,
        (imm & 0x8) ? s : 0.0f,
    };
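/*
 * The fragment below matches the multiple-sum-of-absolute-differences
 * pattern (mpsadbw-style): imm bit 2 selects a 32-bit byte offset into the
 * first operand, imm bits 1:0 select which 32-bit block of the second
 * operand is broadcast, and eight overlapping 4-byte SADs are then formed
 * from widening absolute differences of byte-shifted copies of the first
 * operand followed by pairwise additions.
 */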
    switch (imm & 0x4) {
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#elif defined(_MSC_VER)
        __assume(0);
#endif
    }
    switch (imm & 0x3) {
    case 0:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
        break;
    case 1:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
        break;
    case 2:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
        break;
    case 3:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#elif defined(_MSC_VER)
        __assume(0);
#endif
        break;
    }
    // Widening absolute differences of b's low 8 bytes against a at byte
    // offsets 0..3
    int16x8_t c04, c15, c26, c37;
    uint8x8_t low_b = vget_low_u8(_b);
    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
#if defined(__aarch64__) || defined(_M_ARM64)
    // Pairwise add, then transpose 32-bit pairs to restore output order
    c04 = vpaddq_s16(c04, c26);
    c15 = vpaddq_s16(c15, c37);
    int32x4_t trn1_c =
        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    int32x4_t trn2_c =
        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
                                              vreinterpretq_s16_s32(trn2_c)));
#else
    int16x4_t c01, c23, c45, c67;
    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));

    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
#endif
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x10000 - (1 << la);
    int tb = 0x10000 - (1 << lb);
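    /*
     * m0, m1 and tb are 16-bit lane masks derived from the operand lengths:
     * m1 and tb mark lanes at or beyond la and lb respectively, and m0 marks
     * the lanes in between when la exceeds lb. Below they are expanded into
     * byte masks by testing each bit against the per-lane bit table
     * _sse2neon_cmpestr_mask8b and then used to override the match-matrix
     * bytes that correspond to out-of-range characters.
     */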
    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);

    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
    res_lo = vand_u8(res_lo, vec_mask);
    res_hi = vand_u8(res_hi, vec_mask);
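/*
 * The AArch64 block that follows reads like a table-free AES encryption
 * round helper: ShiftRows is a single vqtbl1q_u8 byte permutation
 * (shift_rows holds the permuted byte indices), and MixColumns is built from
 * the xtime step ((v << 1) XORed with 0x1b wherever the top bit was set,
 * i.e. multiplication by 2 in GF(2^8)) combined with 8- and 16-bit rotations
 * of each 32-bit column (ror32by8 and vrev32q_u16).
 */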
#if defined(__aarch64__) || defined(_M_ARM64)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    // ShiftRows as a byte permutation
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
    // MixColumns: xtime (multiply by 2 in GF(2^8)) plus column rotations
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// Multiply 'x' by 2 in GF(2^8), reducing by the AES polynomial 0x11b
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b))
// Multiply 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
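/*
 * SSE2NEON_AES_U0..U3 pack the four byte-rotated MixColumns column patterns;
 * for example SSE2NEON_AES_U0(p) is the little-endian word {2*p, p, p, 3*p}
 * over GF(2^8). Each row of aes_table is presumably that pattern applied to
 * every s-box output, giving the classic T-tables that fold SubBytes and
 * MixColumns into a single lookup per byte.
 */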
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3
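    /*
     * Each 32-bit output word XORs four table lookups, one per input byte,
     * with the byte selection across x0..x3 following the AES ShiftRows
     * pattern.
     */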
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    // Inverse ShiftRows via a single byte permutation
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
    // Inverse MixColumns: two xtime steps give 4*v in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // Remaining steps match the forward MixColumns
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
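    /*
     * Portable fallback: the state is handled as a 4x4 byte matrix v. The
     * 16-iteration loop presumably applies the inverse SubBytes/ShiftRows
     * byte by byte, and the 4-iteration loop recombines each column for the
     * inverse MixColumns.
     */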
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {

    for (i = 0; i < 4; ++i) {
#if defined(__aarch64__)
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
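    /*
     * Inverse MixColumns without multiplies: v is doubled twice with the
     * xtime step, giving 4*v in GF(2^8); that is folded back into v together
     * with a 16-bit rotation of each column, and the remaining three steps
     * are identical to the forward MixColumns above.
     */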
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    uint8_t i, e, f, g, h, v[4][4];
    for (i = 0; i < 4; ++i) {