Skip to content

Commit fe8c9bf

Browse files
authored
Optimize _mm_mulhi_epi16/_mm_mulhi_epu16 (emscripten-core#14693)
Use the extended multiplication (extmul) instructions from the final WebAssembly SIMD specification to emulate these intrinsics.
1 parent 4d03955 commit fe8c9bf

2 files changed

Lines changed: 6 additions & 10 deletions

File tree

site/source/docs/porting/simd.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -593,9 +593,9 @@ The following table highlights the availability and expected performance of diff
593593
* - _mm_mul_sd
594594
- ⚠️ emulated with a shuffle
595595
* - _mm_mulhi_epi16
596-
- ⚠️ emulated with a SIMD four widen+two mul+generic shuffle
596+
- ⚠️ emulated with a 2x SIMD extmul+generic shuffle
597597
* - _mm_mulhi_epu16
598-
- ⚠️ emulated with a SIMD four widen+two mul+generic shuffle
598+
- ⚠️ emulated with a 2x SIMD extmul+generic shuffle
599599
* - _mm_mullo_epi16
600600
- ✅ wasm_i16x8_mul
601601
* - _mm_or_pd

system/include/compat/emmintrin.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -678,20 +678,16 @@ _mm_min_epu8(__m128i __a, __m128i __b)
678678
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679679
_mm_mulhi_epi16(__m128i __a, __m128i __b)
680680
{
681-
const v128_t lo = wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8((v128_t)__a),
682-
wasm_i32x4_widen_low_i16x8((v128_t)__b));
683-
const v128_t hi = wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8((v128_t)__a),
684-
wasm_i32x4_widen_high_i16x8((v128_t)__b));
681+
const v128_t lo = wasm_i32x4_extmul_low_i16x8((v128_t)__a, (v128_t)__b);
682+
const v128_t hi = wasm_i32x4_extmul_high_i16x8((v128_t)__a, (v128_t)__b);
685683
return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15);
686684
}
687685

688686
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
689687
_mm_mulhi_epu16(__m128i __a, __m128i __b)
690688
{
691-
const v128_t lo = wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8((v128_t)__a),
692-
wasm_u32x4_extend_low_u16x8((v128_t)__b));
693-
const v128_t hi = wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8((v128_t)__a),
694-
wasm_u32x4_extend_high_u16x8((v128_t)__b));
689+
const v128_t lo = wasm_u32x4_extmul_low_u16x8((v128_t)__a, (v128_t)__b);
690+
const v128_t hi = wasm_u32x4_extmul_high_u16x8((v128_t)__a, (v128_t)__b);
695691
return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15);
696692
}
697693

0 commit comments

Comments
 (0)