Merge "Supported FMA Intrinsics in Audio Resampler FIR processing module"

Author: gugelfrei
Committed by: Treehugger Robot via Gerrit Code Review (4 years ago)
Commit: a2fa90baca

@ -19,6 +19,25 @@ cc_defaults {
// uncomment to disable NEON on architectures that actually do support NEON, for benchmarking
// "-DUSE_NEON=false",
],
arch: {
x86: {
// Extra flags applied only when the "avx2" arch variant is enabled for
// the x86 target; -mfma allows the compiler (and the resampler's
// intrinsics) to emit fused multiply-add instructions.
avx2: {
cflags: [
"-mavx2",
"-mfma",
],
},
},
x86_64: {
// Same AVX2/FMA flags for the 64-bit x86 target.
avx2: {
cflags: [
"-mavx2",
"-mfma",
],
},
},
},
}
cc_library_shared {

@ -36,13 +36,20 @@ namespace android {
#include <arm_neon.h>
#endif
// Select the x86 SIMD intrinsics level. AVX2 (which implies FMA here, see the
// build flags) takes priority; otherwise fall back to SSSE3; otherwise scalar.
#if defined(__AVX2__) // Should be supported in x86 ABI for both 32 & 64-bit.
#define USE_AVX2 (true) // Inference AVX2/FMA Intrinsics
#define USE_SSE (true)
#include <immintrin.h>
#elif defined(__SSSE3__) // Should be supported in x86 ABI for both 32 & 64-bit.
#define USE_SSE (true) // Inference SSE Intrinsics
#define USE_AVX2 (false)
#include <tmmintrin.h>
#else
#define USE_SSE (false)
// Note: space before "(false)" is required; "USE_AVX2(false)" would declare a
// function-like macro and leave "#if USE_AVX2" evaluating an undefined name.
#define USE_AVX2 (false)
#endif
template<typename T, typename U>
struct is_same
{

@ -80,11 +80,16 @@ static inline void ProcessSSEIntrinsic(float* out,
// Linear interpolation between adjacent coefficient vectors:
// result = base + interp * (next - base), computed for both the positive
// and negative filter halves (presumably a polyphase FIR — the phase
// fraction lives in 'interp').
posCoef1 = _mm_sub_ps(posCoef1, posCoef);
negCoef = _mm_sub_ps(negCoef, negCoef1);
#if USE_AVX2
// Fused multiply-add: one instruction, single rounding step.
posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
#else
// Separate multiply + add when FMA is unavailable (two rounding steps,
// so results may differ from the FMA path in the last ulp).
posCoef1 = _mm_mul_ps(posCoef1, interp);
negCoef = _mm_mul_ps(negCoef, interp);
posCoef = _mm_add_ps(posCoef1, posCoef);
negCoef = _mm_add_ps(negCoef, negCoef1);
#endif //USE_AVX2
}
switch (CHANNELS) {
case 1: {
@ -94,11 +99,17 @@ static inline void ProcessSSEIntrinsic(float* out,
sN += 4;
// 0x1B selects elements 3,2,1,0 — reverses the positive-side sample
// vector so it lines up with the coefficient ordering.
posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
#if USE_AVX2
// Multiply-accumulate both filter halves into the mono accumulator
// with fused multiply-add.
accL = _mm_fmadd_ps(posSamp, posCoef, accL);
accL = _mm_fmadd_ps(negSamp, negCoef, accL);
#else
posSamp = _mm_mul_ps(posSamp, posCoef);
negSamp = _mm_mul_ps(negSamp, negCoef);
accL = _mm_add_ps(accL, posSamp);
accL = _mm_add_ps(accL, negSamp);
#endif
} break;
case 2: {
__m128 posSamp0 = _mm_loadu_ps(sP);
@ -114,15 +125,23 @@ static inline void ProcessSSEIntrinsic(float* out,
__m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
__m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
// Multiply-accumulate the deinterleaved L/R sample vectors against the
// interpolated coefficients. The conditional below must be the ONLY
// accumulate path: the unconditional multiply statements before it and the
// add statements after it (pre-image leftovers) would apply each coefficient
// twice and fold every product into the accumulators twice.
#if USE_AVX2
// Fused multiply-add: one instruction per accumulate, single rounding.
accL = _mm_fmadd_ps(posSampL, posCoef, accL);
accR = _mm_fmadd_ps(posSampR, posCoef, accR);
accL = _mm_fmadd_ps(negSampL, negCoef, accL);
accR = _mm_fmadd_ps(negSampR, negCoef, accR);
#else
// SSE fallback: explicit multiply, then add into the accumulators.
posSampL = _mm_mul_ps(posSampL, posCoef);
posSampR = _mm_mul_ps(posSampR, posCoef);
negSampL = _mm_mul_ps(negSampL, negCoef);
negSampR = _mm_mul_ps(negSampR, negCoef);
accL = _mm_add_ps(accL, posSampL);
accR = _mm_add_ps(accR, posSampR);
accL = _mm_add_ps(accL, negSampL);
accR = _mm_add_ps(accR, negSampR);
#endif
} break;
}
} while (count -= 4);
@ -144,9 +163,13 @@ static inline void ProcessSSEIntrinsic(float* out,
// Horizontally reduce the two 4-lane accumulators into an (L, R) pair in
// the low two lanes of outAccum.
outAccum = _mm_hadd_ps(accL, accR);
outAccum = _mm_hadd_ps(outAccum, outAccum);
}
#if USE_AVX2
// out += volume * accum in a single fused multiply-add.
outSamp = _mm_fmadd_ps(outAccum, vLR, outSamp);
#else
outAccum = _mm_mul_ps(outAccum, vLR);
outSamp = _mm_add_ps(outSamp, outAccum);
#endif
// Store the low two floats (the L/R result) back to the output buffer.
_mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
}

Loading…
Cancel
Save