[openal] [PATCH] Add SSE version of Resample_lerp32

Sun Jun 1 06:03:04 EDT 2014

This reduces the CPU time spent in the function by around 15% during the OpenArena benchmark of the Phoronix Test Suite. Also, if the -msse4.1 switch is used when building OpenAL, the time is reduced by around 43%. I think this is because _mm_extract_epi32() is being used with SSE4.1 to extract the position without needing shuffling.

This reduces the total time spent in OpenAL during the benchmark from 14.2% to 13.35% for SSE2, or to 11.78% for SSE4.1.

These percentages were gathered with callgrind.
---
 Alc/ALu.c        |  4 ++++
 Alc/mixer_defs.h |  4 ++++
 Alc/mixer_sse.c  | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)

diff --git a/Alc/ALu.c b/Alc/ALu.c
index 82932cf..1c5cf5d 100644
--- a/Alc/ALu.c
+++ b/Alc/ALu.c
@@ -87,6 +87,10 @@ static ResamplerFunc SelectResampler(enum Resampler Resampler, ALuint increment)
         case PointResampler:
             return Resample_point32_C;
         case LinearResampler:
+#ifdef HAVE_SSE
+            if((CPUCapFlags&CPU_CAP_SSE))
+                return Resample_lerp32_SSE;
+#endif
             return Resample_lerp32_C;
         case CubicResampler:
             return Resample_cubic32_C;
diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h
index 04fd1f5..67176aa 100644
--- a/Alc/mixer_defs.h
+++ b/Alc/mixer_defs.h
@@ -42,6 +42,10 @@ void MixSend_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
                  struct MixGainMono *Gain, ALuint Counter, ALuint OutPos,
                  ALuint BufferSize);
 
+/* SSE resamplers: linear interpolation, four samples per vector iteration */
+const ALfloat *Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment,
+                                   ALfloat *restrict dst, ALuint numsamples);
+
 /* Neon mixers */
 void MixDirect_Hrtf_Neon(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
                          ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index c4e1fdf..1f0ab95 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -138,6 +138,71 @@ static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
 #undef SUFFIX
 
 
+/* Linear-interpolating resampler: four output samples per SSE iteration, then
+ * a scalar loop for the remainder. frac is the initial fixed-point fraction,
+ * increment the fixed-point step per output sample. dst must be 16-byte
+ * aligned (aligned store). NOTE(review): the integer intrinsics are SSE2, so
+ * confirm a CPU_CAP_SSE-only dispatch check is sufficient. Returns dst. */
+const ALfloat *Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment,
+                                   ALfloat *restrict dst, ALuint numsamples)
+{
+    ALuint i, pos;
+    /* Source offset and fraction of each of the first four output samples. */
+    ALuint pos0_tmp = 0;
+    ALuint frac0_tmp = frac;
+    ALuint frac0_incr = frac0_tmp + increment;
+    ALuint pos1_tmp = frac0_incr>>FRACTIONBITS;
+    ALuint frac1_tmp = (frac0_incr & FRACTIONMASK);
+    ALuint frac1_incr = frac1_tmp + increment;
+    ALuint pos2_tmp = pos1_tmp + (frac1_incr>>FRACTIONBITS);
+    ALuint frac2_tmp = (frac1_incr & FRACTIONMASK);
+    ALuint frac2_incr = frac2_tmp + increment;
+    ALuint pos3_tmp = pos2_tmp + (frac2_incr>>FRACTIONBITS);
+    ALuint frac3_tmp = (frac2_incr & FRACTIONMASK);
+
+    __m128i frac4 = _mm_set_epi32(frac3_tmp, frac2_tmp, frac1_tmp, frac0_tmp);
+    __m128i pos4 = _mm_set_epi32(pos3_tmp, pos2_tmp, pos1_tmp, pos0_tmp);
+    const __m128i increment4 = _mm_set1_epi32(increment*4);
+    const __m128 fracOne4 = _mm_set1_ps(1.0f/FRACTIONONE);
+    const __m128i fracMask4 = _mm_set1_epi32(FRACTIONMASK);
+
+    /* Compare as numsamples-i > 3: the unsigned expression numsamples-3 would
+     * wrap for numsamples < 3 and run the vector loop out of bounds. */
+    for(i = 0;numsamples-i > 3;i += 4)
+    {
+        __m128 val1 = _mm_set_ps(src[pos3_tmp], src[pos2_tmp], src[pos1_tmp], src[pos0_tmp]);
+        __m128 val2 = _mm_set_ps(src[pos3_tmp+1], src[pos2_tmp+1], src[pos1_tmp+1], src[pos0_tmp+1]);
+        /* val1 + (val2-val1)*mu */
+        const __m128 r0 = _mm_sub_ps(val2, val1);
+        const __m128 mu = _mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4);
+        const __m128 r1 = _mm_mul_ps(mu, r0);
+        const __m128 out = _mm_add_ps(val1, r1);
+        _mm_store_ps(&dst[i], out);
+        /* Step every lane by four samples; carry whole steps into pos4. */
+        frac4 = _mm_add_epi32(frac4, increment4);
+        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, FRACTIONBITS));
+        frac4 = _mm_and_si128(frac4, fracMask4);
+
+        pos0_tmp = _mm_cvtsi128_si32(pos4);
+        pos1_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(1,1,1,1)));
+        pos2_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(2,2,2,2)));
+        pos3_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(3,3,3,3)));
+    }
+
+    /* Lane 0 now holds the position/fraction of the next sample to produce. */
+    pos = pos0_tmp;
+    frac = _mm_cvtsi128_si32(frac4);
+    for(;i < numsamples;i++)
+    {
+        dst[i] = lerp(src[pos], src[pos+1], frac * (1.0f/FRACTIONONE));
+        frac += increment;
+        pos  += frac>>FRACTIONBITS;
+        frac &= FRACTIONMASK;
+    }
+    return dst;
+}
+
+
 void MixDirect_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
                    MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
 {
-- 
1.9.0