[openal] [PATCH] Add SSE version of Resample_lerp32
Timothy Arceri
t_arceri at yahoo.com.au
Sun Jun 1 06:03:04 EDT 2014
This reduces the CPU time spent in the function by around 15% during the OpenArena benchmark of the Phoronix Test Suite. Also, if the -msse4.1 switch is used when building OpenAL, the time is reduced by around 43%. I think this is because, with SSE4.1 enabled, _mm_extract_epi32() is used to extract the positions without needing any shuffling.
This reduces the total time spent in OpenAL during the benchmark from 14.2% to 13.35% with SSE2, or to 11.78% with SSE4.1.
These percentages were gathered with Callgrind.
---
Alc/ALu.c | 4 ++++
Alc/mixer_defs.h | 4 ++++
Alc/mixer_sse.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 73 insertions(+)
diff --git a/Alc/ALu.c b/Alc/ALu.c
index 82932cf..1c5cf5d 100644
--- a/Alc/ALu.c
+++ b/Alc/ALu.c
@@ -87,6 +87,10 @@ static ResamplerFunc SelectResampler(enum Resampler Resampler, ALuint increment)
case PointResampler:
return Resample_point32_C;
case LinearResampler:
+#ifdef HAVE_SSE
+ if((CPUCapFlags&CPU_CAP_SSE))
+ return Resample_lerp32_SSE;
+#endif
return Resample_lerp32_C;
case CubicResampler:
return Resample_cubic32_C;
diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h
index 04fd1f5..67176aa 100644
--- a/Alc/mixer_defs.h
+++ b/Alc/mixer_defs.h
@@ -42,6 +42,10 @@ void MixSend_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
struct MixGainMono *Gain, ALuint Counter, ALuint OutPos,
ALuint BufferSize);
+/* SSE resamplers */
+/* Linear-interpolation resampler: writes numsamples output samples to dst,
+ * reading from src starting at fixed-point fraction frac and stepping by
+ * increment per output sample. Returns dst. */
+const ALfloat *Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment,
+ ALfloat *restrict dst, ALuint numsamples);
+
/* Neon mixers */
void MixDirect_Hrtf_Neon(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index c4e1fdf..1f0ab95 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -138,6 +138,71 @@ static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
#undef SUFFIX
+/* Resample_lerp32_SSE
+ *
+ * Linearly-interpolating resampler that produces four output samples per
+ * iteration using SSE/SSE2 intrinsics, then finishes any remaining samples
+ * (fewer than four) with a scalar loop.
+ *
+ * src        - input samples; src[pos] and src[pos+1] are read for every
+ *              output sample, where pos starts at 0 and advances by the
+ *              integer part of the accumulated fixed-point step.
+ * frac       - initial fractional offset (FRACTIONBITS fixed-point).
+ * increment  - fixed-point step applied per output sample.
+ * dst        - output buffer receiving numsamples values. It is written
+ *              with _mm_store_ps, which requires 16-byte alignment.
+ *              NOTE(review): confirm every caller guarantees this.
+ * numsamples - number of output samples to generate; may be 0.
+ *
+ * Returns dst.
+ */
+const ALfloat *Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment,
+                                   ALfloat *restrict dst, ALuint numsamples)
+{
+    ALuint i;
+    ALuint pos;
+
+    /* Pre-compute the position and fraction of the first four output
+     * samples by stepping the scalar recurrence three times. */
+    ALuint pos0_tmp = 0;
+    ALuint frac0_tmp = frac;
+
+    ALuint frac0_incr = frac0_tmp + increment;
+    ALuint pos1_tmp = frac0_incr>>FRACTIONBITS;
+    ALuint frac1_tmp = (frac0_incr & FRACTIONMASK);
+
+    ALuint frac1_incr = frac1_tmp + increment;
+    ALuint pos2_tmp = pos1_tmp + (frac1_incr>>FRACTIONBITS);
+    ALuint frac2_tmp = (frac1_incr & FRACTIONMASK);
+
+    ALuint frac2_incr = frac2_tmp + increment;
+    ALuint pos3_tmp = pos2_tmp + (frac2_incr>>FRACTIONBITS);
+    ALuint frac3_tmp = (frac2_incr & FRACTIONMASK);
+
+    /* Lane n holds the state of output sample i+n. Each lane advances by
+     * four steps per iteration, hence increment*4. */
+    __m128i frac4 = _mm_set_epi32(frac3_tmp, frac2_tmp, frac1_tmp, frac0_tmp);
+    const __m128i increment4 = _mm_set1_epi32(increment*4);
+    const __m128 fracOne4 = _mm_set1_ps(1.0f/FRACTIONONE);
+    const __m128i fracMask4 = _mm_set1_epi32(FRACTIONMASK);
+
+    __m128i pos4 = _mm_set_epi32(pos3_tmp, pos2_tmp, pos1_tmp, pos0_tmp);
+
+    /* Test the remaining count rather than "i < numsamples-3": ALuint is
+     * unsigned, so numsamples-3 wraps around when numsamples < 3 and the
+     * loop would run far past the end of both src and dst. numsamples-i
+     * cannot wrap because i never exceeds numsamples here. */
+    for(i = 0;numsamples-i > 3;i += 4)
+    {
+        /* Gather the four sample pairs to interpolate between. */
+        __m128 val1 = _mm_set_ps(src[pos3_tmp], src[pos2_tmp], src[pos1_tmp], src[pos0_tmp]);
+        __m128 val2 = _mm_set_ps(src[pos3_tmp+1], src[pos2_tmp+1], src[pos1_tmp+1], src[pos0_tmp+1]);
+
+        /* val1 + (val2-val1)*mu */
+        const __m128 r0 = _mm_sub_ps(val2, val1);
+        const __m128 mu = _mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4);
+        const __m128 r1 = _mm_mul_ps(mu, r0);
+        const __m128 out = _mm_add_ps(val1, r1);
+
+        _mm_store_ps(&dst[i], out);
+
+        /* Advance all four lanes: add the step, carry the integer part
+         * into the position, and keep only the fractional bits. */
+        frac4 = _mm_add_epi32(frac4, increment4);
+        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, FRACTIONBITS));
+        frac4 = _mm_and_si128(frac4, fracMask4);
+
+        /* Extract the per-lane positions for the next iteration's loads. */
+        pos0_tmp = _mm_cvtsi128_si32(pos4);
+        pos1_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(1,1,1,1)));
+        pos2_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(2,2,2,2)));
+        pos3_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(3,3,3,3)));
+    }
+
+    /* Lane 0 carries the state of the next sample to produce; continue
+     * from it with the scalar recurrence for the leftover samples. */
+    pos = pos0_tmp;
+    frac = _mm_cvtsi128_si32(frac4);
+
+    for(;i < numsamples;i++)
+    {
+        dst[i] = lerp(src[pos], src[pos+1], frac * (1.0f/FRACTIONONE));
+
+        frac += increment;
+        pos += frac>>FRACTIONBITS;
+        frac &= FRACTIONMASK;
+    }
+    return dst;
+}
+
+
void MixDirect_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
{
--
1.9.0
More information about the openal
mailing list