[openal] [PATCH] Add SSE version of Resample_lerp32

Sun Jun 1 06:19:56 EDT 2014

I'm open to suggestions to improve this further. This is my first time
using SSE, so it's very possible I haven't done this the best way. Also,
one thing I was worried about is using _mm_cvtepi32_ps() to convert
'frac' from an integer to a float, as it's meant to be used on signed
integers. Is it likely that this value will ever be so large that this
will actually matter?

On Sun, 2014-06-01 at 20:03 +1000, Timothy Arceri wrote:
> This reduces the CPU time spent in the function by around 15% during the OpenArena benchmark of the Phoronix Test Suite. Also, if the -msse4.1 switch is used when building OpenAL, the time is reduced by around 43%. I think this is because, with SSE4.1, _mm_extract_epi32() is used to extract the position without needing any shuffling.
> 
> This reduces the total time spent in OpenAL during the benchmark from 14.2% to 13.35% for SSE2, or 11.78% for SSE4.1.
> 
> These percentages were gathered with callgrind.
> ---
>  Alc/ALu.c        |  4 ++++
>  Alc/mixer_defs.h |  4 ++++
>  Alc/mixer_sse.c  | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 73 insertions(+)
> 
> diff --git a/Alc/ALu.c b/Alc/ALu.c
> index 82932cf..1c5cf5d 100644
> --- a/Alc/ALu.c
> +++ b/Alc/ALu.c
> @@ -87,6 +87,10 @@ static ResamplerFunc SelectResampler(enum Resampler Resampler, ALuint increment)
>          case PointResampler:
>              return Resample_point32_C;
>          case LinearResampler:
> +#ifdef HAVE_SSE
> +            if((CPUCapFlags&CPU_CAP_SSE))
> +                return Resample_lerp32_SSE;
> +#endif
>              return Resample_lerp32_C;
>          case CubicResampler:
>              return Resample_cubic32_C;
> diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h
> index 04fd1f5..67176aa 100644
> --- a/Alc/mixer_defs.h
> +++ b/Alc/mixer_defs.h
> @@ -42,6 +42,10 @@ void MixSend_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
>                   struct MixGainMono *Gain, ALuint Counter, ALuint OutPos,
>                   ALuint BufferSize);
>  
> +/* SSE resamplers */
> +const ALfloat *Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment,
> +                                   ALfloat *restrict dst, ALuint numsamples);
> +
>  /* Neon mixers */
>  void MixDirect_Hrtf_Neon(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
>                           ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
> diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
> index c4e1fdf..1f0ab95 100644
> --- a/Alc/mixer_sse.c
> +++ b/Alc/mixer_sse.c
> @@ -138,6 +138,71 @@ static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
>  #undef SUFFIX
>  
> 
> +const ALfloat *Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment,
> +                                   ALfloat *restrict dst, ALuint numsamples)
> +{
> +    ALuint i;
> +    ALuint pos;
> +    ALuint pos0_tmp = 0;
> +    ALuint frac0_tmp = frac;
> +
> +    ALuint frac0_incr = frac0_tmp + increment;
> +    ALuint pos1_tmp = frac0_incr>>FRACTIONBITS;
> +    ALuint frac1_tmp = (frac0_incr & FRACTIONMASK);
> +
> +    ALuint frac1_incr = frac1_tmp + increment;
> +    ALuint pos2_tmp = pos1_tmp + (frac1_incr>>FRACTIONBITS);
> +    ALuint frac2_tmp = (frac1_incr & FRACTIONMASK);
> +
> +    ALuint frac2_incr = frac2_tmp + increment;
> +    ALuint pos3_tmp = pos2_tmp + (frac2_incr>>FRACTIONBITS);
> +    ALuint frac3_tmp = (frac2_incr & FRACTIONMASK);
> +
> +    __m128i frac4 = _mm_set_epi32(frac3_tmp, frac2_tmp, frac1_tmp, frac0_tmp);
> +    const __m128i increment4 = _mm_set1_epi32(increment*4);
> +    const __m128 fracOne4 = _mm_set1_ps(1.0f/FRACTIONONE);
> +    const __m128i fracMask4 = _mm_set1_epi32(FRACTIONMASK);
> +
> +    __m128i pos4 = _mm_set_epi32(pos3_tmp, pos2_tmp, pos1_tmp, pos0_tmp);
> +
> +    for(i = 0;i < numsamples-3;i += 4)
> +    {
> +        __m128 val1 = _mm_set_ps(src[pos3_tmp], src[pos2_tmp], src[pos1_tmp], src[pos0_tmp]);
> +        __m128 val2 = _mm_set_ps(src[pos3_tmp+1], src[pos2_tmp+1], src[pos1_tmp+1], src[pos0_tmp+1]);
> +
> +        /* val1 + (val2-val1)*mu */
> +        const __m128 r0 = _mm_sub_ps(val2, val1);
> +        const __m128 mu = _mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4);
> +        const __m128 r1 = _mm_mul_ps(mu, r0);
> +        const __m128 out = _mm_add_ps(val1, r1);
> +
> +        _mm_store_ps(&dst[i], out);
> +
> +        frac4 = _mm_add_epi32(frac4, increment4);
> +        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, FRACTIONBITS));
> +        frac4 = _mm_and_si128(frac4, fracMask4);
> +
> +        pos0_tmp = _mm_cvtsi128_si32(pos4);
> +        pos1_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(1,1,1,1)));
> +        pos2_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(2,2,2,2)));
> +        pos3_tmp = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(3,3,3,3)));
> +    }
> +
> +    pos = pos0_tmp;
> +    frac = _mm_cvtsi128_si32(frac4);
> +
> +    for(;i < numsamples;i++)
> +    {
> +        dst[i] = lerp(src[pos], src[pos+1], frac * (1.0f/FRACTIONONE));
> +
> +        frac += increment;
> +        pos  += frac>>FRACTIONBITS;
> +        frac &= FRACTIONMASK;
> +    }
> +    return dst;
> +}
> +
> +
>  void MixDirect_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
>                     MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
>  {