VectorMath.cpp

/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 * Copyright (C) 2020, Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "AudioUtilities.h"
#include "VectorMath.h"

#if USE(ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86_SSE2)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <math.h>

namespace WebCore {

namespace VectorMath {

#if USE(ACCELERATE)
// On Apple platforms we use the highly optimized implementations in Accelerate.framework.

void multiplyByScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
    vDSP_vsmul(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
}

void add(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
    vDSP_vadd(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}

void addScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
    vDSP_vsadd(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
}

void multiply(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
    vDSP_vmul(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}

void multiplyComplex(const float* realVector1, const float* imagVector1, const float* realVector2, const float* imagVector2, float* realOutputVector, float* imagOutputVector, size_t numberOfElementsToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(realVector1);
    sc1.imagp = const_cast<float*>(imagVector1);
    sc2.realp = const_cast<float*>(realVector2);
    sc2.imagp = const_cast<float*>(imagVector2);
    dest.realp = realOutputVector;
    dest.imagp = imagOutputVector;
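    // The final argument (1) selects ordinary complex multiplication rather than conjugate multiplication (-1).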
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, numberOfElementsToProcess, 1);
}

void multiplyByScalarThenAddToOutput(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
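    // vDSP_vsma computes D[n] = A[n] * b + C[n]; passing outputVector as both C and D accumulates in place.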
    vDSP_vsma(inputVector, 1, &scalar, outputVector, 1, outputVector, 1, numberOfElementsToProcess);
}

void multiplyByScalarThenAddToVector(const float* inputVector1, float scalar, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
    vDSP_vsma(inputVector1, 1, &scalar, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}

void addVectorsThenMultiplyByScalar(const float* inputVector1, const float* inputVector2, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
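    // vDSP_vasm computes D[n] = (A[n] + B[n]) * c in a single pass.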
    vDSP_vasm(inputVector1, 1, inputVector2, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
}

float maximumMagnitude(const float* inputVector, size_t numberOfElementsToProcess)
{
    float maximumValue = 0;
    vDSP_maxmgv(inputVector, 1, &maximumValue, numberOfElementsToProcess);
    return maximumValue;
}

float sumOfSquares(const float* inputVector, size_t numberOfElementsToProcess)
{
    float sum = 0;
    vDSP_svesq(const_cast<float*>(inputVector), 1, &sum, numberOfElementsToProcess);
    return sum;
}

void clamp(const float* inputVector, float minimum, float maximum, float* outputVector, size_t numberOfElementsToProcess)
{
    vDSP_vclip(const_cast<float*>(inputVector), 1, &minimum, &maximum, outputVector, 1, numberOfElementsToProcess);
}

void linearToDecibels(const float* inputVector, float* outputVector, size_t numberOfElementsToProcess)
{
    float reference = 1;
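    // With a flag of 1, vDSP_vdbcon treats the input as amplitude: outputVector[i] = 20 * log10(inputVector[i] / reference).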
    vDSP_vdbcon(inputVector, 1, &reference, outputVector, 1, numberOfElementsToProcess, 1);
}

#else

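// Returns true when the address is a multiple of 16, which the aligned SSE load/store intrinsics require.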
static inline bool is16ByteAligned(const float* vector)
{
    return !(reinterpret_cast<uintptr_t>(vector) & 0x0F);
}

void multiplyByScalarThenAddToVector(const float* inputVector1, float scalar, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
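    // outputVector = inputVector1 * scalar, then outputVector += inputVector2, matching the single vDSP_vsma call in the Accelerate path above.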
    multiplyByScalar(inputVector1, scalar, outputVector, numberOfElementsToProcess);
    add(outputVector, inputVector2, outputVector, numberOfElementsToProcess);
}

void multiplyByScalarThenAddToOutput(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;

#if CPU(X86_SSE2)
    // If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector) && n) {
        *outputVector += scalar * *inputVector;
        inputVector++;
        outputVector++;
        n--;
    }

    // Now the inputVector is aligned; use SSE.
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;

    __m128 pSource;
    __m128 dest;
    __m128 temp;
    __m128 mScale = _mm_set_ps1(scalar);

    bool destAligned = is16ByteAligned(outputVector);

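    // inputVector is 16-byte aligned at this point, so _mm_load_ps is safe; outputVector may not be, so the macro is expanded with either aligned or unaligned load/store for it.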
#define SSE2_MULT_ADD(loadInstr, storeInstr)       \
    while (outputVector < endP)                    \
    {                                              \
        pSource = _mm_load_ps(inputVector);        \
        temp = _mm_mul_ps(pSource, mScale);        \
        dest = _mm_##loadInstr##_ps(outputVector); \
        dest = _mm_add_ps(dest, temp);             \
        _mm_##storeInstr##_ps(outputVector, dest); \
        inputVector += 4;                          \
        outputVector += 4;                         \
    }

    if (destAligned)
        SSE2_MULT_ADD(load, store)
    else
        SSE2_MULT_ADD(loadu, storeu)

    n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;

    float32x4_t k = vdupq_n_f32(scalar);
    while (outputVector < endP) {
        float32x4_t source = vld1q_f32(inputVector);
        float32x4_t dest = vld1q_f32(outputVector);

        dest = vmlaq_f32(dest, source, k);
        vst1q_f32(outputVector, dest);

        inputVector += 4;
        outputVector += 4;
    }
    n = tailFrames;
#endif
    while (n--) {
        *outputVector += *inputVector * scalar;
        ++inputVector;
        ++outputVector;
    }
}

void multiplyByScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;

#if CPU(X86_SSE2)
    // If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector) && n) {
        *outputVector = scalar * *inputVector;
        inputVector++;
        outputVector++;
        n--;
    }

    // Now the inputVector address is aligned; apply SSE.
    size_t group = n / 4;
    __m128 mScale = _mm_set_ps1(scalar);
    __m128* pSource;
    __m128* pDest;
    __m128 dest;

    if (!is16ByteAligned(outputVector)) {
        while (group--) {
            pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
            dest = _mm_mul_ps(*pSource, mScale);
            _mm_storeu_ps(outputVector, dest);

            inputVector += 4;
            outputVector += 4;
        }
    } else {
        while (group--) {
            pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
            pDest = reinterpret_cast<__m128*>(outputVector);
            *pDest = _mm_mul_ps(*pSource, mScale);

            inputVector += 4;
            outputVector += 4;
        }
    }

    // Non-SSE handling for the remaining frames (fewer than 4).
    n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;

    while (outputVector < endP) {
        float32x4_t source = vld1q_f32(inputVector);
        vst1q_f32(outputVector, vmulq_n_f32(source, scalar));

        inputVector += 4;
        outputVector += 4;
    }
    n = tailFrames;
#endif
    while (n--) {
        *outputVector = scalar * *inputVector;
        ++inputVector;
        ++outputVector;
    }
}

void addScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;

#if CPU(X86_SSE2)
    // If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector) && n) {
        *outputVector = *inputVector + scalar;
        inputVector++;
        outputVector++;
        n--;
    }

    // Now the inputVector address is aligned; apply SSE.
    size_t group = n / 4;
    __m128 mScalar = _mm_set_ps1(scalar);
    __m128* pSource;
    __m128* pDest;
    __m128 dest;

    bool destAligned = is16ByteAligned(outputVector);
    if (destAligned) { // all aligned
        while (group--) {
            pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
            pDest = reinterpret_cast<__m128*>(outputVector);
            *pDest = _mm_add_ps(*pSource, mScalar);

            inputVector += 4;
            outputVector += 4;
        }
    } else {
        while (group--) {
            pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
            dest = _mm_add_ps(*pSource, mScalar);
            _mm_storeu_ps(outputVector, dest);

            inputVector += 4;
            outputVector += 4;
        }
    }

    // Non-SSE handling for the remaining frames (fewer than 4).
    n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;
    float32x4_t scalarVector = vdupq_n_f32(scalar);

    while (outputVector < endP) {
        float32x4_t source = vld1q_f32(inputVector);
        vst1q_f32(outputVector, vaddq_f32(source, scalarVector));

        inputVector += 4;
        outputVector += 4;
    }
    n = tailFrames;
#endif
    while (n--) {
        *outputVector = *inputVector + scalar;
        ++inputVector;
        ++outputVector;
    }
}

void add(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;

#if CPU(X86_SSE2)
    // If the inputVector1 address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector1) && n) {
        *outputVector = *inputVector1 + *inputVector2;
        inputVector1++;
        inputVector2++;
        outputVector++;
        n--;
    }

    // Now the inputVector1 address is aligned; apply SSE.
    size_t group = n / 4;
    __m128* pSource1;
    __m128* pSource2;
    __m128* pDest;
    __m128 source2;
    __m128 dest;

    bool source2Aligned = is16ByteAligned(inputVector2);
    bool destAligned = is16ByteAligned(outputVector);

    if (source2Aligned && destAligned) { // all aligned
        while (group--) {
            pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
            pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
            pDest = reinterpret_cast<__m128*>(outputVector);
            *pDest = _mm_add_ps(*pSource1, *pSource2);

            inputVector1 += 4;
            inputVector2 += 4;
            outputVector += 4;
        }

    } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
        while (group--) {
            pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
            pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
            dest = _mm_add_ps(*pSource1, *pSource2);
            _mm_storeu_ps(outputVector, dest);

            inputVector1 += 4;
            inputVector2 += 4;
            outputVector += 4;
        }

    } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
        while (group--) {
            pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
            source2 = _mm_loadu_ps(inputVector2);
            pDest = reinterpret_cast<__m128*>(outputVector);
            *pDest = _mm_add_ps(*pSource1, source2);

            inputVector1 += 4;
            inputVector2 += 4;
            outputVector += 4;
        }
    } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
        while (group--) {
            pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
            source2 = _mm_loadu_ps(inputVector2);
            dest = _mm_add_ps(*pSource1, source2);
            _mm_storeu_ps(outputVector, dest);

            inputVector1 += 4;
            inputVector2 += 4;
            outputVector += 4;
        }
    }

    // Non-SSE handling for the remaining frames (fewer than 4).
    n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;

    while (outputVector < endP) {
        float32x4_t source1 = vld1q_f32(inputVector1);
        float32x4_t source2 = vld1q_f32(inputVector2);
        vst1q_f32(outputVector, vaddq_f32(source1, source2));

        inputVector1 += 4;
        inputVector2 += 4;
        outputVector += 4;
    }
    n = tailFrames;
#endif
    while (n--) {
        *outputVector = *inputVector1 + *inputVector2;
        ++inputVector1;
        ++inputVector2;
        ++outputVector;
    }
}

void multiply(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;

#if CPU(X86_SSE2)
    // If the inputVector1 address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector1) && n) {
        *outputVector = *inputVector1 * *inputVector2;
        inputVector1++;
        inputVector2++;
        outputVector++;
        n--;
    }

    // Now the inputVector1 address is aligned; apply SSE.
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;
    __m128 pSource1;
    __m128 pSource2;
    __m128 dest;

    bool source2Aligned = is16ByteAligned(inputVector2);
    bool destAligned = is16ByteAligned(outputVector);

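    // inputVector1 is 16-byte aligned at this point; inputVector2 and outputVector may not be, so the macro is expanded with the matching load/store variants.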
#define SSE2_MULT(loadInstr, storeInstr)               \
    while (outputVector < endP)                        \
    {                                                  \
        pSource1 = _mm_load_ps(inputVector1);          \
        pSource2 = _mm_##loadInstr##_ps(inputVector2); \
        dest = _mm_mul_ps(pSource1, pSource2);         \
        _mm_##storeInstr##_ps(outputVector, dest);     \
        inputVector1 += 4;                             \
        inputVector2 += 4;                             \
        outputVector += 4;                             \
    }

    if (source2Aligned && destAligned) // Both aligned.
        SSE2_MULT(load, store)
    else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
        SSE2_MULT(load, storeu)
    else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
        SSE2_MULT(loadu, store)
    else // Neither aligned.
        SSE2_MULT(loadu, storeu)

    n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;

    while (outputVector < endP) {
        float32x4_t source1 = vld1q_f32(inputVector1);
        float32x4_t source2 = vld1q_f32(inputVector2);
        vst1q_f32(outputVector, vmulq_f32(source1, source2));

        inputVector1 += 4;
        inputVector2 += 4;
        outputVector += 4;
    }
    n = tailFrames;
#endif
    while (n--) {
        *outputVector = *inputVector1 * *inputVector2;
        ++inputVector1;
        ++inputVector2;
        ++outputVector;
    }
}

void multiplyComplex(const float* realVector1, const float* imagVector1, const float* realVector2, const float* imagVector2, float* realOutputVector, float* imagOutputVector, size_t numberOfElementsToProcess)
{
    unsigned i = 0;
#if CPU(X86_SSE2)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. 
    // Otherwise, fall through to the scalar code below.
    if (is16ByteAligned(realVector1) && is16ByteAligned(imagVector1) && is16ByteAligned(realVector2) && is16ByteAligned(imagVector2) && is16ByteAligned(realOutputVector) && is16ByteAligned(imagOutputVector)) {
        unsigned endSize = numberOfElementsToProcess - numberOfElementsToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(realVector1 + i);
            __m128 real2 = _mm_load_ps(realVector2 + i);
            __m128 imag1 = _mm_load_ps(imagVector1 + i);
            __m128 imag2 = _mm_load_ps(imagVector2 + i);
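            // (a + bi)(c + di) = (ac - bd) + (ad + bc)i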
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realOutputVector + i, real);
            _mm_store_ps(imagOutputVector + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = numberOfElementsToProcess - numberOfElementsToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(realVector1 + i);
        float32x4_t real2 = vld1q_f32(realVector2 + i);
        float32x4_t imag1 = vld1q_f32(imagVector1 + i);
        float32x4_t imag2 = vld1q_f32(imagVector2 + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realOutputVector + i, realResult);
        vst1q_f32(imagOutputVector + i, imagResult);

        i += 4;
    }
#endif
    for (; i < numberOfElementsToProcess; ++i) {
        // Read and compute result before storing them, in case the
        // destination is the same as one of the sources.
        realOutputVector[i] = realVector1[i] * realVector2[i] - imagVector1[i] * imagVector2[i];
        imagOutputVector[i] = realVector1[i] * imagVector2[i] + imagVector1[i] * realVector2[i];
    }
}

float sumOfSquares(const float* inputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;
    float sum = 0;

#if CPU(X86_SSE2)
    // If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector) && n) {
        float sample = *inputVector;
        sum += sample * sample;
        inputVector++;
        n--;
    }

    // Now the inputVector is aligned; use SSE.
    size_t tailFrames = n % 4;
    const float* endP = inputVector + n - tailFrames;
    __m128 source;
    __m128 mSum = _mm_setzero_ps();

    while (inputVector < endP) {
        source = _mm_load_ps(inputVector);
        source = _mm_mul_ps(source, source);
        mSum = _mm_add_ps(mSum, source);
        inputVector += 4;
    }

    // Sum the four lanes of the SSE accumulator.
    const float* groupSumP = reinterpret_cast<float*>(&mSum);
    sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

    n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = inputVector + n - tailFrames;

    float32x4_t fourSum = vdupq_n_f32(0);
    while (inputVector < endP) {
        float32x4_t source = vld1q_f32(inputVector);
        fourSum = vmlaq_f32(fourSum, source, source);
        inputVector += 4;
    }
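    // Fold the four partial sums down to two lanes; the final reduction happens in scalar code below.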
    float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

    float groupSum[2];
    vst1_f32(groupSum, twoSum);
    sum += groupSum[0] + groupSum[1];

    n = tailFrames;
#endif

    while (n--) {
        float sample = *inputVector;
        sum += sample * sample;
        ++inputVector;
    }

    return sum;
}

float maximumMagnitude(const float* inputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;
    float max = 0;

#if CPU(X86_SSE2)
    // If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
    while (!is16ByteAligned(inputVector) && n) {
        max = std::max(max, std::abs(*inputVector));
        inputVector++;
        n--;
    }

    // Now the inputVector is aligned; use SSE.
    size_t tailFrames = n % 4;
    const float* endP = inputVector + n - tailFrames;
    __m128 source;
    __m128 mMax = _mm_setzero_ps();
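    // 0x7FFFFFFF keeps the exponent and mantissa bits of an IEEE-754 single and clears the sign bit.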
    __m128 mMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));

    while (inputVector < endP) {
        source = _mm_load_ps(inputVector);
        // Compute the absolute value by ANDing the source with the mask, which clears the sign bit.
        source = _mm_and_ps(source, mMask);
        mMax = _mm_max_ps(mMax, source);
        inputVector += 4;
    }

    // Get max from the SSE results.
    const float* groupMaxP = reinterpret_cast<float*>(&mMax);
    max = std::max(max, groupMaxP[0]);
    max = std::max(max, groupMaxP[1]);
    max = std::max(max, groupMaxP[2]);
    max = std::max(max, groupMaxP[3]);

    n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = inputVector + n - tailFrames;

    float32x4_t fourMax = vdupq_n_f32(0);
    while (inputVector < endP) {
        float32x4_t source = vld1q_f32(inputVector);
        fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
        inputVector += 4;
    }
    float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

    float groupMax[2];
    vst1_f32(groupMax, twoMax);
    max = std::max(groupMax[0], groupMax[1]);

    n = tailFrames;
#endif

    while (n--) {
        max = std::max(max, std::abs(*inputVector));
        ++inputVector;
    }

    return max;
}

void clamp(const float* inputVector, float minimum, float maximum, float* outputVector, size_t numberOfElementsToProcess)
{
    size_t n = numberOfElementsToProcess;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    size_t tailFrames = n % 4;
    const float* endP = outputVector + n - tailFrames;

    float32x4_t low = vdupq_n_f32(minimum);
    float32x4_t high = vdupq_n_f32(maximum);
    while (outputVector < endP) {
        float32x4_t source = vld1q_f32(inputVector);
        vst1q_f32(outputVector, vmaxq_f32(vminq_f32(source, high), low));
        inputVector += 4;
        outputVector += 4;
    }
    n = tailFrames;
#endif
    while (n--) {
        *outputVector = std::clamp(*inputVector, minimum, maximum);
        ++inputVector;
        ++outputVector;
    }
}

void linearToDecibels(const float* inputVector, float* outputVector, size_t numberOfElementsToProcess)
{
    for (size_t i = 0; i < numberOfElementsToProcess; ++i)
        outputVector[i] = AudioUtilities::linearToDecibels(inputVector[i]);
}

void addVectorsThenMultiplyByScalar(const float* inputVector1, const float* inputVector2, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
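    // Two passes over outputVector; the Accelerate path does the same work with a single vDSP_vasm call.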
    add(inputVector1, inputVector2, outputVector, numberOfElementsToProcess);
    multiplyByScalar(outputVector, scalar, outputVector, numberOfElementsToProcess);
}

#endif // USE(ACCELERATE)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)