diff --git a/silk/VAD.c b/silk/VAD.c index d0cda5216..402fe6f76 100644 --- a/silk/VAD.c +++ b/silk/VAD.c @@ -33,7 +33,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" /* Silk VAD noise level estimation */ -# if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +# if !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_ARM_MAY_HAVE_NEON_INTR) static OPUS_INLINE void silk_VAD_GetNoiseLevels( const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */ silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ @@ -297,7 +297,7 @@ opus_int silk_VAD_GetSA_Q8_c( /* O Return v /**************************/ /* Noise level estimation */ /**************************/ -# if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +# if !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_ARM_MAY_HAVE_NEON_INTR) static OPUS_INLINE #endif void silk_VAD_GetNoiseLevels( diff --git a/silk/arm/VAD_arm.h b/silk/arm/VAD_arm.h new file mode 100644 index 000000000..ee49f100d --- /dev/null +++ b/silk/arm/VAD_arm.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2026 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef SILK_VAD_ARM_H +#define SILK_VAD_ARM_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) + +/* When NEON is enabled, silk_VAD_GetNoiseLevels is exported (rather than a + static inline in VAD.c) so the NEON silk_VAD_GetSA_Q8 can call it, mirroring + the x86 path. */ +void silk_VAD_GetNoiseLevels( + const opus_int32 pX[ VAD_N_BANDS ], + silk_VAD_state *psSilk_VAD +); + +opus_int silk_VAD_GetSA_Q8_neon( + silk_encoder_state *psEncC, + const opus_int16 pIn[] +); + +#if defined(OPUS_ARM_PRESUME_NEON_INTR) + +#define OVERRIDE_silk_VAD_GetSA_Q8 +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ + ((void)(arch), silk_VAD_GetSA_Q8_neon(psEnC, pIn)) + +#elif defined(OPUS_HAVE_RTCD) + +#define OVERRIDE_silk_VAD_GetSA_Q8 +extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( + silk_encoder_state *psEncC, + const opus_int16 pIn[] +); +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ + ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) + +#endif + +#endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */ + +#endif /* SILK_VAD_ARM_H */ diff --git a/silk/arm/VAD_neon_intr.c b/silk/arm/VAD_neon_intr.c new file mode 100644 index 000000000..32141d1d2 --- /dev/null +++ b/silk/arm/VAD_neon_intr.c @@ -0,0 +1,288 @@ +/* Copyright (c) 2026 Xiph.Org Foundation + Arm NEON port of the SSE4.1 implementation by + XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers (Cisco Systems, INC) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include "main.h" +#include "stack_alloc.h" + +/* Weighting factors for tilt measure */ +static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 }; + +/***************************************/ +/* Get the speech activity level in Q8 */ +/***************************************/ +opus_int silk_VAD_GetSA_Q8_neon( /* O Return value, 0 if success */ + silk_encoder_state *psEncC, /* I/O Encoder state */ + const opus_int16 pIn[] /* I PCM input */ +) +{ + opus_int SA_Q15, pSNR_dB_Q7, input_tilt; + opus_int decimated_framelength1, decimated_framelength2; + opus_int decimated_framelength; + opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s; + opus_int32 sumSquared, smooth_coef_Q16; + opus_int16 HPstateTmp; + VARDECL( opus_int16, X ); + opus_int32 Xnrg[ VAD_N_BANDS ]; + opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ]; + opus_int32 speech_nrg, x_tmp; + opus_int X_offset[ VAD_N_BANDS ]; + opus_int ret = 0; + silk_VAD_state *psSilk_VAD = &psEncC->sVAD; + + SAVE_STACK; + +#ifdef OPUS_CHECK_ASM + silk_encoder_state psEncC_c; + opus_int ret_c; + + silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) ); + ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn ); +#endif + + /* Safety checks */ + silk_assert( VAD_N_BANDS == 4 ); + celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length ); + celt_assert( psEncC->frame_length <= 512 ); + celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) ); + + /***********************/ + /* Filter and Decimate */ + /***********************/ + decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 ); + decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 ); + decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 ); + /* Decimate into 4 bands: + 0 L 3L L 3L 5L + - -- - -- -- + 8 8 2 4 4 + + [0-1 kHz| temp. |1-2 kHz| 2-4 kHz | 4-8 kHz | + + They're arranged to allow the minimal ( frame_length / 4 ) extra + scratch space during the downsampling process */ + X_offset[ 0 ] = 0; + X_offset[ 1 ] = decimated_framelength + decimated_framelength2; + X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength; + X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2; + ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 ); + + /* 0-8 kHz to 0-4 kHz and 4-8 kHz */ + silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ], + X, &X[ X_offset[ 3 ] ], psEncC->frame_length ); + + /* 0-4 kHz to 0-2 kHz and 2-4 kHz */ + silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ], + X, &X[ X_offset[ 2 ] ], decimated_framelength1 ); + + /* 0-2 kHz to 0-1 kHz and 1-2 kHz */ + silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ], + X, &X[ X_offset[ 1 ] ], decimated_framelength2 ); + + /*********************************************/ + /* HP filter on lowest band (differentiator) */ + /*********************************************/ + X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 ); + HPstateTmp = X[ decimated_framelength - 1 ]; + for( i = decimated_framelength - 1; i > 0; i-- ) { + X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 ); + X[ i ] -= X[ i - 1 ]; + } + X[ 0 ] -= psSilk_VAD->HPstate; + psSilk_VAD->HPstate = HPstateTmp; + + /*************************************/ + /* Calculate the energy in each band */ + /*************************************/ + for( b = 0; b < VAD_N_BANDS; b++ ) { + /* Find the decimated framelength in the non-uniformly divided bands */ + decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) ); + + /* Split length into subframe lengths */ + dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 ); + dec_subframe_offset = 0; + + /* Compute energy per sub-frame */ + /* initialize with summed energy of last subframe */ + Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ]; + for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) { + int32x4_t acc_s32x4; + sumSquared = 0; + + acc_s32x4 = vdupq_n_s32( 0 ); + + /* Process 8 samples per iteration: ( X[i] >> 3 )^2, summed in int32 + lanes. Bit-exact with the C reference (exact integer sum of + squares; no overflow for dec_subframe_length <= 128). */ + for( i = 0; i < dec_subframe_length - 7; i += 8 ) + { + int16x8_t x_s16x8 = vld1q_s16( &X[ X_offset[ b ] + i + dec_subframe_offset ] ); + x_s16x8 = vshrq_n_s16( x_s16x8, 3 ); + acc_s32x4 = vmlal_s16( acc_s32x4, vget_low_s16( x_s16x8 ), vget_low_s16( x_s16x8 ) ); + acc_s32x4 = vmlal_s16( acc_s32x4, vget_high_s16( x_s16x8 ), vget_high_s16( x_s16x8 ) ); + } + + sumSquared += vaddvq_s32( acc_s32x4 ); + + for( ; i < dec_subframe_length; i++ ) { + /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */ + /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */ + x_tmp = silk_RSHIFT( + X[ X_offset[ b ] + i + dec_subframe_offset ], 3 ); + sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp ); + + /* Safety check */ + silk_assert( sumSquared >= 0 ); + } + + /* Add/saturate summed energy of current subframe */ + if( s < VAD_INTERNAL_SUBFRAMES - 1 ) { + Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared ); + } else { + /* Look-ahead subframe */ + Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) ); + } + + dec_subframe_offset += dec_subframe_length; + } + psSilk_VAD->XnrgSubfr[ b ] = sumSquared; + } + + /********************/ + /* Noise estimation */ + /********************/ + silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD ); + + /***********************************************/ + /* Signal-plus-noise to noise ratio estimation */ + /***********************************************/ + sumSquared = 0; + input_tilt = 0; + for( b = 0; b < VAD_N_BANDS; b++ ) { + speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ]; + if( speech_nrg > 0 ) { + /* Divide, with sufficient resolution */ + if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) { + NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 ); + } else { + NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 ); + } + + /* Convert to log domain */ + SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128; + + /* Sum-of-squares */ + sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */ + + /* Tilt measure */ + if( speech_nrg < ( (opus_int32)1 << 20 ) ) { + /* Scale down SNR value for small subband speech energies */ + SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 ); + } + input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 ); + } else { + NrgToNoiseRatio_Q8[ b ] = 256; + } + } + + /* Mean-of-squares */ + sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */ + + /* Root-mean-square approximation, scale to dBs, and write to output pointer */ + pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */ + + /*********************************/ + /* Speech Probability Estimation */ + /*********************************/ + SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 ); + + /**************************/ + /* Frequency Tilt Measure */ + /**************************/ + psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 ); + + /**************************************************/ + /* Scale the sigmoid output based on power levels */ + /**************************************************/ + speech_nrg = 0; + for( b = 0; b < VAD_N_BANDS; b++ ) { + /* Accumulate signal-without-noise energies, higher frequency bands have more weight */ + speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 ); + } + + if( psEncC->frame_length == 20 * psEncC->fs_kHz ) { + speech_nrg = silk_RSHIFT32( speech_nrg, 1 ); + } + /* Power scaling */ + if( speech_nrg <= 0 ) { + SA_Q15 = silk_RSHIFT( SA_Q15, 1 ); + } else if( speech_nrg < 16384 ) { + speech_nrg = silk_LSHIFT32( speech_nrg, 16 ); + + /* square-root */ + speech_nrg = silk_SQRT_APPROX( speech_nrg ); + SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 ); + } + + /* Copy the resulting speech activity in Q8 */ + psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX ); + + /***********************************/ + /* Energy Level and SNR estimation */ + /***********************************/ + /* Smoothing coefficient */ + smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) ); + + if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { + smooth_coef_Q16 >>= 1; + } + + for( b = 0; b < VAD_N_BANDS; b++ ) { + /* compute smoothed energy-to-noise ratio per band */ + psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ], + NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 ); + + /* signal to noise ratio in dB per band */ + SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 ); + /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */ + psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) ); + } + +#ifdef OPUS_CHECK_ASM + silk_assert( ret == ret_c ); + silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) ); +#endif + + RESTORE_STACK; + return( ret ); +} diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c index a91f79b59..3e3b3e6e7 100644 --- a/silk/arm/arm_silk_map.c +++ b/silk/arm/arm_silk_map.c @@ -37,6 +37,17 @@ POSSIBILITY OF SUCH DAMAGE. # if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \ !defined(OPUS_ARM_PRESUME_NEON_INTR)) +opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( + silk_encoder_state *psEncC, /* I/O Encoder state */ + const opus_int16 pIn[] /* I PCM input */ +) = { + silk_VAD_GetSA_Q8_c, /* ARMv4 */ + silk_VAD_GetSA_Q8_c, /* EDSP */ + silk_VAD_GetSA_Q8_c, /* Media */ + silk_VAD_GetSA_Q8_neon, /* Neon */ + silk_VAD_GetSA_Q8_neon, /* dotprod */ +}; + void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ diff --git a/silk/main.h b/silk/main.h index 5ca9aa52a..8450c9056 100644 --- a/silk/main.h +++ b/silk/main.h @@ -44,6 +44,7 @@ POSSIBILITY OF SUCH DAMAGE. #if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) #include "arm/NSQ_del_dec_arm.h" +#include "arm/VAD_arm.h" #endif /* Convert Left/Right stereo signal to adaptive Mid/Side representation */ diff --git a/silk_sources.mk b/silk_sources.mk index 3780b1647..68eea346d 100644 --- a/silk_sources.mk +++ b/silk_sources.mk @@ -96,7 +96,8 @@ SILK_SOURCES_ARM_NEON_INTR = \ silk/arm/biquad_alt_neon_intr.c \ silk/arm/LPC_inv_pred_gain_neon_intr.c \ silk/arm/NSQ_del_dec_neon_intr.c \ -silk/arm/NSQ_neon.c +silk/arm/NSQ_neon.c \ +silk/arm/VAD_neon_intr.c SILK_SOURCES_FIXED = \ silk/fixed/LTP_analysis_filter_FIX.c \