diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa7b1ae17..16b0f212c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -580,6 +580,19 @@ if(NOT OPUS_DISABLE_INTRINSICS)
       add_sources_group(opus lpcnet ${dnn_sources_arm_neon})
     endif()
 
+    # Armv8.2 DOTPROD: build the int8 dot-product DNN kernels with the dotprod
+    # -march flag so cgemv8x4 uses vdotq_s32 instead of the slower vmull_s8 /
+    # vpadalq_s16 emulation. Dispatched at runtime via the RTCD table, matching
+    # the Autotools and Meson builds (the source was previously never compiled).
+    if(COMPILER_SUPPORT_DOTPROD AND OPUS_MAY_HAVE_DOTPROD)
+      target_compile_definitions(opus PRIVATE OPUS_ARM_MAY_HAVE_DOTPROD)
+      if(OPUS_DNN)
+        add_sources_group(opus lpcnet ${dnn_sources_arm_dotprod})
+        set_source_files_properties(${dnn_sources_arm_dotprod}
+                                    PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+dotprod")
+      endif()
+    endif()
+
     # silk arm neon depends on main_Fix.h
     target_include_directories(opus PRIVATE silk/fixed)
 
diff --git a/cmake/OpusConfig.cmake b/cmake/OpusConfig.cmake
index 9dc5a8124..62ae90ae6 100644
--- a/cmake/OpusConfig.cmake
+++ b/cmake/OpusConfig.cmake
@@ -90,6 +90,12 @@ elseif(OPUS_CPU_ARM AND NOT OPUS_DISABLE_INTRINSICS)
       set(OPUS_PRESUME_NEON ON)
     endif()
   endif()
+  if(COMPILER_SUPPORT_NEON AND NOT OPUS_CPU_ARM_MS)
+    opus_detect_dotprod(COMPILER_SUPPORT_DOTPROD)
+    if(COMPILER_SUPPORT_DOTPROD)
+      option(OPUS_MAY_HAVE_DOTPROD "Does runtime check for dotprod support" ON)
+    endif()
+  endif()
 endif()
 
 if(MSVC)
diff --git a/cmake/OpusFunctions.cmake b/cmake/OpusFunctions.cmake
index 2db77d7f8..bd18b754c 100644
--- a/cmake/OpusFunctions.cmake
+++ b/cmake/OpusFunctions.cmake
@@ -143,6 +143,31 @@ function(opus_detect_neon COMPILER_SUPPORT_NEON)
   endif()
 endfunction()
 
+# Check whether the C compiler can build Armv8.2 dot-product intrinsics.
+# Compiles a small vdotq_s32 probe with -march=armv8.2-a+dotprod and, on
+# success, sets COMPILER_SUPPORT_DOTPROD to 1 in the caller's scope; the
+# variable is left untouched otherwise. The probe is only attempted when
+# CMAKE_SYSTEM_PROCESSOR matches arm, aarch64 or ARM.
+function(opus_detect_dotprod COMPILER_SUPPORT_DOTPROD)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64|ARM)")
+    message(STATUS "Check DOTPROD support by compiler")
+    include(CheckCSourceCompiles)
+    set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod")
+    check_c_source_compiles("
+#include <arm_neon.h>
+int main(void) {
+  int32x4_t acc = vdupq_n_s32(0);
+  int8x16_t a = vdupq_n_s8(1);
+  acc = vdotq_s32(acc, a, a);
+  return vgetq_lane_s32(acc, 0);
+}" COMPILER_SUPPORT_DOTPROD_INTR)
+    unset(CMAKE_REQUIRED_FLAGS)
+    if(COMPILER_SUPPORT_DOTPROD_INTR)
+      set(COMPILER_SUPPORT_DOTPROD 1 PARENT_SCOPE)
+    endif()
+  endif()
+endfunction()
+
 function(opus_supports_cpu_detection RUNTIME_CPU_CAPABILITY_DETECTION)
   set(RUNTIME_CPU_CAPABILITY_DETECTION 0 PARENT_SCOPE)
   if(OPUS_CPU_X86 OR OPUS_CPU_X64)