// Patch summary (unified diff whose line breaks have been collapsed):
// introduces the boolean InnoDB system variable innodb_large_page_populate,
// backed by the global srv_large_page_populate.
//
//  * all_vars.result: adds two innodb_large_page_populate entries to the
//    expected output.
//  * ha_innodb.cc: under UNIV_LINUX, declares the sysvar with
//    PLUGIN_VAR_NOCMDARG and a default of false, and registers it in
//    innobase_system_variables.
//  * large_page_alloc.h: includes os0populate.h; Large_page_alloc::alloc and
//    the allocation routine in the Large_page_alloc_pfs hunk now pass
//    (populate && srv_large_page_populate) to large_page_aligned_alloc.
//  * page_alloc.h: page_aligned_alloc uses the same combined flag both for the
//    OS_MAP_POPULATE mmap flag and for prefault_if_not_map_populate.
//  * os0populate.h: declares `extern bool srv_large_page_populate` under
//    UNIV_LINUX; otherwise defines it as `constexpr bool ... = true`.
//  * srv0srv.cc: defines srv_large_page_populate = false under UNIV_LINUX.
//
// NOTE(review): the variable is only compiled in under UNIV_LINUX, while the
//   all_vars.result change is unconditional -- presumably the expected output
//   would then differ on non-Linux builds; confirm, and consider shipping a
//   dedicated sys_vars test for the variable instead.
// NOTE(review): despite the "large_page" name, the flag also gates
//   page_aligned_alloc -- confirm this wider scope is intended.
// NOTE(review): with the default of false, a caller's populate request is
//   ignored on Linux unless the variable is switched on -- confirm this
//   change of default behaviour is intended.
// NOTE(review): the sysvar is not marked PLUGIN_VAR_READONLY, so it is
//   presumably settable at runtime; the allocators read the plain bool
//   without synchronisation -- confirm that is acceptable.
diff --git a/mysql-test/suite/sys_vars/r/all_vars.result b/mysql-test/suite/sys_vars/r/all_vars.result index 458c7b15de28..80f137561cbd 100644 --- a/mysql-test/suite/sys_vars/r/all_vars.result +++ b/mysql-test/suite/sys_vars/r/all_vars.result @@ -42,6 +42,8 @@ init_slave init_slave innodb_dedicated_server innodb_dedicated_server +innodb_large_page_populate +innodb_large_page_populate innodb_log_spin_cpu_abs_lwm innodb_log_spin_cpu_abs_lwm innodb_log_spin_cpu_pct_hwm diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 3cb9d0bd74c2..a826a68b6278 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -24162,6 +24162,15 @@ static MYSQL_SYSVAR_BOOL( nullptr, nullptr, true); #endif /* HAVE_LIBNUMA */ +#ifdef UNIV_LINUX +static MYSQL_SYSVAR_BOOL( + large_page_populate, srv_large_page_populate, PLUGIN_VAR_NOCMDARG, + "Pre-populate huge / large pages used by the InnoDB buffer pool and page " + "allocator at allocation time. 
When OFF (default), population is skipped " + "to reduce startup time and NUMA overhead on large systems.", + nullptr, nullptr, false); +#endif /* UNIV_LINUX */ + static MYSQL_SYSVAR_BOOL( api_enable_binlog, ib_binlog_enabled, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, @@ -24620,6 +24629,9 @@ static SYS_VAR *innobase_system_variables[] = { #ifdef HAVE_LIBNUMA MYSQL_SYSVAR(numa_interleave), #endif /* HAVE_LIBNUMA */ +#ifdef UNIV_LINUX + MYSQL_SYSVAR(large_page_populate), +#endif /* UNIV_LINUX */ MYSQL_SYSVAR(change_buffering), MYSQL_SYSVAR(change_buffer_max_size), #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG diff --git a/storage/innobase/include/detail/ut/large_page_alloc.h b/storage/innobase/include/detail/ut/large_page_alloc.h index 9e60599c918e..cd7ed32ae033 100644 --- a/storage/innobase/include/detail/ut/large_page_alloc.h +++ b/storage/innobase/include/detail/ut/large_page_alloc.h @@ -48,6 +48,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "storage/innobase/include/detail/ut/helper.h" #include "storage/innobase/include/detail/ut/page_metadata.h" #include "storage/innobase/include/detail/ut/pfs.h" +#include "storage/innobase/include/os0populate.h" extern const size_t large_page_default_size; @@ -112,7 +113,8 @@ struct Large_page_alloc : public allocator_traits { static inline void *alloc(std::size_t size, bool populate) { auto total_len = round_to_next_multiple( size + page_allocation_metadata::len, large_page_default_size); - auto mem = large_page_aligned_alloc(total_len, populate); + bool do_populate = populate && srv_large_page_populate; + auto mem = large_page_aligned_alloc(total_len, do_populate); if (unlikely(!mem)) return nullptr; page_allocation_metadata::datalen(mem, total_len); page_allocation_metadata::page_type(mem, Page_type::large_page); @@ -249,7 +251,8 @@ struct Large_page_alloc_pfs : public allocator_traits { page_allocation_metadata::pfs_metadata::pfs_memory_key_t key) { auto total_len = 
round_to_next_multiple( size + page_allocation_metadata::len, large_page_default_size); - auto mem = large_page_aligned_alloc(total_len, populate); + bool do_populate = populate && srv_large_page_populate; + auto mem = large_page_aligned_alloc(total_len, do_populate); if (unlikely(!mem)) return nullptr; #ifdef HAVE_PSI_MEMORY_INTERFACE diff --git a/storage/innobase/include/detail/ut/page_alloc.h b/storage/innobase/include/detail/ut/page_alloc.h index c483049dbde4..413f126f0503 100644 --- a/storage/innobase/include/detail/ut/page_alloc.h +++ b/storage/innobase/include/detail/ut/page_alloc.h @@ -66,6 +66,7 @@ namespace detail { @return Pointer to the allocated storage. nullptr if allocation failed. */ inline void *page_aligned_alloc(size_t n_bytes, bool populate) { + const bool do_populate = populate && srv_large_page_populate; #ifdef _WIN32 // With lpAddress set to nullptr, VirtualAlloc will internally round n_bytes // to the multiple of system page size if it is not already @@ -83,7 +84,7 @@ inline void *page_aligned_alloc(size_t n_bytes, bool populate) { // multiple of system page size if it is not already void *ptr = mmap(nullptr, n_bytes, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON | (populate ? OS_MAP_POPULATE : 0), -1, 0); + MAP_PRIVATE | MAP_ANON | (do_populate ? 
OS_MAP_POPULATE : 0), -1, 0); if (unlikely(ptr == (void *)-1)) { ib::log_warn(ER_IB_MSG_856) << "page_aligned_alloc mmap(" << n_bytes << " bytes) failed;" @@ -93,7 +94,7 @@ inline void *page_aligned_alloc(size_t n_bytes, bool populate) { } #endif - if (populate) prefault_if_not_map_populate(ptr, n_bytes); + if (do_populate) prefault_if_not_map_populate(ptr, n_bytes); return ptr; } diff --git a/storage/innobase/include/os0populate.h b/storage/innobase/include/os0populate.h index f17b3995edfd..b1ea3ffbb58f 100644 --- a/storage/innobase/include/os0populate.h +++ b/storage/innobase/include/os0populate.h @@ -17,4 +17,18 @@ void prefault_if_not_map_populate(void *ptr [[maybe_unused]], size_t n_bytes [[maybe_unused]]); +/* UNIV_LINUX is supplied on the compiler command line (see innodb.cmake), so it +is available here even though this header must not include univ.i. */ +#ifdef UNIV_LINUX +/** When false, huge-page / page-aligned allocations skip pre-population +(MAP_POPULATE and the explicit prefault step) even when the caller requests it. +Backed by the innodb_large_page_populate system variable. */ +extern bool srv_large_page_populate; +#else +/* On non-Linux platforms population is always honored as requested. constexpr +gives this internal linkage, so each translation unit gets its own copy and +there is no multiple-definition problem. */ +constexpr bool srv_large_page_populate = true; +#endif + #endif diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index c7bc55eab1eb..df9a84a4f44e 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -235,6 +235,13 @@ bool srv_use_native_aio = false; bool srv_numa_interleave = false; +#ifdef UNIV_LINUX +/** Whether huge / page-aligned allocations honor a caller's populate request. +Exposed as the innodb_large_page_populate system variable. Default off: skip +pre-population to reduce startup time and NUMA overhead on large systems. 
*/ +bool srv_large_page_populate = false; +#endif /* UNIV_LINUX */ + #ifdef UNIV_DEBUG /** Force all user tables to use page compression. */ ulong srv_debug_compress;