diff --git a/mysql-test/suite/sys_vars/r/all_vars.result b/mysql-test/suite/sys_vars/r/all_vars.result index bc7056288965..d6a66a13b39b 100644 --- a/mysql-test/suite/sys_vars/r/all_vars.result +++ b/mysql-test/suite/sys_vars/r/all_vars.result @@ -46,6 +46,8 @@ init_replica init_replica init_slave init_slave +innodb_buffer_pool_lazy_latch_init +innodb_buffer_pool_lazy_latch_init innodb_dedicated_server innodb_dedicated_server innodb_log_spin_cpu_abs_lwm diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 747e146424d8..457e333cc981 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -83,6 +83,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "srv0start.h" #include "sync0sync.h" #include "trx0trx.h" +#include "ut0cpu_cache.h" #include "ut0new.h" #include "scope_guard.h" @@ -824,16 +825,49 @@ static void pfs_register_buffer_block( } #endif /* PFS_GROUP_BUFFER_SYNC */ -/** Initializes a buffer control block when the buf_pool is created. */ -static void buf_block_init( - buf_pool_t *buf_pool, /*!< in: buffer pool instance */ - buf_block_t *block, /*!< in: pointer to control block */ - byte *frame) /*!< in: pointer to buffer frame */ -{ +/** Initialize latches for a buffer block. */ +void buf_block_initialize_latches(buf_block_t *block) { + ut_a(!block->latches_initialized); + + /* This runs in one of three race-free contexts: + - eager init during chunk creation: at startup, or under free_list_mutex + for resize (see buf_chunk_init); + - lazy init on first use: the block has just been taken from the free list + and is owned exclusively by this thread (BUF_BLOCK_READY_FOR_USE), so it + is not yet reachable by any other thread. 
*/ + ut_ad(srv_is_being_started || + mutex_own(&buf_pool_from_block(block)->free_list_mutex) || + buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); + + mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex); + +#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC + rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, LATCH_ID_BUF_BLOCK_LOCK); + ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, &block->debug_latch, + LATCH_ID_BUF_BLOCK_DEBUG)); +#else + rw_lock_create(buf_block_lock_key, &block->lock, LATCH_ID_BUF_BLOCK_LOCK); + ut_d(rw_lock_create(buf_block_debug_latch_key, &block->debug_latch, + LATCH_ID_BUF_BLOCK_DEBUG)); +#endif + +#ifdef UNIV_DEBUG + block->lock.m_id = LATCH_ID_BUF_BLOCK_LOCK; + block->debug_latch.m_id = LATCH_ID_BUF_BLOCK_DEBUG; +#endif /* UNIV_DEBUG */ + + block->lock.is_block_lock = true; + + ut_ad(rw_lock_validate(&block->lock)); + + block->latches_initialized = true; +} + +/** Lightweight initialization of a buffer control block: no latches created. */ +static void buf_block_init_light(buf_pool_t *buf_pool, buf_block_t *block, + byte *frame) { UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE); - /* This function should only be executed at database startup or by - buf_pool_resize(). Either way, adaptive hash index must not exist. */ block->ahi.assert_empty_on_init(); block->frame = frame; @@ -863,34 +897,13 @@ static void buf_block_init( page_zip_des_init(&block->page.zip); - mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex); - -#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC - /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration - of buffer block rwlock with performance schema. - - If PFS_GROUP_BUFFER_SYNC is defined, skip the registration - since buffer block rwlock will be registered later in - pfs_register_buffer_block(). 
*/ - - rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, LATCH_ID_BUF_BLOCK_LOCK); - - ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, &block->debug_latch, - LATCH_ID_BUF_BLOCK_DEBUG)); - -#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ - - rw_lock_create(buf_block_lock_key, &block->lock, LATCH_ID_BUF_BLOCK_LOCK); - - ut_d(rw_lock_create(buf_block_debug_latch_key, &block->debug_latch, - LATCH_ID_BUF_BLOCK_DEBUG)); - -#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ - - block->lock.is_block_lock = true; - - ut_ad(rw_lock_validate(&(block->lock))); + /* Latches are NOT initialized here. Block creation happens either at + startup (no concurrency on this instance yet) or, for resize, under the + instance's free_list_mutex. */ + ut_ad(srv_is_being_started || mutex_own(&buf_pool->free_list_mutex)); + block->latches_initialized = false; } + /* We maintain our private view of innobase_should_madvise_buf_pool() which we initialize at the beginning of buf_pool_init() and then update when the @@global.innodb_buffer_pool_in_core_file changes. @@ -1077,7 +1090,8 @@ static buf_chunk_t *buf_chunk_init( buf_chunk_t *chunk, /*!< out: chunk of buffers */ ulonglong mem_size, /*!< in: requested size in bytes */ bool populate, /*!< in: virtual page preallocation */ - std::mutex *mutex) /*!< in,out: Mutex protecting chunk map. */ + std::mutex *mutex) /*!< in,out: mutex protecting the chunk map, or + nullptr when no concurrency is possible */ { buf_block_t *block; byte *frame; @@ -1139,7 +1153,15 @@ static buf_chunk_t *buf_chunk_init( block = chunk->blocks; for (i = chunk->size; i--;) { - buf_block_init(buf_pool, block, frame); + buf_block_init_light(buf_pool, block, frame); + + /* When lazy latch initialization is disabled, create the latches now + (eager initialization, the original behavior). When enabled, they are + created on first use in buf_LRU_get_free_only(). 
*/ + if (!srv_buf_pool_lazy_latch_init) { + buf_block_initialize_latches(block); + } + UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE); /* Add the block to the free list */ @@ -1153,6 +1175,9 @@ static buf_chunk_t *buf_chunk_init( frame += UNIV_PAGE_SIZE; } + /* buf_pool instances are created in parallel during buf_pool_init(), so the + caller passes a mutex to serialize inserts into the shared chunk map. During + resize there is no concurrency and mutex is nullptr. */ if (mutex != nullptr) { mutex->lock(); } @@ -1166,6 +1191,7 @@ static buf_chunk_t *buf_chunk_init( #ifdef PFS_GROUP_BUFFER_SYNC pfs_register_buffer_block(chunk); #endif /* PFS_GROUP_BUFFER_SYNC */ + return (chunk); } @@ -1276,7 +1302,8 @@ static void buf_pool_set_sizes(void) { @param[in] buf_pool buffer pool instance @param[in] buf_pool_size size in bytes @param[in] instance_no id of the instance -@param[in,out] mutex Mutex to protect common data structures +@param[in,out] mutex mutex protecting the shared chunk map while + instances are created in parallel @param[out] err DB_SUCCESS if all goes well @param[in] populate virtual page preallocation */ static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size, @@ -1357,14 +1384,18 @@ static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size, do { if (!buf_chunk_init(buf_pool, chunk, chunk_size, populate, mutex)) { + /* Failure cleanup at startup, under chunks_mutex. */ + ut_ad(mutex_own(&buf_pool->chunks_mutex)); while (--chunk >= buf_pool->chunks) { buf_block_t *block = chunk->blocks; for (i = chunk->size; i--; block++) { - mutex_free(&block->mutex); - rw_lock_free(&block->lock); - - ut_d(rw_lock_free(&block->debug_latch)); + /* Only blocks whose latches were actually created need freeing. 
*/ + if (block->latches_initialized) { + mutex_free(&block->mutex); + rw_lock_free(&block->lock); + ut_d(rw_lock_free(&block->debug_latch)); + } } buf_pool->deallocate_chunk(chunk); } @@ -1493,14 +1524,18 @@ static void buf_pool_free_instance(buf_pool_t *buf_pool) { chunks = buf_pool->chunks; chunk = chunks + buf_pool->n_chunks; + ut_ad(mutex_own(&buf_pool->chunks_mutex)); + while (--chunk >= chunks) { buf_block_t *block = chunk->blocks; for (ulint i = chunk->size; i--; block++) { - mutex_free(&block->mutex); - rw_lock_free(&block->lock); - - ut_d(rw_lock_free(&block->debug_latch)); + /* Only blocks whose latches were actually created need to be freed. */ + if (block->latches_initialized) { + mutex_free(&block->mutex); + rw_lock_free(&block->lock); + ut_d(rw_lock_free(&block->debug_latch)); + } } buf_pool->deallocate_chunk(chunk); @@ -1592,12 +1627,16 @@ dberr_t buf_pool_init(ulint total_size, bool populate, ulint n_instances) { std::vector<IB_thread> threads; - std::mutex m; + /* Shared by the worker threads of this batch to serialize chunk-map + inserts. Placed in its own cache line so the contended lock word does not + false-share with neighbouring stack data. */ + ut::Cacheline_aligned<std::mutex> m; for (ulint id = i; id < n; ++id) { threads.emplace_back(os_thread_create( buf_pool_create_thread_key, 0, buf_pool_create, &buf_pool_ptr[id], - size, id, &m, std::ref(errs[id]), populate)); + size, id, static_cast<std::mutex *>(&m), std::ref(errs[id]), + populate)); threads[id - i].start(); } @@ -2477,20 +2516,24 @@ static void buf_pool_resize() { ulint sum_freed = 0; + /* Resize holds the instance's free_list_mutex (and runs with + buf_pool_resizing set), so reading latches_initialized here is safe. 
*/ + ut_ad(buf_pool_resizing); + ut_ad(mutex_own(&buf_pool->free_list_mutex)); + while (chunk < echunk) { buf_block_t *block = chunk->blocks; for (ulint j = chunk->size; j--; block++) { - mutex_free(&block->mutex); - rw_lock_free(&block->lock); - - ut_d(rw_lock_free(&block->debug_latch)); + if (block->latches_initialized) { + mutex_free(&block->mutex); + rw_lock_free(&block->lock); + ut_d(rw_lock_free(&block->debug_latch)); + } } buf_pool->deallocate_chunk(chunk); - sum_freed += chunk->size; - ++chunk; } @@ -3583,7 +3626,7 @@ buf_block_t *buf_block_from_ahi(const byte *ptr) { buf_block_t *block = &chunk->blocks[offs]; - /* The function buf_chunk_init() invokes buf_block_init() so that + /* The function buf_chunk_init() invokes buf_block_init_light() so that block[n].frame == block->frame + n * UNIV_PAGE_SIZE. Check it. */ ut_ad(block->frame == page_align(ptr)); /* Read the state of the block without holding a mutex. diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index ad092edbffef..d34abc8d9db2 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -1235,6 +1235,17 @@ buf_block_t *buf_LRU_get_free_only(buf_pool_t *buf_pool) { ut_ad(buf_pool_from_block(block) == buf_pool); + /* Initialize latches on first use. The block has just been removed from + the free list and is owned exclusively by this thread, so accessing the + non-atomic latches_initialized flag here is race-free; the prior thread's + initialization (if any) is visible through the free_list_mutex handoff. + No mutex protects this access - exclusive ownership does. 
*/ + ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); + ut_ad(!block->page.in_free_list); + if (!block->latches_initialized) { + buf_block_initialize_latches(block); + } + return (block); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 69a2c99df38e..674b03a0eadf 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -24170,6 +24170,14 @@ static MYSQL_SYSVAR_BOOL( nullptr, nullptr, true); #endif /* HAVE_LIBNUMA */ +static MYSQL_SYSVAR_BOOL( + buffer_pool_lazy_latch_init, srv_buf_pool_lazy_latch_init, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Create buffer pool block latches (mutex, rw-locks) lazily on first use" + " instead of eagerly while the buffer pool is built. Speeds up buffer" + " pool initialization for large pools. OFF by default.", + nullptr, nullptr, false); + static MYSQL_SYSVAR_BOOL( api_enable_binlog, ib_binlog_enabled, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, @@ -24626,6 +24634,7 @@ static SYS_VAR *innobase_system_variables[] = { #ifdef HAVE_LIBNUMA MYSQL_SYSVAR(numa_interleave), #endif /* HAVE_LIBNUMA */ + MYSQL_SYSVAR(buffer_pool_lazy_latch_init), MYSQL_SYSVAR(change_buffering), MYSQL_SYSVAR(change_buffer_max_size), #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 2c365f1f4c37..7c8bc54c72cb 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1907,6 +1907,16 @@ struct buf_block_t { single thread. */ bool made_dirty_with_no_latch; + /** Whether this block's latches (mutex, lock, debug_latch) have been + created. See buf_block_initialize_latches(). 
+ + Not atomic: every false->true transition happens while one thread owns the + block exclusively (chunk creation, or just taken off the free list and not + yet in the page hash or LRU); later users see it through the + buf_pool->free_list_mutex release/acquire handoff. Teardown reads hold + chunks_mutex or free_list_mutex and have no concurrent writers. */ + bool latches_initialized{false}; + #ifndef UNIV_HOTBACKUP #ifdef UNIV_DEBUG /** @name Debug fields */ @@ -2051,6 +2061,10 @@ static inline uint64_t buf_pool_hash_zip_frame(void *ptr) { static inline uint64_t buf_pool_hash_zip(buf_block_t *b) { return buf_pool_hash_zip_frame(b->frame); } + +/* Create the latches (mutex, lock, debug_latch) of a buffer block. */ +void buf_block_initialize_latches(buf_block_t *block); + /** @} */ /** A "Hazard Pointer" class used to iterate over page lists diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 284e5c35783b..bd551cbc1c01 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -414,6 +414,11 @@ Currently we support native aio on windows and linux */ extern bool srv_use_native_aio; extern bool srv_numa_interleave; +/** When true, buffer pool blocks are created with lightweight initialization +and their latches (mutex, rw-locks) are created lazily on first use. When +false (default), latches are created eagerly while the buffer pool is built. */ +extern bool srv_buf_pool_lazy_latch_init; + /* The innodb_directories variable value. This a list of directories deliminated by ';', i.e the FIL_PATH_SEPARATOR. */ extern char *srv_innodb_directories; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index e7e56501ce0f..fa0a4a78b8e7 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -225,6 +225,9 @@ bool srv_use_native_aio = false; bool srv_numa_interleave = false; +/** See srv0srv.h. Default off: latches are created eagerly. 
*/ +bool srv_buf_pool_lazy_latch_init = false; + #ifdef UNIV_DEBUG /** Force all user tables to use page compression. */ ulong srv_debug_compress;