From 91072151af48e3dd0639aaea54aeca7a22a11774 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:15 -0500 Subject: [PATCH 01/39] drm/xe: Fix vm_bind_ioctl double free bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 cve CVE-2025-38731 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Christoph Manszewski commit a01b704527c28a2fd43a17a85f8996b75ec8492a If the argument check during an array bind fails, the bind_ops are freed twice as seen below. Fix this by setting bind_ops to NULL after freeing. ================================================================== BUG: KASAN: double-free in xe_vm_bind_ioctl+0x1b2/0x21f0 [xe] Free of addr ffff88813bb9b800 by task xe_vm/14198 CPU: 5 UID: 0 PID: 14198 Comm: xe_vm Not tainted 6.16.0-xe-eudebug-cmanszew+ #520 PREEMPT(full) Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-P DDR5 RVP, BIOS ADLPFWI1.R00.2411.A02.2110081023 10/08/2021 Call Trace: dump_stack_lvl+0x82/0xd0 print_report+0xcb/0x610 ? __virt_addr_valid+0x19a/0x300 ? xe_vm_bind_ioctl+0x1b2/0x21f0 [xe] kasan_report_invalid_free+0xc8/0xf0 ? xe_vm_bind_ioctl+0x1b2/0x21f0 [xe] ? xe_vm_bind_ioctl+0x1b2/0x21f0 [xe] check_slab_allocation+0x102/0x130 kfree+0x10d/0x440 ? should_fail_ex+0x57/0x2f0 ? xe_vm_bind_ioctl+0x1b2/0x21f0 [xe] xe_vm_bind_ioctl+0x1b2/0x21f0 [xe] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe] ? __lock_acquire+0xab9/0x27f0 ? lock_acquire+0x165/0x300 ? drm_dev_enter+0x53/0xe0 [drm] ? find_held_lock+0x2b/0x80 ? drm_dev_exit+0x30/0x50 [drm] ? drm_ioctl_kernel+0x128/0x1c0 [drm] drm_ioctl_kernel+0x128/0x1c0 [drm] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe] ? find_held_lock+0x2b/0x80 ? __pfx_drm_ioctl_kernel+0x10/0x10 [drm] ? should_fail_ex+0x57/0x2f0 ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe] drm_ioctl+0x352/0x620 [drm] ? __pfx_drm_ioctl+0x10/0x10 [drm] ? __pfx_rpm_resume+0x10/0x10 ? do_raw_spin_lock+0x11a/0x1b0 ? find_held_lock+0x2b/0x80 ? 
__pm_runtime_resume+0x61/0xc0 ? rcu_is_watching+0x20/0x50 ? trace_irq_enable.constprop.0+0xac/0xe0 xe_drm_ioctl+0x91/0xc0 [xe] __x64_sys_ioctl+0xb2/0x100 ? rcu_is_watching+0x20/0x50 do_syscall_64+0x68/0x2e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fa9acb24ded Fixes: b43e864af0d4 ("drm/xe/uapi: Add DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR") Cc: Matthew Brost Cc: Himal Prasad Ghimiray Cc: Thomas Hellström Signed-off-by: Christoph Manszewski Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://lore.kernel.org/r/20250813101231.196632-2-christoph.manszewski@intel.com (cherry picked from commit a01b704527c28a2fd43a17a85f8996b75ec8492a) Signed-off-by: Jonathan Maple --- drivers/gpu/drm/xe/xe_vm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index ecae71a03b83c..c39583583b42e 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3184,6 +3184,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm, free_bind_ops: if (args->num_binds > 1) kvfree(*bind_ops); + *bind_ops = NULL; return err; } @@ -3289,7 +3290,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file) struct xe_exec_queue *q = NULL; u32 num_syncs, num_ufence = 0; struct xe_sync_entry *syncs = NULL; - struct drm_xe_vm_bind_op *bind_ops; + struct drm_xe_vm_bind_op *bind_ops = NULL; struct xe_vma_ops vops; struct dma_fence *fence; int err; From 091ee216d86eeed8564d4aeaab2d17b7953d5186 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:16 -0500 Subject: [PATCH 02/39] irqchip/gic-v2m: Prevent use after free of gicv2m_get_fwnode() jira KERNEL-572 cve CVE-2025-37819 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Suzuki K Poulose commit 3318dc299b072a0511d6dfd8367f3304fb6d9827 With ACPI in place, gicv2m_get_fwnode() is registered with the pci subsystem as pci_msi_get_fwnode_cb(), which may get 
invoked at runtime during a PCI host bridge probe. But, the call back is wrongly marked as __init, causing it to be freed, while being registered with the PCI subsystem and could trigger: Unable to handle kernel paging request at virtual address ffff8000816c0400 gicv2m_get_fwnode+0x0/0x58 (P) pci_set_bus_msi_domain+0x74/0x88 pci_register_host_bridge+0x194/0x548 This is easily reproducible on a Juno board with ACPI boot. Retain the function for later use. Fixes: 0644b3daca28 ("irqchip/gic-v2m: acpi: Introducing GICv2m ACPI support") Signed-off-by: Suzuki K Poulose Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org (cherry picked from commit 3318dc299b072a0511d6dfd8367f3304fb6d9827) Signed-off-by: Jonathan Maple --- drivers/irqchip/irq-gic-v2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 57e0470e0d133..34f437207adf7 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -420,7 +420,7 @@ static int __init gicv2m_of_init(struct fwnode_handle *parent_handle, #ifdef CONFIG_ACPI static int acpi_num_msi; -static __init struct fwnode_handle *gicv2m_get_fwnode(struct device *dev) +static struct fwnode_handle *gicv2m_get_fwnode(struct device *dev) { struct v2m_data *data; From 6c999e42590d6ed9e934ec52aefeecdc18dc875a Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:16 -0500 Subject: [PATCH 03/39] mptcp: fix race condition in mptcp_schedule_work() jira KERNEL-572 cve CVE-2025-40258 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Eric Dumazet commit 035bca3f017ee9dea3a5a756e77a6f7138cc6eea syzbot reported use-after-free in mptcp_schedule_work() [1] Issue here is that mptcp_schedule_work() schedules a work, then gets a refcount on sk->sk_refcnt if the work was scheduled. This refcount will be released by mptcp_worker(). 
[A] if (schedule_work(...)) { [B] sock_hold(sk); return true; } Problem is that mptcp_worker() can run immediately and complete before [B] We need instead : sock_hold(sk); if (schedule_work(...)) return true; sock_put(sk); [1] refcount_t: addition on 0; use-after-free. WARNING: CPU: 1 PID: 29 at lib/refcount.c:25 refcount_warn_saturate+0xfa/0x1d0 lib/refcount.c:25 Call Trace: __refcount_add include/linux/refcount.h:-1 [inline] __refcount_inc include/linux/refcount.h:366 [inline] refcount_inc include/linux/refcount.h:383 [inline] sock_hold include/net/sock.h:816 [inline] mptcp_schedule_work+0x164/0x1a0 net/mptcp/protocol.c:943 mptcp_tout_timer+0x21/0xa0 net/mptcp/protocol.c:2316 call_timer_fn+0x17e/0x5f0 kernel/time/timer.c:1747 expire_timers kernel/time/timer.c:1798 [inline] __run_timers kernel/time/timer.c:2372 [inline] __run_timer_base+0x648/0x970 kernel/time/timer.c:2384 run_timer_base kernel/time/timer.c:2393 [inline] run_timer_softirq+0xb7/0x180 kernel/time/timer.c:2403 handle_softirqs+0x22f/0x710 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] run_ktimerd+0xcf/0x190 kernel/softirq.c:1138 smpboot_thread_fn+0x542/0xa60 kernel/smpboot.c:160 kthread+0x711/0x8a0 kernel/kthread.c:463 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Cc: stable@vger.kernel.org Fixes: 3b1d6210a957 ("mptcp: implement and use MPTCP-level retransmission") Reported-by: syzbot+355158e7e301548a1424@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6915b46f.050a0220.3565dc.0028.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251113103924.3737425-1-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 035bca3f017ee9dea3a5a756e77a6f7138cc6eea) Signed-off-by: Jonathan Maple --- net/mptcp/protocol.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/mptcp/protocol.c 
b/net/mptcp/protocol.c index b174bfc27d68e..791dfbfd634ac 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -851,14 +851,19 @@ static void mptcp_reset_rtx_timer(struct sock *sk) bool mptcp_schedule_work(struct sock *sk) { - if (inet_sk_state_load(sk) != TCP_CLOSE && - schedule_work(&mptcp_sk(sk)->work)) { - /* each subflow already holds a reference to the sk, and the - * workqueue is invoked by a subflow, so sk can't go away here. - */ - sock_hold(sk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return false; + + /* Get a reference on this socket, mptcp_worker() will release it. + * As mptcp_worker() might complete before us, we can not avoid + * a sock_hold()/sock_put() if schedule_work() returns false. + */ + sock_hold(sk); + + if (schedule_work(&mptcp_sk(sk)->work)) return true; - } + + sock_put(sk); return false; } From aa77d3799e09abe4d185173d190e6950ba79b76e Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:16 -0500 Subject: [PATCH 04/39] devlink: rate: Unset parent pointer in devl_rate_nodes_destroy jira KERNEL-572 cve CVE-2025-40251 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Shay Drory commit f94c1a114ac209977bdf5ca841b98424295ab1f0 The function devl_rate_nodes_destroy is documented to "Unset parent for all rate objects". However, it was only calling the driver-specific `rate_leaf_parent_set` or `rate_node_parent_set` ops and decrementing the parent's refcount, without actually setting the `devlink_rate->parent` pointer to NULL. This leaves a dangling pointer in the `devlink_rate` struct, which cause refcount error in netdevsim[1] and mlx5[2]. In addition, this is inconsistent with the behavior of `devlink_nl_rate_parent_node_set`, where the parent pointer is correctly cleared. This patch fixes the issue by explicitly setting `devlink_rate->parent` to NULL after notifying the driver, thus fulfilling the function's documented behavior for all rate objects. 
[1] repro steps: echo 1 > /sys/bus/netdevsim/new_device devlink dev eswitch set netdevsim/netdevsim1 mode switchdev echo 1 > /sys/bus/netdevsim/devices/netdevsim1/sriov_numvfs devlink port function rate add netdevsim/netdevsim1/test_node devlink port function rate set netdevsim/netdevsim1/128 parent test_node echo 1 > /sys/bus/netdevsim/del_device dmesg: refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 8 PID: 1530 at lib/refcount.c:31 refcount_warn_saturate+0x42/0xe0 CPU: 8 UID: 0 PID: 1530 Comm: bash Not tainted 6.18.0-rc4+ #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:refcount_warn_saturate+0x42/0xe0 Call Trace: devl_rate_leaf_destroy+0x8d/0x90 __nsim_dev_port_del+0x6c/0x70 [netdevsim] nsim_dev_reload_destroy+0x11c/0x140 [netdevsim] nsim_drv_remove+0x2b/0xb0 [netdevsim] device_release_driver_internal+0x194/0x1f0 bus_remove_device+0xc6/0x130 device_del+0x159/0x3c0 device_unregister+0x1a/0x60 del_device_store+0x111/0x170 [netdevsim] kernfs_fop_write_iter+0x12e/0x1e0 vfs_write+0x215/0x3d0 ksys_write+0x5f/0xd0 do_syscall_64+0x55/0x10f0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] devlink dev eswitch set pci/0000:08:00.0 mode switchdev devlink port add pci/0000:08:00.0 flavour pcisf pfnum 0 sfnum 1000 devlink port function rate add pci/0000:08:00.0/group1 devlink port function rate set pci/0000:08:00.0/32768 parent group1 modprobe -r mlx5_ib mlx5_fwctl mlx5_core dmesg: refcount_t: decrement hit 0; leaking memory. 
WARNING: CPU: 7 PID: 16151 at lib/refcount.c:31 refcount_warn_saturate+0x42/0xe0 CPU: 7 UID: 0 PID: 16151 Comm: bash Not tainted 6.17.0-rc7_for_upstream_min_debug_2025_10_02_12_44 #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:refcount_warn_saturate+0x42/0xe0 Call Trace: devl_rate_leaf_destroy+0x8d/0x90 mlx5_esw_offloads_devlink_port_unregister+0x33/0x60 [mlx5_core] mlx5_esw_offloads_unload_rep+0x3f/0x50 [mlx5_core] mlx5_eswitch_unload_sf_vport+0x40/0x90 [mlx5_core] mlx5_sf_esw_event+0xc4/0x120 [mlx5_core] notifier_call_chain+0x33/0xa0 blocking_notifier_call_chain+0x3b/0x50 mlx5_eswitch_disable_locked+0x50/0x110 [mlx5_core] mlx5_eswitch_disable+0x63/0x90 [mlx5_core] mlx5_unload+0x1d/0x170 [mlx5_core] mlx5_uninit_one+0xa2/0x130 [mlx5_core] remove_one+0x78/0xd0 [mlx5_core] pci_device_remove+0x39/0xa0 device_release_driver_internal+0x194/0x1f0 unbind_store+0x99/0xa0 kernfs_fop_write_iter+0x12e/0x1e0 vfs_write+0x215/0x3d0 ksys_write+0x5f/0xd0 do_syscall_64+0x53/0x1f0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: d75559845078 ("devlink: Allow setting parent node of rate objects") Signed-off-by: Shay Drory Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763381149-1234377-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski (cherry picked from commit f94c1a114ac209977bdf5ca841b98424295ab1f0) Signed-off-by: Jonathan Maple --- net/devlink/rate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 8828ffaf6cbc0..620524bd7c3c9 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -701,13 +701,15 @@ void devl_rate_nodes_destroy(struct devlink *devlink) if (!devlink_rate->parent) continue; - refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if 
(devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); + + refcount_dec(&devlink_rate->parent->refcnt); + devlink_rate->parent = NULL; } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { From c3ce1a8ddef8742f62f7b3656447289666815137 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:16 -0500 Subject: [PATCH 05/39] Bluetooth: hci_sync: fix race in hci_cmd_sync_dequeue_once jira KERNEL-572 cve CVE-2025-40318 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Cen Zhang commit 09b0cd1297b4dbfe736aeaa0ceeab2265f47f772 hci_cmd_sync_dequeue_once() does lookup and then cancel the entry under two separate lock sections. Meanwhile, hci_cmd_sync_work() can also delete the same entry, leading to double list_del() and "UAF". Fix this by holding cmd_sync_work_lock across both lookup and cancel, so that the entry cannot be removed concurrently. 
Fixes: 505ea2b29592 ("Bluetooth: hci_sync: Add helper functions to manipulate cmd_sync queue") Reported-by: Cen Zhang Signed-off-by: Cen Zhang Signed-off-by: Luiz Augusto von Dentz (cherry picked from commit 09b0cd1297b4dbfe736aeaa0ceeab2265f47f772) Signed-off-by: Jonathan Maple --- net/bluetooth/hci_sync.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 2a2df10e7ef30..57fedbe2a9a32 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -863,11 +863,17 @@ bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, { struct hci_cmd_sync_work_entry *entry; - entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); - if (!entry) + mutex_lock(&hdev->cmd_sync_work_lock); + + entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + if (!entry) { + mutex_unlock(&hdev->cmd_sync_work_lock); return false; + } - hci_cmd_sync_cancel_entry(hdev, entry); + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + + mutex_unlock(&hdev->cmd_sync_work_lock); return true; } From 3de95b133569e7ed8338d90d25566323895ecbce Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:17 -0500 Subject: [PATCH 06/39] net/sched: mqprio: fix stack out-of-bounds write in tc entry parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 cve CVE-2025-38568 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Maher Azzouzi commit ffd2dc4c6c49ff4f1e5d34e454a6a55608104c17 TCA_MQPRIO_TC_ENTRY_INDEX is validated using NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), which allows the value TC_QOPT_MAX_QUEUE (16). This leads to a 4-byte out-of-bounds stack write in the fp[] array, which only has room for 16 elements (0–15). Fix this by changing the policy to allow only up to TC_QOPT_MAX_QUEUE - 1. 
Fixes: f62af20bed2d ("net/sched: mqprio: allow per-TC user input of FP adminStatus") Reviewed-by: Eric Dumazet Signed-off-by: Maher Azzouzi Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250802001857.2702497-1-kuba@kernel.org Signed-off-by: Jakub Kicinski (cherry picked from commit ffd2dc4c6c49ff4f1e5d34e454a6a55608104c17) Signed-off-by: Jonathan Maple --- net/sched/sch_mqprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 51d4013b61219..f3e5ef9a95925 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -152,7 +152,7 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), From f458fdb856f90757778c5bd1058098c3ac1806b1 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:17 -0500 Subject: [PATCH 07/39] Bluetooth: hci_event: validate skb length for unknown CC opcode jira KERNEL-572 cve CVE-2025-40301 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Raphael Pinsonneault-Thibeault commit 5c5f1f64681cc889d9b13e4a61285e9e029d6ab5 In hci_cmd_complete_evt(), if the command complete event has an unknown opcode, we assume the first byte of the remaining skb->data contains the return status. However, parameter data has previously been pulled in hci_event_func(), which may leave the skb empty. If so, using skb->data[0] for the return status uses un-init memory. The fix is to check skb->len before using skb->data. 
Reported-by: syzbot+a9a4bedfca6aa9d7fa24@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a9a4bedfca6aa9d7fa24 Tested-by: syzbot+a9a4bedfca6aa9d7fa24@syzkaller.appspotmail.com Fixes: afcb3369f46ed ("Bluetooth: hci_event: Fix vendor (unknown) opcode status handling") Signed-off-by: Raphael Pinsonneault-Thibeault Signed-off-by: Luiz Augusto von Dentz (cherry picked from commit 5c5f1f64681cc889d9b13e4a61285e9e029d6ab5) Signed-off-by: Jonathan Maple --- net/bluetooth/hci_event.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b54749d4c7eee..3334a6e9ad98b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4242,6 +4242,13 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, } if (i == ARRAY_SIZE(hci_cc_table)) { + if (!skb->len) { + bt_dev_err(hdev, "Unexpected cc 0x%4.4x with no status", + *opcode); + *status = HCI_ERROR_UNSPECIFIED; + return; + } + /* Unknown opcode, assume byte 0 contains the status, so * that e.g. __hci_cmd_sync() properly returns errors * for vendor specific commands send by HCI drivers. From d84b002e46e6f25ed492310c5d46714620a4ee33 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:17 -0500 Subject: [PATCH 08/39] Bluetooth: MGMT: Fix OOB access in parse_adv_monitor_pattern() jira KERNEL-572 cve CVE-2025-40294 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Ilia Gavrilov commit 8d59fba49362c65332395789fd82771f1028d87e In the parse_adv_monitor_pattern() function, the value of the 'length' variable is currently limited to HCI_MAX_EXT_AD_LENGTH(251). The size of the 'value' array in the mgmt_adv_pattern structure is 31. If the value of 'pattern[i].length' is set in the user space and exceeds 31, the 'patterns[i].value' array can be accessed out of bound when copied. Increasing the size of the 'value' array in the 'mgmt_adv_pattern' structure will break the userspace. 
Considering this, and to avoid OOB access revert the limits for 'offset' and 'length' back to the value of HCI_MAX_AD_LENGTH. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE. Fixes: db08722fc7d4 ("Bluetooth: hci_core: Fix missing instances using HCI_MAX_AD_LENGTH") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Signed-off-by: Luiz Augusto von Dentz (cherry picked from commit 8d59fba49362c65332395789fd82771f1028d87e) Signed-off-by: Jonathan Maple --- include/net/bluetooth/mgmt.h | 2 +- net/bluetooth/mgmt.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 6095cbb03811d..e5a672b2924e1 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -775,7 +775,7 @@ struct mgmt_adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[31]; + __u8 value[HCI_MAX_AD_LENGTH]; } __packed; #define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 11bac2c15b52f..0b05124851685 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5395,9 +5395,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; - if (offset >= HCI_MAX_EXT_AD_LENGTH || - length > HCI_MAX_EXT_AD_LENGTH || - (offset + length) > HCI_MAX_EXT_AD_LENGTH) + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); From abed06acd0b392df91d301a2bb020067488c8781 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:17 -0500 Subject: [PATCH 09/39] fs/proc: fix uaf in proc_readdir_de() jira KERNEL-572 cve CVE-2025-40271 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Wei Yang commit 
895b4c0c79b092d732544011c3cecaf7322c36a1 Pde is erased from subdir rbtree through rb_erase(), but not set the node to EMPTY, which may result in uaf access. We should use RB_CLEAR_NODE() set the erased node to EMPTY, then pde_subdir_next() will return NULL to avoid uaf access. We found an uaf issue while using stress-ng testing, need to run testcase getdent and tun in the same time. The steps of the issue is as follows: 1) use getdent to traverse dir /proc/pid/net/dev_snmp6/, and current pde is tun3; 2) in the [time windows] unregister netdevice tun3 and tun2, and erase them from rbtree. erase tun3 first, and then erase tun2. the pde(tun2) will be released to slab; 3) continue to getdent process, then pde_subdir_next() will return pde(tun2) which is released, it will case uaf access. CPU 0 | CPU 1 ------------------------------------------------------------------------- traverse dir /proc/pid/net/dev_snmp6/ | unregister_netdevice(tun->dev) //tun3 tun2 sys_getdents64() | iterate_dir() | proc_readdir() | proc_readdir_de() | snmp6_unregister_dev() pde_get(de); | proc_remove() read_unlock(&proc_subdir_lock); | remove_proc_subtree() | write_lock(&proc_subdir_lock); [time window] | rb_erase(&root->subdir_node, &parent->subdir); | write_unlock(&proc_subdir_lock); read_lock(&proc_subdir_lock); | next = pde_subdir_next(de); | pde_put(de); | de = next; //UAF | rbtree of dev_snmp6 | pde(tun3) / \ NULL pde(tun2) Link: https://lkml.kernel.org/r/20251025024233.158363-1-albin_yang@163.com Signed-off-by: Wei Yang Cc: Al Viro Cc: Christian Brauner Cc: wangzijie Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton (cherry picked from commit 895b4c0c79b092d732544011c3cecaf7322c36a1) Signed-off-by: Jonathan Maple --- fs/proc/generic.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3431b083f7d05..9b3b4efe2041f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -687,6 +687,12 @@ void pde_put(struct 
proc_dir_entry *pde) } } +static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) +{ + rb_erase(&pde->subdir_node, &parent->subdir); + RB_CLEAR_NODE(&pde->subdir_node); +} + /* * Remove a /proc entry and free it if it's not currently in use. */ @@ -709,7 +715,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { - rb_erase(&de->subdir_node, &parent->subdir); + pde_erase(de, parent); if (S_ISDIR(de->mode)) parent->nlink--; } @@ -753,7 +759,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) root->parent->name, root->name); return -EINVAL; } - rb_erase(&root->subdir_node, &parent->subdir); + pde_erase(root, parent); de = root; while (1) { @@ -765,7 +771,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) next->parent->name, next->name); return -EINVAL; } - rb_erase(&next->subdir_node, &de->subdir); + pde_erase(next, de); de = next; continue; } From 8dabb2c04f94a78885ff54d4325d91c588f08961 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:18 -0500 Subject: [PATCH 10/39] eventpoll: don't decrement ep refcount while still holding the ep mutex jira KERNEL-572 cve CVE-2025-38349 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Linus Torvalds commit 8c2e52ebbe885c7eeaabd3b7ddcdc1246fc400d2 Jann Horn points out that epoll is decrementing the ep refcount and then doing a mutex_unlock(&ep->mtx); afterwards. That's very wrong, because it can lead to a use-after-free. That pattern is actually fine for the very last reference, because the code in question will delay the actual call to "ep_free(ep)" until after it has unlocked the mutex. But it's wrong for the much subtler "next to last" case when somebody *else* may also be dropping their reference and free the ep while we're still using the mutex. 
Note that this is true even if that other user is also using the same ep mutex: mutexes, unlike spinlocks, can not be used for object ownership, even if they guarantee mutual exclusion. A mutex "unlock" operation is not atomic, and as one user is still accessing the mutex as part of unlocking it, another user can come in and get the now released mutex and free the data structure while the first user is still cleaning up. See our mutex documentation in Documentation/locking/mutex-design.rst, in particular the section [1] about semantics: "mutex_unlock() may access the mutex structure even after it has internally released the lock already - so it's not safe for another context to acquire the mutex and assume that the mutex_unlock() context is not using the structure anymore" So if we drop our ep ref before the mutex unlock, but we weren't the last one, we may then unlock the mutex, another user comes in, drops _their_ reference and releases the 'ep' as it now has no users - all while the mutex_unlock() is still accessing it. Fix this by simply moving the ep refcount dropping to outside the mutex: the refcount itself is atomic, and doesn't need mutex protection (that's the whole _point_ of refcounts: unlike mutexes, they are inherently about object lifetimes). 
Reported-by: Jann Horn Link: https://docs.kernel.org/locking/mutex-design.html#semantics [1] Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Linus Torvalds (cherry picked from commit 8c2e52ebbe885c7eeaabd3b7ddcdc1246fc400d2) Signed-off-by: Jonathan Maple --- fs/eventpoll.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 64eec8ec3b75f..ced30ff9ce093 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -854,7 +854,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); - return ep_refcount_dec_and_test(ep); + return true; } /* @@ -862,14 +862,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { - WARN_ON_ONCE(__ep_remove(ep, epi, false)); + if (__ep_remove(ep, epi, false)) + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; - bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) @@ -902,10 +902,8 @@ static void ep_clear_and_put(struct eventpoll *ep) cond_resched(); } - dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - - if (dispose) + if (ep_refcount_dec_and_test(ep)) ep_free(ep); } @@ -1108,7 +1106,7 @@ void eventpoll_release_file(struct file *file) dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); - if (dispose) + if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } From fba7b95c53ec4f4b45fe08bba881d81f916743d9 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:18 -0500 Subject: [PATCH 11/39] net: dst: add four helpers to annotate data-races around dst->dev jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Eric Dumazet 
commit 88fe14253e181878c2ddb51a298ae8c468a63010 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/88fe1425.failed dst->dev is read locklessly in many contexts, and written in dst_dev_put(). Fixing all the races is going to need many changes. We probably will have to add full RCU protection. Add three helpers to ease this painful process. static inline struct net_device *dst_dev(const struct dst_entry *dst) { return READ_ONCE(dst->dev); } static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) { return dst_dev(skb_dst(skb)); } static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) { return dev_net(skb_dst_dev(skb)); } static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) { return dev_net_rcu(skb_dst_dev(skb)); } Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-7-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 88fe14253e181878c2ddb51a298ae8c468a63010) Signed-off-by: Jonathan Maple # Conflicts: # net/core/dst.c # net/core/sock.c --- .../88fe1425.failed | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/88fe1425.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/88fe1425.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/88fe1425.failed new file mode 100644 index 0000000000000..a56b306ec319f --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/88fe1425.failed @@ -0,0 +1,130 @@ +net: dst: add four helpers to annotate data-races around dst->dev + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Eric Dumazet +commit 88fe14253e181878c2ddb51a298ae8c468a63010 +Empty-Commit: Cherry-Pick 
Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/88fe1425.failed + +dst->dev is read locklessly in many contexts, +and written in dst_dev_put(). + +Fixing all the races is going to need many changes. + +We probably will have to add full RCU protection. + +Add three helpers to ease this painful process. + +static inline struct net_device *dst_dev(const struct dst_entry *dst) +{ + return READ_ONCE(dst->dev); +} + +static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) +{ + return dst_dev(skb_dst(skb)); +} + +static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) +{ + return dev_net(skb_dst_dev(skb)); +} + +static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) +{ + return dev_net_rcu(skb_dst_dev(skb)); +} + +Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") + Signed-off-by: Eric Dumazet + Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250630121934.3399505-7-edumazet@google.com + Signed-off-by: Jakub Kicinski +(cherry picked from commit 88fe14253e181878c2ddb51a298ae8c468a63010) + Signed-off-by: Jonathan Maple + +# Conflicts: +# net/core/dst.c +# net/core/sock.c +diff --cc net/core/dst.c +index 795ca07e28a4,e2de8b68c41d..000000000000 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@@ -145,12 -145,12 +145,18 @@@ void dst_dev_put(struct dst_entry *dst + { + struct net_device *dev = dst->dev; + + - WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev); +++<<<<<<< HEAD + + dst->input = dst_discard; + + dst->output = dst_discard_out; + + dst->dev = blackhole_netdev; +++======= ++ WRITE_ONCE(dst->input, dst_discard); ++ WRITE_ONCE(dst->output, dst_discard_out); ++ WRITE_ONCE(dst->dev, blackhole_netdev); +++>>>>>>> 88fe14253e18 (net: dst: add four helpers to annotate data-races around dst->dev) + 
netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); + } +diff --cc net/core/sock.c +index 4379447ccd02,8b7623c7d547..000000000000 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@@ -2544,9 -2600,13 +2544,16 @@@ void sk_setup_caps(struct sock *sk, str + { + u32 max_segs = 1; + +++<<<<<<< HEAD + + sk->sk_route_caps = dst->dev->features; + + if (sk_is_tcp(sk)) +++======= ++ sk->sk_route_caps = dst_dev(dst)->features; ++ if (sk_is_tcp(sk)) { ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ +++>>>>>>> 88fe14253e18 (net: dst: add four helpers to annotate data-races around dst->dev) + sk->sk_route_caps |= NETIF_F_GSO; + - icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + - } + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + if (unlikely(sk->sk_gso_disabled)) +diff --git a/include/net/dst.h b/include/net/dst.h +index 08647c99d79c..281d94998cff 100644 +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@ -561,6 +561,26 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); + } + ++static inline struct net_device *dst_dev(const struct dst_entry *dst) ++{ ++ return READ_ONCE(dst->dev); ++} ++ ++static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) ++{ ++ return dst_dev(skb_dst(skb)); ++} ++ ++static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) ++{ ++ return dev_net(skb_dst_dev(skb)); ++} ++ ++static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) ++{ ++ return dev_net_rcu(skb_dst_dev(skb)); ++} ++ + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); + void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, bool confirm_neigh); +* Unmerged path net/core/dst.c +* Unmerged path net/core/sock.c From 67d9dc8300a53842e775eb5f9c2e033bf113ec36 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 
Feb 2026 03:01:19 -0500 Subject: [PATCH 12/39] net: Add locking to protect skb->dev access in ip_output jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Sharath Chandra Vurukala commit 1dbf1d590d10a6d1978e8184f8dfe20af22d680a Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/1dbf1d59.failed In ip_output() skb->dev is updated from the skb_dst(skb)->dev this can become invalid when the interface is unregistered and freed, Introduced new skb_dst_dev_rcu() function to be used instead of skb_dst_dev() within rcu_locks in ip_output.This will ensure that all the skb's associated with the dev being deregistered will be transnmitted out first, before freeing the dev. Given that ip_output() is called within an rcu_read_lock() critical section or from a bottom-half context, it is safe to introduce an RCU read-side critical section within it. Multiple panic call stacks were observed when UL traffic was run in concurrency with device deregistration from different functions, pasting one sample for reference. 
[496733.627565][T13385] Call trace: [496733.627570][T13385] bpf_prog_ce7c9180c3b128ea_cgroupskb_egres+0x24c/0x7f0 [496733.627581][T13385] __cgroup_bpf_run_filter_skb+0x128/0x498 [496733.627595][T13385] ip_finish_output+0xa4/0xf4 [496733.627605][T13385] ip_output+0x100/0x1a0 [496733.627613][T13385] ip_send_skb+0x68/0x100 [496733.627618][T13385] udp_send_skb+0x1c4/0x384 [496733.627625][T13385] udp_sendmsg+0x7b0/0x898 [496733.627631][T13385] inet_sendmsg+0x5c/0x7c [496733.627639][T13385] __sys_sendto+0x174/0x1e4 [496733.627647][T13385] __arm64_sys_sendto+0x28/0x3c [496733.627653][T13385] invoke_syscall+0x58/0x11c [496733.627662][T13385] el0_svc_common+0x88/0xf4 [496733.627669][T13385] do_el0_svc+0x2c/0xb0 [496733.627676][T13385] el0_svc+0x2c/0xa4 [496733.627683][T13385] el0t_64_sync_handler+0x68/0xb4 [496733.627689][T13385] el0t_64_sync+0x1a4/0x1a8 Changes in v3: - Replaced WARN_ON() with WARN_ON_ONCE(), as suggested by Willem de Bruijn. - Dropped legacy lines mistakenly pulled in from an outdated branch. 
Changes in v2: - Addressed review comments from Eric Dumazet - Used READ_ONCE() to prevent potential load/store tearing - Added skb_dst_dev_rcu() and used along with rcu_read_lock() in ip_output Signed-off-by: Sharath Chandra Vurukala Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250730105118.GA26100@hu-sharathv-hyd.qualcomm.com Signed-off-by: Jakub Kicinski (cherry picked from commit 1dbf1d590d10a6d1978e8184f8dfe20af22d680a) Signed-off-by: Jonathan Maple # Conflicts: # include/net/dst.h # net/ipv4/ip_output.c --- .../1dbf1d59.failed | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/1dbf1d59.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/1dbf1d59.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/1dbf1d59.failed new file mode 100644 index 0000000000000..81899fdff6c7c --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/1dbf1d59.failed @@ -0,0 +1,131 @@ +net: Add locking to protect skb->dev access in ip_output + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Sharath Chandra Vurukala +commit 1dbf1d590d10a6d1978e8184f8dfe20af22d680a +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/1dbf1d59.failed + +In ip_output() skb->dev is updated from the skb_dst(skb)->dev +this can become invalid when the interface is unregistered and freed, + +Introduced new skb_dst_dev_rcu() function to be used instead of +skb_dst_dev() within rcu_locks in ip_output.This will ensure that +all the skb's associated with the dev being deregistered will +be transnmitted out first, before freeing the dev. + +Given that ip_output() is called within an rcu_read_lock() +critical section or from a bottom-half context, it is safe to introduce +an RCU read-side critical section within it. 
+ +Multiple panic call stacks were observed when UL traffic was run +in concurrency with device deregistration from different functions, +pasting one sample for reference. + +[496733.627565][T13385] Call trace: +[496733.627570][T13385] bpf_prog_ce7c9180c3b128ea_cgroupskb_egres+0x24c/0x7f0 +[496733.627581][T13385] __cgroup_bpf_run_filter_skb+0x128/0x498 +[496733.627595][T13385] ip_finish_output+0xa4/0xf4 +[496733.627605][T13385] ip_output+0x100/0x1a0 +[496733.627613][T13385] ip_send_skb+0x68/0x100 +[496733.627618][T13385] udp_send_skb+0x1c4/0x384 +[496733.627625][T13385] udp_sendmsg+0x7b0/0x898 +[496733.627631][T13385] inet_sendmsg+0x5c/0x7c +[496733.627639][T13385] __sys_sendto+0x174/0x1e4 +[496733.627647][T13385] __arm64_sys_sendto+0x28/0x3c +[496733.627653][T13385] invoke_syscall+0x58/0x11c +[496733.627662][T13385] el0_svc_common+0x88/0xf4 +[496733.627669][T13385] do_el0_svc+0x2c/0xb0 +[496733.627676][T13385] el0_svc+0x2c/0xa4 +[496733.627683][T13385] el0t_64_sync_handler+0x68/0xb4 +[496733.627689][T13385] el0t_64_sync+0x1a4/0x1a8 + +Changes in v3: +- Replaced WARN_ON() with WARN_ON_ONCE(), as suggested by Willem de Bruijn. +- Dropped legacy lines mistakenly pulled in from an outdated branch. 
+ +Changes in v2: +- Addressed review comments from Eric Dumazet +- Used READ_ONCE() to prevent potential load/store tearing +- Added skb_dst_dev_rcu() and used along with rcu_read_lock() in ip_output + + Signed-off-by: Sharath Chandra Vurukala + Reviewed-by: Eric Dumazet +Link: https://patch.msgid.link/20250730105118.GA26100@hu-sharathv-hyd.qualcomm.com + Signed-off-by: Jakub Kicinski +(cherry picked from commit 1dbf1d590d10a6d1978e8184f8dfe20af22d680a) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/net/dst.h +# net/ipv4/ip_output.c +diff --cc include/net/dst.h +index 08647c99d79c,bab01363bb97..000000000000 +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@@ -561,6 -563,38 +561,41 @@@ static inline void skb_dst_update_pmtu_ + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); + } + +++<<<<<<< HEAD +++======= ++ static inline struct net_device *dst_dev(const struct dst_entry *dst) ++ { ++ return READ_ONCE(dst->dev); ++ } ++ ++ static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) ++ { ++ /* In the future, use rcu_dereference(dst->dev) */ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ return READ_ONCE(dst->dev); ++ } ++ ++ static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) ++ { ++ return dst_dev(skb_dst(skb)); ++ } ++ ++ static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) ++ { ++ return dst_dev_rcu(skb_dst(skb)); ++ } ++ ++ static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) ++ { ++ return dev_net(skb_dst_dev(skb)); ++ } ++ ++ static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) ++ { ++ return dev_net_rcu(skb_dst_dev(skb)); ++ } ++ +++>>>>>>> 1dbf1d590d10 (net: Add locking to protect skb->dev access in ip_output) + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); + void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, bool confirm_neigh); +diff --cc net/ipv4/ip_output.c +index 
0065b1996c94,84e7f8a2f50f..000000000000 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@@ -426,8 -425,11 +426,15 @@@ int ip_mc_output(struct net *net, struc + + int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) + { +++<<<<<<< HEAD + + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; +++======= ++ struct net_device *dev, *indev = skb->dev; ++ int ret_val; +++>>>>>>> 1dbf1d590d10 (net: Add locking to protect skb->dev access in ip_output) + ++ rcu_read_lock(); ++ dev = skb_dst_dev_rcu(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + +* Unmerged path include/net/dst.h +* Unmerged path net/ipv4/ip_output.c From 8f59227796161e2bf4941b113c9cabb05da2c152 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:19 -0500 Subject: [PATCH 13/39] net: dst: introduce dst->dev_rcu jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Eric Dumazet commit caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/caedcc5b.failed Followup of commit 88fe14253e18 ("net: dst: add four helpers to annotate data-races around dst->dev"). We want to gradually add explicit RCU protection to dst->dev, including lockdep support. Add an union to alias dst->dev_rcu and dst->dev. Add dst_dev_net_rcu() helper. 
Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-2-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50) Signed-off-by: Jonathan Maple # Conflicts: # include/net/dst.h # net/core/dst.c # net/ipv4/route.c --- .../caedcc5b.failed | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/caedcc5b.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/caedcc5b.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/caedcc5b.failed new file mode 100644 index 0000000000000..a1c95d4ef8add --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/caedcc5b.failed @@ -0,0 +1,136 @@ +net: dst: introduce dst->dev_rcu + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Eric Dumazet +commit caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/caedcc5b.failed + +Followup of commit 88fe14253e18 ("net: dst: add four helpers +to annotate data-races around dst->dev"). + +We want to gradually add explicit RCU protection to dst->dev, +including lockdep support. + +Add an union to alias dst->dev_rcu and dst->dev. + +Add dst_dev_net_rcu() helper. 
+ +Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") + Signed-off-by: Eric Dumazet + Reviewed-by: David Ahern +Link: https://patch.msgid.link/20250828195823.3958522-2-edumazet@google.com + Signed-off-by: Jakub Kicinski +(cherry picked from commit caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/net/dst.h +# net/core/dst.c +# net/ipv4/route.c +diff --cc include/net/dst.h +index 08647c99d79c,f8aa1239b4db..000000000000 +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@@ -561,6 -566,41 +564,44 @@@ static inline void skb_dst_update_pmtu_ + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); + } + +++<<<<<<< HEAD +++======= ++ static inline struct net_device *dst_dev(const struct dst_entry *dst) ++ { ++ return READ_ONCE(dst->dev); ++ } ++ ++ static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) ++ { ++ return rcu_dereference(dst->dev_rcu); ++ } ++ ++ static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) ++ { ++ return dev_net_rcu(dst_dev_rcu(dst)); ++ } ++ ++ static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) ++ { ++ return dst_dev(skb_dst(skb)); ++ } ++ ++ static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) ++ { ++ return dst_dev_rcu(skb_dst(skb)); ++ } ++ ++ static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) ++ { ++ return dev_net(skb_dst_dev(skb)); ++ } ++ ++ static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) ++ { ++ return dev_net_rcu(skb_dst_dev_rcu(skb)); ++ } ++ +++>>>>>>> caedcc5b6df1 (net: dst: introduce dst->dev_rcu) + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); + void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, bool confirm_neigh); +diff --cc net/core/dst.c +index 795ca07e28a4,e9d35f49c9e7..000000000000 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@@ -145,12 -145,12 +145,18 @@@ void 
dst_dev_put(struct dst_entry *dst + { + struct net_device *dev = dst->dev; + + - WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev); +++<<<<<<< HEAD + + dst->input = dst_discard; + + dst->output = dst_discard_out; + + dst->dev = blackhole_netdev; +++======= ++ WRITE_ONCE(dst->input, dst_discard); ++ WRITE_ONCE(dst->output, dst_discard_out); ++ rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); +++>>>>>>> caedcc5b6df1 (net: dst: introduce dst->dev_rcu) + netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); + } +diff --cc net/ipv4/route.c +index c4ffbf26c17b,44382d175589..000000000000 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@@ -1021,7 -1027,7 +1021,11 @@@ static void __ip_rt_update_pmtu(struct + return; + + rcu_read_lock(); +++<<<<<<< HEAD + + net = dev_net_rcu(dst->dev); +++======= ++ net = dst_dev_net_rcu(dst); +++>>>>>>> caedcc5b6df1 (net: dst: introduce dst->dev_rcu) + if (mtu < net->ipv4.ip_rt_min_pmtu) { + lock = true; + mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); +@@@ -1307,7 -1327,7 +1311,11 @@@ static unsigned int ipv4_default_advmss + struct net *net; + + rcu_read_lock(); +++<<<<<<< HEAD + + net = dev_net_rcu(dst->dev); +++======= ++ net = dst_dev_net_rcu(dst); +++>>>>>>> caedcc5b6df1 (net: dst: introduce dst->dev_rcu) + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, + net->ipv4.ip_rt_min_advmss); + rcu_read_unlock(); +* Unmerged path include/net/dst.h +* Unmerged path net/core/dst.c +* Unmerged path net/ipv4/route.c From 2ac864fabe35aa03d62220d01775e3dda891e216 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:20 -0500 Subject: [PATCH 14/39] ipv6: use RCU in ip6_output() jira KERNEL-572 cve CVE-2025-40158 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Eric Dumazet commit 11709573cc4e48dc34c80fc7ab9ce5b159e29695 Empty-Commit: Cherry-Pick Conflicts during history rebuild. 
Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/11709573.failed Use RCU in ip6_output() in order to use dst_dev_rcu() to prevent possible UAF. We can remove rcu_read_lock()/rcu_read_unlock() pairs from ip6_finish_output2(). Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-5-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 11709573cc4e48dc34c80fc7ab9ce5b159e29695) Signed-off-by: Jonathan Maple # Conflicts: # net/ipv6/ip6_output.c --- .../11709573.failed | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/11709573.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/11709573.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/11709573.failed new file mode 100644 index 0000000000000..c745766ab1efb --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/11709573.failed @@ -0,0 +1,65 @@ +ipv6: use RCU in ip6_output() + +jira KERNEL-572 +cve CVE-2025-40158 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Eric Dumazet +commit 11709573cc4e48dc34c80fc7ab9ce5b159e29695 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/11709573.failed + +Use RCU in ip6_output() in order to use dst_dev_rcu() to prevent +possible UAF. + +We can remove rcu_read_lock()/rcu_read_unlock() pairs +from ip6_finish_output2(). 
+ +Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") + Signed-off-by: Eric Dumazet + Reviewed-by: David Ahern +Link: https://patch.msgid.link/20250828195823.3958522-5-edumazet@google.com + Signed-off-by: Jakub Kicinski +(cherry picked from commit 11709573cc4e48dc34c80fc7ab9ce5b159e29695) + Signed-off-by: Jonathan Maple + +# Conflicts: +# net/ipv6/ip6_output.c +diff --cc net/ipv6/ip6_output.c +index 5a364b352115,9d64c13bab5e..000000000000 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@@ -60,7 -60,7 +60,11 @@@ + static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) + { + struct dst_entry *dst = skb_dst(skb); +++<<<<<<< HEAD + + struct net_device *dev = dst->dev; +++======= ++ struct net_device *dev = dst_dev_rcu(dst); +++>>>>>>> 11709573cc4e (ipv6: use RCU in ip6_output()) + struct inet6_dev *idev = ip6_dst_idev(dst); + unsigned int hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *daddr, *nexthop; +@@@ -232,10 -226,15 +230,20 @@@ static int ip6_finish_output(struct ne + + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + { +++<<<<<<< HEAD + + struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; + + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); +++======= ++ struct dst_entry *dst = skb_dst(skb); ++ struct net_device *dev, *indev = skb->dev; ++ struct inet6_dev *idev; ++ int ret; +++>>>>>>> 11709573cc4e (ipv6: use RCU in ip6_output()) + + skb->protocol = htons(ETH_P_IPV6); ++ rcu_read_lock(); ++ dev = dst_dev_rcu(dst); ++ idev = ip6_dst_idev(dst); + skb->dev = dev; + + if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { +* Unmerged path net/ipv6/ip6_output.c From 755f634654c5c9ec102c7eb9cb0c23702a2e4f2c Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:20 -0500 Subject: [PATCH 15/39] ipv6: use RCU in ip6_xmit() jira KERNEL-572 cve CVE-2025-40135 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Eric 
Dumazet commit 9085e56501d93af9f2d7bd16f7fcfacdde47b99c Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9085e565.failed Use RCU in ip6_xmit() in order to use dst_dev_rcu() to prevent possible UAF. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-4-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 9085e56501d93af9f2d7bd16f7fcfacdde47b99c) Signed-off-by: Jonathan Maple # Conflicts: # net/ipv6/ip6_output.c --- .../9085e565.failed | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9085e565.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9085e565.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9085e565.failed new file mode 100644 index 0000000000000..4e6cdcaa52767 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9085e565.failed @@ -0,0 +1,40 @@ +ipv6: use RCU in ip6_xmit() + +jira KERNEL-572 +cve CVE-2025-40135 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Eric Dumazet +commit 9085e56501d93af9f2d7bd16f7fcfacdde47b99c +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9085e565.failed + +Use RCU in ip6_xmit() in order to use dst_dev_rcu() to prevent +possible UAF. 
+ +Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") + Signed-off-by: Eric Dumazet + Reviewed-by: David Ahern +Link: https://patch.msgid.link/20250828195823.3958522-4-edumazet@google.com + Signed-off-by: Jakub Kicinski +(cherry picked from commit 9085e56501d93af9f2d7bd16f7fcfacdde47b99c) + Signed-off-by: Jonathan Maple + +# Conflicts: +# net/ipv6/ip6_output.c +diff --cc net/ipv6/ip6_output.c +index 5a364b352115,e234640433d6..000000000000 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@@ -270,8 -271,6 +270,10 @@@ int ip6_xmit(const struct sock *sk, str + const struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *first_hop = &fl6->daddr; + struct dst_entry *dst = skb_dst(skb); +++<<<<<<< HEAD + + struct net_device *dev = dst->dev; +++======= +++>>>>>>> 9085e56501d9 (ipv6: use RCU in ip6_xmit()) + struct inet6_dev *idev = ip6_dst_idev(dst); + struct hop_jumbo_hdr *hop_jumbo; + int hoplen = sizeof(*hop_jumbo); +* Unmerged path net/ipv6/ip6_output.c From 6cf63b099720fb70bfe0d110eeefcea86062d144 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:21 -0500 Subject: [PATCH 16/39] net: use dst_dev_rcu() in sk_setup_caps() jira KERNEL-572 cve CVE-2025-40170 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Eric Dumazet commit 99a2ace61b211b0be861b07fbaa062fca4b58879 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/99a2ace6.failed Use RCU to protect accesses to dst->dev from sk_setup_caps() and sk_dst_gso_max_size(). Also use dst_dev_rcu() in ip6_dst_mtu_maybe_forward(), and ip_dst_mtu_maybe_forward(). ip4_dst_hoplimit() can use dst_dev_net_rcu(). 
Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-6-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 99a2ace61b211b0be861b07fbaa062fca4b58879) Signed-off-by: Jonathan Maple # Conflicts: # include/net/ip.h # include/net/ip6_route.h # include/net/route.h # net/core/sock.c --- .../99a2ace6.failed | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/99a2ace6.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/99a2ace6.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/99a2ace6.failed new file mode 100644 index 0000000000000..89647d0e33eea --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/99a2ace6.failed @@ -0,0 +1,176 @@ +net: use dst_dev_rcu() in sk_setup_caps() + +jira KERNEL-572 +cve CVE-2025-40170 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Eric Dumazet +commit 99a2ace61b211b0be861b07fbaa062fca4b58879 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/99a2ace6.failed + +Use RCU to protect accesses to dst->dev from sk_setup_caps() +and sk_dst_gso_max_size(). + +Also use dst_dev_rcu() in ip6_dst_mtu_maybe_forward(), +and ip_dst_mtu_maybe_forward(). + +ip4_dst_hoplimit() can use dst_dev_net_rcu(). 
+ +Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") + Signed-off-by: Eric Dumazet + Reviewed-by: David Ahern +Link: https://patch.msgid.link/20250828195823.3958522-6-edumazet@google.com + Signed-off-by: Jakub Kicinski +(cherry picked from commit 99a2ace61b211b0be861b07fbaa062fca4b58879) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/net/ip.h +# include/net/ip6_route.h +# include/net/route.h +# net/core/sock.c +diff --cc include/net/ip.h +index 30596104cb60,6dbd2bf8fa9c..000000000000 +--- a/include/net/ip.h ++++ b/include/net/ip.h +@@@ -471,7 -473,8 +472,12 @@@ static inline unsigned int ip_dst_mtu_m + + rcu_read_lock(); + +++<<<<<<< HEAD + + net = dev_net_rcu(dst->dev); +++======= ++ dev = dst_dev_rcu(dst); ++ net = dev_net_rcu(dev); +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || + ip_mtu_locked(dst) || + !forwarding) { +@@@ -485,7 -488,7 +491,11 @@@ + if (mtu) + goto out; + +++<<<<<<< HEAD + + mtu = READ_ONCE(dst->dev->mtu); +++======= ++ mtu = READ_ONCE(dev->mtu); +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + + if (unlikely(ip_mtu_locked(dst))) { + if (rt->rt_uses_gateway && mtu > 576) +diff --cc include/net/ip6_route.h +index 6dbdf60b342f,59f48ca3abdf..000000000000 +--- a/include/net/ip6_route.h ++++ b/include/net/ip6_route.h +@@@ -337,7 -337,7 +337,11 @@@ static inline unsigned int ip6_dst_mtu_ + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); +++<<<<<<< HEAD + + idev = __in6_dev_get(dst->dev); +++======= ++ idev = __in6_dev_get(dst_dev_rcu(dst)); +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + if (idev) + mtu = READ_ONCE(idev->cnf.mtu6); + rcu_read_unlock(); +diff --cc include/net/route.h +index 8d2de5eea126,f90106f383c5..000000000000 +--- a/include/net/route.h ++++ b/include/net/route.h +@@@ -373,7 -390,7 +373,11 @@@ static inline int ip4_dst_hoplimit(cons + const struct net *net; + + rcu_read_lock(); +++<<<<<<< HEAD 
+ + net = dev_net_rcu(dst->dev); +++======= ++ net = dst_dev_net_rcu(dst); +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); + rcu_read_unlock(); + } +diff --cc net/core/sock.c +index 4379447ccd02,9a8290fcc35d..000000000000 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@@ -2512,17 -2578,16 +2512,21 @@@ out + } + EXPORT_SYMBOL_GPL(sk_clone_lock); + +++<<<<<<< HEAD + +void sk_free_unlock_clone(struct sock *sk) + +{ + + /* It is still raw copy of parent, so invalidate + + * destructor and make plain sk_free() */ + + sk->sk_destruct = NULL; + + bh_unlock_sock(sk); + + sk_free(sk); + +} + +EXPORT_SYMBOL_GPL(sk_free_unlock_clone); + + + +static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +++======= ++ static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + { + bool is_ipv6 = false; + u32 max_size; +@@@ -2532,8 -2597,8 +2536,13 @@@ + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); + #endif + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ +++<<<<<<< HEAD + + max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : + + READ_ONCE(dst->dev->gso_ipv4_max_size); +++======= ++ max_size = is_ipv6 ? 
READ_ONCE(dev->gso_max_size) : ++ READ_ONCE(dev->gso_ipv4_max_size); +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) + max_size = GSO_LEGACY_MAX_SIZE; + +@@@ -2542,11 -2607,18 +2551,21 @@@ + + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) + { ++ const struct net_device *dev; + u32 max_segs = 1; + +++<<<<<<< HEAD + + sk->sk_route_caps = dst->dev->features; + + if (sk_is_tcp(sk)) +++======= ++ rcu_read_lock(); ++ dev = dst_dev_rcu(dst); ++ sk->sk_route_caps = dev->features; ++ if (sk_is_tcp(sk)) { ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + sk->sk_route_caps |= NETIF_F_GSO; + - icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + - } + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + if (unlikely(sk->sk_gso_disabled)) +@@@ -2556,9 -2628,9 +2575,13 @@@ + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + } else { + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; +- sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); ++ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); + /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ +++<<<<<<< HEAD + + max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); +++======= ++ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); +++>>>>>>> 99a2ace61b21 (net: use dst_dev_rcu() in sk_setup_caps()) + } + } + sk->sk_gso_max_segs = max_segs; +* Unmerged path include/net/ip.h +* Unmerged path include/net/ip6_route.h +* Unmerged path include/net/route.h +* Unmerged path net/core/sock.c From 0f049fee3eebb86b2d96541e0af5c5544b0abadc Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:21 -0500 Subject: [PATCH 17/39] vsock: Ignore signal/timeout on connect() if already established jira KERNEL-572 cve CVE-2025-40248 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Michal 
Luczaj commit 002541ef650b742a198e4be363881439bb9d86b4 During connect(), acting on a signal/timeout by disconnecting an already established socket leads to several issues: 1. connect() invoking vsock_transport_cancel_pkt() -> virtio_transport_purge_skbs() may race with sendmsg() invoking virtio_transport_get_credit(). This results in a permanently elevated `vvs->bytes_unsent`. Which, in turn, confuses the SOCK_LINGER handling. 2. connect() resetting a connected socket's state may race with socket being placed in a sockmap. A disconnected socket remaining in a sockmap breaks sockmap's assumptions. And gives rise to WARNs. 3. connect() transitioning SS_CONNECTED -> SS_UNCONNECTED allows for a transport change/drop after TCP_ESTABLISHED. Which poses a problem for any simultaneous sendmsg() or connect() and may result in a use-after-free/null-ptr-deref. Do not disconnect socket on signal/timeout. Keep the logic for unconnected sockets: they don't linger, can't be placed in a sockmap, are rejected by sendmsg(). 
[1]: https://lore.kernel.org/netdev/e07fd95c-9a38-4eea-9638-133e38c2ec9b@rbox.co/ [2]: https://lore.kernel.org/netdev/20250317-vsock-trans-signal-race-v4-0-fc8837f3f1d4@rbox.co/ [3]: https://lore.kernel.org/netdev/60f1b7db-3099-4f6a-875e-af9f6ef194f6@rbox.co/ Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20251119-vsock-interrupted-connect-v2-1-70734cf1233f@rbox.co Signed-off-by: Jakub Kicinski (cherry picked from commit 002541ef650b742a198e4be363881439bb9d86b4) Signed-off-by: Jonathan Maple --- net/vmw_vsock/af_vsock.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 0945ffcb67e72..886c249932315 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1557,18 +1557,40 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = schedule_timeout(timeout); lock_sock(sk); - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; - sock->state = SS_UNCONNECTED; - vsock_transport_cancel_pkt(vsk); - vsock_remove_connected(vsk); - goto out_wait; - } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { - err = -ETIMEDOUT; + /* Connection established. Whatever happens to socket once we + * release it, that's not connect()'s concern. No need to go + * into signal and timeout handling. Call it a day. + * + * Note that allowing to "reset" an already established socket + * here is racy and insecure. + */ + if (sk->sk_state == TCP_ESTABLISHED) + break; + + /* If connection was _not_ established and a signal/timeout came + * to be, we want the socket's state reset. User space may want + * to retry. + * + * sk_state != TCP_ESTABLISHED implies that socket is not on + * vsock_connected_table. 
We keep the binding and the transport + * assigned. + */ + if (signal_pending(current) || timeout == 0) { + err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout); + + /* Listener might have already responded with + * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our + * sk_state == TCP_SYN_SENT, which hereby we break. + * In such case VIRTIO_VSOCK_OP_RST will follow. + */ sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; + + /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by + * transport->connect(). + */ vsock_transport_cancel_pkt(vsk); + goto out_wait; } From 9b4bfab40c7fe87c37554191f488a0ecfccd7ffb Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:21 -0500 Subject: [PATCH 18/39] Bluetooth: hci_sock: Prevent race in socket write iter and sock bind jira KERNEL-572 cve CVE-2025-68305 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Edward Adam Davis commit 89bb613511cc21ed5ba6bddc1c9b9ae9c0dad392 There is a potential race condition between sock bind and socket write iter. bind may free the same cmd via mgmt_pending before write iter sends the cmd, just as syzbot reported in UAF[1]. Here we use hci_dev_lock to synchronize the two, thereby avoiding the UAF mentioned in [1]. 
[1] syzbot reported: BUG: KASAN: slab-use-after-free in mgmt_pending_remove+0x3b/0x210 net/bluetooth/mgmt_util.c:316 Read of size 8 at addr ffff888077164818 by task syz.0.17/5989 Call Trace: mgmt_pending_remove+0x3b/0x210 net/bluetooth/mgmt_util.c:316 set_link_security+0x5c2/0x710 net/bluetooth/mgmt.c:1918 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0x21c/0x270 net/socket.c:742 sock_write_iter+0x279/0x360 net/socket.c:1195 Allocated by task 5989: mgmt_pending_add+0x35/0x140 net/bluetooth/mgmt_util.c:296 set_link_security+0x557/0x710 net/bluetooth/mgmt.c:1910 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0x21c/0x270 net/socket.c:742 sock_write_iter+0x279/0x360 net/socket.c:1195 Freed by task 5991: mgmt_pending_free net/bluetooth/mgmt_util.c:311 [inline] mgmt_pending_foreach+0x30d/0x380 net/bluetooth/mgmt_util.c:257 mgmt_index_removed+0x112/0x2f0 net/bluetooth/mgmt.c:9477 hci_sock_bind+0xbe9/0x1000 net/bluetooth/hci_sock.c:1314 Fixes: 6fe26f694c82 ("Bluetooth: MGMT: Protect mgmt_pending list with its own lock") Reported-by: syzbot+9aa47cd4633a3cf92a80@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9aa47cd4633a3cf92a80 Tested-by: syzbot+9aa47cd4633a3cf92a80@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Luiz Augusto von Dentz (cherry picked from commit 89bb613511cc21ed5ba6bddc1c9b9ae9c0dad392) Signed-off-by: Jonathan Maple --- net/bluetooth/hci_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 428ee5c7de7ea..043936783f747 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1311,7 +1311,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } + 
hci_dev_lock(hdev); mgmt_index_removed(hdev); + hci_dev_unlock(hdev); err = hci_dev_open(hdev->id); if (err) { From 0cb1d626e90e5a34ed0e8e8d880bf730807dab1a Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:21 -0500 Subject: [PATCH 19/39] net: atlantic: fix fragment overflow handling in RX path jira KERNEL-572 cve CVE-2025-68301 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Jiefeng Zhang commit 5ffcb7b890f61541201461580bb6622ace405aec The atlantic driver can receive packets with more than MAX_SKB_FRAGS (17) fragments when handling large multi-descriptor packets. This causes an out-of-bounds write in skb_add_rx_frag_netmem() leading to kernel panic. The issue occurs because the driver doesn't check the total number of fragments before calling skb_add_rx_frag(). When a packet requires more than MAX_SKB_FRAGS fragments, the fragment index exceeds the array bounds. Fix by assuming there will be an extra frag if buff->len > AQ_CFG_RX_HDR_SIZE, then all fragments are accounted for. And reusing the existing check to prevent the overflow earlier in the code path. This crash occurred in production with an Aquantia AQC113 10G NIC. 
Stack trace from production environment: ``` RIP: 0010:skb_add_rx_frag_netmem+0x29/0xd0 Code: 90 f3 0f 1e fa 0f 1f 44 00 00 48 89 f8 41 89 ca 48 89 d7 48 63 ce 8b 90 c0 00 00 00 48 c1 e1 04 48 01 ca 48 03 90 c8 00 00 00 <48> 89 7a 30 44 89 52 3c 44 89 42 38 40 f6 c7 01 75 74 48 89 fa 83 RSP: 0018:ffffa9bec02a8d50 EFLAGS: 00010287 RAX: ffff925b22e80a00 RBX: ffff925ad38d2700 RCX: fffffffe0a0c8000 RDX: ffff9258ea95bac0 RSI: ffff925ae0a0c800 RDI: 0000000000037a40 RBP: 0000000000000024 R08: 0000000000000000 R09: 0000000000000021 R10: 0000000000000848 R11: 0000000000000000 R12: ffffa9bec02a8e24 R13: ffff925ad8615570 R14: 0000000000000000 R15: ffff925b22e80a00 FS: 0000000000000000(0000) GS:ffff925e47880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff9258ea95baf0 CR3: 0000000166022004 CR4: 0000000000f72ef0 PKRU: 55555554 Call Trace: aq_ring_rx_clean+0x175/0xe60 [atlantic] ? aq_ring_rx_clean+0x14d/0xe60 [atlantic] ? aq_ring_tx_clean+0xdf/0x190 [atlantic] ? kmem_cache_free+0x348/0x450 ? aq_vec_poll+0x81/0x1d0 [atlantic] ? __napi_poll+0x28/0x1c0 ? net_rx_action+0x337/0x420 ``` Fixes: 6aecbba12b5c ("net: atlantic: add check for MAX_SKB_FRAGS") Changes in v4: - Add Fixes: tag to satisfy patch validation requirements. Changes in v3: - Fix by assuming there will be an extra frag if buff->len > AQ_CFG_RX_HDR_SIZE, then all fragments are accounted for. 
Signed-off-by: Jiefeng Zhang Link: https://patch.msgid.link/20251126032249.69358-1-jiefeng.z.zhang@gmail.com Signed-off-by: Jakub Kicinski (cherry picked from commit 5ffcb7b890f61541201461580bb6622ace405aec) Signed-off-by: Jonathan Maple --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index f21de0c21e524..d23d23bed39fe 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -547,6 +547,11 @@ static int __aq_ring_rx_clean(struct aq_ring_s *self, struct napi_struct *napi, if (!buff->is_eop) { unsigned int frag_cnt = 0U; + + /* There will be an extra fragment */ + if (buff->len > AQ_CFG_RX_HDR_SIZE) + frag_cnt++; + buff_ = buff; do { bool is_rsc_completed = true; From 41b2f69127804cafce6591e77d26eb470c1ca3ec Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:22 -0500 Subject: [PATCH 20/39] io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU jira KERNEL-572 cve CVE-2025-38453 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Jens Axboe commit fc582cd26e888b0652bc1494f252329453fd3b23 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. 
Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/fc582cd2.failed syzbot reports that defer/local task_work adding via msg_ring can hit a request that has been freed: CPU: 1 UID: 0 PID: 19356 Comm: iou-wrk-19354 Not tainted 6.16.0-rc4-syzkaller-00108-g17bbde2e1716 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 io_req_local_work_add io_uring/io_uring.c:1184 [inline] __io_req_task_work_add+0x589/0x950 io_uring/io_uring.c:1252 io_msg_remote_post io_uring/msg_ring.c:103 [inline] io_msg_data_remote io_uring/msg_ring.c:133 [inline] __io_msg_ring_data+0x820/0xaa0 io_uring/msg_ring.c:151 io_msg_ring_data io_uring/msg_ring.c:173 [inline] io_msg_ring+0x134/0xa00 io_uring/msg_ring.c:314 __io_issue_sqe+0x17e/0x4b0 io_uring/io_uring.c:1739 io_issue_sqe+0x165/0xfd0 io_uring/io_uring.c:1762 io_wq_submit_work+0x6e9/0xb90 io_uring/io_uring.c:1874 io_worker_handle_work+0x7cd/0x1180 io_uring/io-wq.c:642 io_wq_worker+0x42f/0xeb0 io_uring/io-wq.c:696 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 which is supposed to be safe with how requests are allocated. But msg ring requests alloc and free on their own, and hence must defer freeing to a sane time. Add an rcu_head and use kfree_rcu() in both spots where requests are freed. Only the one in io_msg_tw_complete() is strictly required as it has been visible on the other ring, but use it consistently in the other spot as well. This should not cause any other issues outside of KASAN rightfully complaining about it. 
Link: https://lore.kernel.org/io-uring/686cd2ea.a00a0220.338033.0007.GAE@google.com/ Reported-by: syzbot+54cbbfb4db9145d26fc2@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Fixes: 0617bb500bfa ("io_uring/msg_ring: improve handling of target CQE posting") Signed-off-by: Jens Axboe (cherry picked from commit fc582cd26e888b0652bc1494f252329453fd3b23) Signed-off-by: Jonathan Maple # Conflicts: # include/linux/io_uring_types.h # io_uring/msg_ring.c --- .../fc582cd2.failed | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/fc582cd2.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/fc582cd2.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/fc582cd2.failed new file mode 100644 index 0000000000000..e3a1f073d8992 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/fc582cd2.failed @@ -0,0 +1,112 @@ +io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU + +jira KERNEL-572 +cve CVE-2025-38453 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Jens Axboe +commit fc582cd26e888b0652bc1494f252329453fd3b23 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. 
Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/fc582cd2.failed + +syzbot reports that defer/local task_work adding via msg_ring can hit +a request that has been freed: + +CPU: 1 UID: 0 PID: 19356 Comm: iou-wrk-19354 Not tainted 6.16.0-rc4-syzkaller-00108-g17bbde2e1716 #0 PREEMPT(full) +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 +Call Trace: + + dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 + print_address_description mm/kasan/report.c:408 [inline] + print_report+0xd2/0x2b0 mm/kasan/report.c:521 + kasan_report+0x118/0x150 mm/kasan/report.c:634 + io_req_local_work_add io_uring/io_uring.c:1184 [inline] + __io_req_task_work_add+0x589/0x950 io_uring/io_uring.c:1252 + io_msg_remote_post io_uring/msg_ring.c:103 [inline] + io_msg_data_remote io_uring/msg_ring.c:133 [inline] + __io_msg_ring_data+0x820/0xaa0 io_uring/msg_ring.c:151 + io_msg_ring_data io_uring/msg_ring.c:173 [inline] + io_msg_ring+0x134/0xa00 io_uring/msg_ring.c:314 + __io_issue_sqe+0x17e/0x4b0 io_uring/io_uring.c:1739 + io_issue_sqe+0x165/0xfd0 io_uring/io_uring.c:1762 + io_wq_submit_work+0x6e9/0xb90 io_uring/io_uring.c:1874 + io_worker_handle_work+0x7cd/0x1180 io_uring/io-wq.c:642 + io_wq_worker+0x42f/0xeb0 io_uring/io-wq.c:696 + ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 + + +which is supposed to be safe with how requests are allocated. But msg +ring requests alloc and free on their own, and hence must defer freeing +to a sane time. + +Add an rcu_head and use kfree_rcu() in both spots where requests are +freed. Only the one in io_msg_tw_complete() is strictly required as it +has been visible on the other ring, but use it consistently in the other +spot as well. + +This should not cause any other issues outside of KASAN rightfully +complaining about it. 
+ +Link: https://lore.kernel.org/io-uring/686cd2ea.a00a0220.338033.0007.GAE@google.com/ + Reported-by: syzbot+54cbbfb4db9145d26fc2@syzkaller.appspotmail.com + Cc: stable@vger.kernel.org +Fixes: 0617bb500bfa ("io_uring/msg_ring: improve handling of target CQE posting") + Signed-off-by: Jens Axboe +(cherry picked from commit fc582cd26e888b0652bc1494f252329453fd3b23) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/linux/io_uring_types.h +# io_uring/msg_ring.c +diff --cc include/linux/io_uring_types.h +index c252e98aee7c,a7efcec2e3d0..000000000000 +--- a/include/linux/io_uring_types.h ++++ b/include/linux/io_uring_types.h +@@@ -646,8 -690,17 +646,22 @@@ struct io_kiocb + atomic_t refs; + bool cancel_seq_set; + struct io_task_work io_task_work; +++<<<<<<< HEAD + + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + + struct hlist_node hash_node; +++======= ++ union { ++ /* ++ * for polled requests, i.e. IORING_OP_POLL_ADD and async armed ++ * poll ++ */ ++ struct hlist_node hash_node; ++ /* For IOPOLL setup queues, with hybrid polling */ ++ u64 iopoll_start; ++ /* for private io_kiocb freeing */ ++ struct rcu_head rcu_head; ++ }; +++>>>>>>> fc582cd26e88 (io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU) + /* internal polling, see IORING_FEAT_FAST_POLL */ + struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ +diff --cc io_uring/msg_ring.c +index 7fd9badcfaf8,4c2578f2efcb..000000000000 +--- a/io_uring/msg_ring.c ++++ b/io_uring/msg_ring.c +@@@ -89,11 -89,11 +89,16 @@@ static void io_msg_tw_complete(struct i + static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) + { +++<<<<<<< HEAD + + req->task = READ_ONCE(ctx->submitter_task); + + if (!req->task) { + + kmem_cache_free(req_cachep, req); +++======= ++ if (!READ_ONCE(ctx->submitter_task)) { ++ kfree_rcu(req, rcu_head); +++>>>>>>> fc582cd26e88 (io_uring/msg_ring: ensure 
io_kiocb freeing is deferred for RCU) + return -EOWNERDEAD; + } + - req->opcode = IORING_OP_NOP; + req->cqe.user_data = user_data; + io_req_set_res(req, res, cflags); + percpu_ref_get(&ctx->refs); +* Unmerged path include/linux/io_uring_types.h +* Unmerged path io_uring/msg_ring.c From 8095fb3f5ed74fd48abd6b60ad5fef9700940fb6 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:22 -0500 Subject: [PATCH 21/39] ASoC: Intel: bytcr_rt5640: Fix invalid quirk input mapping jira KERNEL-572 cve CVE-2025-40154 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Takashi Iwai commit fba404e4b4af4f4f747bb0e41e9fff7d03c7bcc0 When an invalid value is passed via quirk option, currently bytcr_rt5640 driver only shows an error message but leaves as is. This may lead to unepxected results like OOB access. This patch corrects the input mapping to the certain default value if an invalid value is passed. Fixes: 063422ca2a9d ("ASoC: Intel: bytcr_rt5640: Set card long_name based on quirks") Signed-off-by: Takashi Iwai Message-ID: <20250902171826.27329-3-tiwai@suse.de> Signed-off-by: Mark Brown (cherry picked from commit fba404e4b4af4f4f747bb0e41e9fff7d03c7bcc0) Signed-off-by: Jonathan Maple --- sound/soc/intel/boards/bytcr_rt5640.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index 0f3b8f44e7011..bc846558480e4 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -68,7 +68,8 @@ enum { BYT_RT5640_OVCD_SF_1P5 = (RT5640_OVCD_SF_1P5 << 13), }; -#define BYT_RT5640_MAP(quirk) ((quirk) & GENMASK(3, 0)) +#define BYT_RT5640_MAP_MASK GENMASK(3, 0) +#define BYT_RT5640_MAP(quirk) ((quirk) & BYT_RT5640_MAP_MASK) #define BYT_RT5640_JDSRC(quirk) (((quirk) & GENMASK(7, 4)) >> 4) #define BYT_RT5640_OVCD_TH(quirk) (((quirk) & GENMASK(12, 8)) >> 8) #define BYT_RT5640_OVCD_SF(quirk) (((quirk) & GENMASK(14, 13)) >> 13) 
@@ -140,7 +141,9 @@ static void log_quirks(struct device *dev) dev_info(dev, "quirk NO_INTERNAL_MIC_MAP enabled\n"); break; default: - dev_err(dev, "quirk map 0x%x is not supported, microphone input will not work\n", map); + dev_warn_once(dev, "quirk sets invalid input map: 0x%x, default to DMIC1_MAP\n", map); + byt_rt5640_quirk &= ~BYT_RT5640_MAP_MASK; + byt_rt5640_quirk |= BYT_RT5640_DMIC1_MAP; break; } if (byt_rt5640_quirk & BYT_RT5640_HSMIC2_ON_IN1) From e8198e4407ce0c0b1f7a97ee89cb03cc336f6833 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:23 -0500 Subject: [PATCH 22/39] uprobes: Fix race in uprobe_free_utask jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Jiri Olsa commit b583ef82b671c9a752fbe3e95bd4c1c51eab764d Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/b583ef82.failed Max Makarov reported kernel panic [1] in perf user callchain code. The reason for that is the race between uprobe_free_utask and bpf profiler code doing the perf user stack unwind and is triggered within uprobe_free_utask function: - after current->utask is freed and - before current->utask is set to NULL general protection fault, probably for non-canonical address 0x9e759c37ee555c76: 0000 [#1] SMP PTI RIP: 0010:is_uprobe_at_func_entry+0x28/0x80 ... ? die_addr+0x36/0x90 ? exc_general_protection+0x217/0x420 ? asm_exc_general_protection+0x26/0x30 ? is_uprobe_at_func_entry+0x28/0x80 perf_callchain_user+0x20a/0x360 get_perf_callchain+0x147/0x1d0 bpf_get_stackid+0x60/0x90 bpf_prog_9aac297fb833e2f5_do_perf_event+0x434/0x53b ? __smp_call_single_queue+0xad/0x120 bpf_overflow_handler+0x75/0x110 ... asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:__kmem_cache_free+0x1cb/0x350 ... ? uprobe_free_utask+0x62/0x80 ? 
acct_collect+0x4c/0x220 uprobe_free_utask+0x62/0x80 mm_release+0x12/0xb0 do_exit+0x26b/0xaa0 __x64_sys_exit+0x1b/0x20 do_syscall_64+0x5a/0x80 It can be easily reproduced by running following commands in separate terminals: # while :; do bpftrace -e 'uprobe:/bin/ls:_start { printf("hit\n"); }' -c ls; done # bpftrace -e 'profile:hz:100000 { @[ustack()] = count(); }' Fixing this by making sure current->utask pointer is set to NULL before we start to release the utask object. [1] https://github.com/grafana/pyroscope/issues/3673 Fixes: cfa7f3d2c526 ("perf,x86: avoid missing caller address in stack traces captured in uprobe") Reported-by: Max Makarov Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250109141440.2692173-1-jolsa@kernel.org (cherry picked from commit b583ef82b671c9a752fbe3e95bd4c1c51eab764d) Signed-off-by: Jonathan Maple # Conflicts: # kernel/events/uprobes.c --- .../b583ef82.failed | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/b583ef82.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/b583ef82.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/b583ef82.failed new file mode 100644 index 0000000000000..6f0f9d0e45d7a --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/b583ef82.failed @@ -0,0 +1,130 @@ +uprobes: Fix race in uprobe_free_utask + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Jiri Olsa +commit b583ef82b671c9a752fbe3e95bd4c1c51eab764d +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/b583ef82.failed + +Max Makarov reported kernel panic [1] in perf user callchain code. 
+ +The reason for that is the race between uprobe_free_utask and bpf +profiler code doing the perf user stack unwind and is triggered +within uprobe_free_utask function: + - after current->utask is freed and + - before current->utask is set to NULL + + general protection fault, probably for non-canonical address 0x9e759c37ee555c76: 0000 [#1] SMP PTI + RIP: 0010:is_uprobe_at_func_entry+0x28/0x80 + ... + ? die_addr+0x36/0x90 + ? exc_general_protection+0x217/0x420 + ? asm_exc_general_protection+0x26/0x30 + ? is_uprobe_at_func_entry+0x28/0x80 + perf_callchain_user+0x20a/0x360 + get_perf_callchain+0x147/0x1d0 + bpf_get_stackid+0x60/0x90 + bpf_prog_9aac297fb833e2f5_do_perf_event+0x434/0x53b + ? __smp_call_single_queue+0xad/0x120 + bpf_overflow_handler+0x75/0x110 + ... + asm_sysvec_apic_timer_interrupt+0x1a/0x20 + RIP: 0010:__kmem_cache_free+0x1cb/0x350 + ... + ? uprobe_free_utask+0x62/0x80 + ? acct_collect+0x4c/0x220 + uprobe_free_utask+0x62/0x80 + mm_release+0x12/0xb0 + do_exit+0x26b/0xaa0 + __x64_sys_exit+0x1b/0x20 + do_syscall_64+0x5a/0x80 + +It can be easily reproduced by running following commands in +separate terminals: + + # while :; do bpftrace -e 'uprobe:/bin/ls:_start { printf("hit\n"); }' -c ls; done + # bpftrace -e 'profile:hz:100000 { @[ustack()] = count(); }' + +Fixing this by making sure current->utask pointer is set to NULL +before we start to release the utask object. 
+ +[1] https://github.com/grafana/pyroscope/issues/3673 + +Fixes: cfa7f3d2c526 ("perf,x86: avoid missing caller address in stack traces captured in uprobe") + Reported-by: Max Makarov + Signed-off-by: Jiri Olsa + Signed-off-by: Peter Zijlstra (Intel) + Acked-by: Oleg Nesterov + Acked-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20250109141440.2692173-1-jolsa@kernel.org +(cherry picked from commit b583ef82b671c9a752fbe3e95bd4c1c51eab764d) + Signed-off-by: Jonathan Maple + +# Conflicts: +# kernel/events/uprobes.c +diff --cc kernel/events/uprobes.c +index b728a0108ec3,5d71ef85420c..000000000000 +--- a/kernel/events/uprobes.c ++++ b/kernel/events/uprobes.c +@@@ -1772,18 -1915,50 +1772,24 @@@ void uprobe_free_utask(struct task_stru + if (!utask) + return; + +++<<<<<<< HEAD + + if (utask->active_uprobe) + + put_uprobe(utask->active_uprobe); +++======= ++ t->utask = NULL; ++ WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); ++ ++ timer_delete_sync(&utask->ri_timer); +++>>>>>>> b583ef82b671 (uprobes: Fix race in uprobe_free_utask) + + ri = utask->return_instances; + while (ri) + - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + + ri = free_ret_instance(ri); + + + xol_free_insn_slot(t); + kfree(utask); +- t->utask = NULL; + } + + -#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ + - + -#define for_each_ret_instance_rcu(pos, head) \ + - for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) + - + -static void ri_timer(struct timer_list *timer) + -{ + - struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); + - struct return_instance *ri; + - + - /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ + - guard(srcu)(&uretprobes_srcu); + - /* RCU protects return_instance from freeing. 
*/ + - guard(rcu)(); + - + - for_each_ret_instance_rcu(ri, utask->return_instances) + - hprobe_expire(&ri->hprobe, false); + -} + - + -static struct uprobe_task *alloc_utask(void) + -{ + - struct uprobe_task *utask; + - + - utask = kzalloc(sizeof(*utask), GFP_KERNEL); + - if (!utask) + - return NULL; + - + - timer_setup(&utask->ri_timer, ri_timer, 0); + - + - return utask; + -} + - + /* + * Allocate a uprobe_task object for the task if necessary. + * Called when the thread hits a breakpoint. +* Unmerged path kernel/events/uprobes.c From 0384e7e2ffac2546e998b07f401a124b31be00b5 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:23 -0500 Subject: [PATCH 23/39] RDMA/core: Fix "KASAN: slab-use-after-free Read in ib_register_device" problem jira KERNEL-572 cve CVE-2025-38022 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Zhu Yanjun commit d0706bfd3ee40923c001c6827b786a309e2a8713 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xc3/0x670 mm/kasan/report.c:521 kasan_report+0xe0/0x110 mm/kasan/report.c:634 strlen+0x93/0xa0 lib/string.c:420 __fortify_strlen include/linux/fortify-string.h:268 [inline] get_kobj_path_length lib/kobject.c:118 [inline] kobject_get_path+0x3f/0x2a0 lib/kobject.c:158 kobject_uevent_env+0x289/0x1870 lib/kobject_uevent.c:545 ib_register_device drivers/infiniband/core/device.c:1472 [inline] ib_register_device+0x8cf/0xe00 drivers/infiniband/core/device.c:1393 rxe_register_device+0x275/0x320 drivers/infiniband/sw/rxe/rxe_verbs.c:1552 rxe_net_add+0x8e/0xe0 drivers/infiniband/sw/rxe/rxe_net.c:550 rxe_newlink+0x70/0x190 drivers/infiniband/sw/rxe/rxe.c:225 nldev_newlink+0x3a3/0x680 drivers/infiniband/core/nldev.c:1796 rdma_nl_rcv_msg+0x387/0x6e0 drivers/infiniband/core/netlink.c:195 rdma_nl_rcv_skb.constprop.0.isra.0+0x2e5/0x450 netlink_unicast_kernel net/netlink/af_netlink.c:1313 
[inline] netlink_unicast+0x53a/0x7f0 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg net/socket.c:727 [inline] ____sys_sendmsg+0xa95/0xc70 net/socket.c:2566 ___sys_sendmsg+0x134/0x1d0 net/socket.c:2620 __sys_sendmsg+0x16d/0x220 net/socket.c:2652 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x260 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f This problem is similar to the problem that the commit 1d6a9e7449e2 ("RDMA/core: Fix use-after-free when rename device name") fixes. The root cause is: the function ib_device_rename() renames the name with lock. But in the function kobject_uevent(), this name is accessed without lock protection at the same time. The solution is to add the lock protection when this name is accessed in the function kobject_uevent(). Fixes: 779e0bf47632 ("RDMA/core: Do not indicate device ready when device enablement fails") Link: https://patch.msgid.link/r/20250506151008.75701-1-yanjun.zhu@linux.dev Reported-by: syzbot+e2ce9e275ecc70a30b72@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=e2ce9e275ecc70a30b72 Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe (cherry picked from commit d0706bfd3ee40923c001c6827b786a309e2a8713) Signed-off-by: Jonathan Maple --- drivers/infiniband/core/device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b4e3e4beb7f45..d4263385850a7 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1352,6 +1352,9 @@ static void ib_device_notify_register(struct ib_device *device) down_read(&devices_rwsem); + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; @@ -1468,10 +1471,9 @@ int 
ib_register_device(struct ib_device *device, const char *name, return ret; } dev_set_uevent_suppress(&device->dev, false); - /* Mark for userspace that device is ready */ - kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_notify_register(device); + ib_device_put(device); return 0; From fb7561d02e65bb2fc145cbbc5de76341c6653efc Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:23 -0500 Subject: [PATCH 24/39] x86/boot/compressed: Remove unused header includes from kaslr.c jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Borislav Petkov (AMD) commit 5daececd4ff533ab316ab360aba0bda1bf01961d Nothing is using the linux/ namespace headers anymore. Remove them. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241130122644.GAZ0sEhD3Bm_9ZAIuc@fat_crate.local (cherry picked from commit 5daececd4ff533ab316ab360aba0bda1bf01961d) Signed-off-by: Jonathan Maple --- arch/x86/boot/compressed/kaslr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f4d82379bf44f..f03d59ea6e40f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -25,10 +25,6 @@ #include "efi.h" #include -#include -#include -#include -#include #include #include From 8a7313e8b8fe486d882172c8ac4fb75d4223335b Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:23 -0500 Subject: [PATCH 25/39] x86/kaslr: Reduce KASLR entropy on most x86 systems jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Balbir Singh commit 7ffb791423c7c518269a9aad35039ef824a40adb When CONFIG_PCI_P2PDMA=y (which is basically enabled on all large x86 distros), it maps the PFN's via a ZONE_DEVICE mapping using devm_memremap_pages(). The mapped virtual address range corresponds to the pci_resource_start() of the BAR address and size corresponding to the BAR length. 
When KASLR is enabled, the direct map range of the kernel is reduced to the size of physical memory plus additional padding. If the BAR address is beyond this limit, PCI peer to peer DMA mappings fail. Fix this by not shrinking the size of the direct map when CONFIG_PCI_P2PDMA=y. This reduces the total available entropy, but it's better than the current work around of having to disable KASLR completely. [ mingo: Clarified the changelog to point out the broad impact ... ] Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Acked-by: Bjorn Helgaas # drivers/pci/Kconfig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Andy Lutomirski Link: https://lore.kernel.org/lkml/20250206023201.1481957-1-balbirs@nvidia.com/ Link: https://lore.kernel.org/r/20250206234234.1912585-1-balbirs@nvidia.com -- arch/x86/mm/kaslr.c | 10 ++++++++-- drivers/pci/Kconfig | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (cherry picked from commit 7ffb791423c7c518269a9aad35039ef824a40adb) Signed-off-by: Jonathan Maple --- arch/x86/mm/kaslr.c | 10 ++++++++-- drivers/pci/Kconfig | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 11a93542d1983..3c306de52fd4d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -113,8 +113,14 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt physical memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + /* + * Adapt physical memory region size based on available memory, + * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the + * device BAR space assuming the direct map space is large enough + * for creating a ZONE_DEVICE mapping in the direct map corresponding + * to the physical BAR address. 
+ */ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb)) kaslr_regions[0].size_tb = memory_tb; /* diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2fbd379923fd1..5c3054aaec8c1 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -203,6 +203,12 @@ config PCI_P2PDMA P2P DMA transactions must be between devices behind the same root port. + Enabling this option will reduce the entropy of x86 KASLR memory + regions. For example - on a 46 bit system, the entropy goes down + from 16 bits to 15 bits. The actual reduction in entropy depends + on the physical address bits, on processor features, kernel config + (5 level page table) and physical memory present on the system. + If unsure, say N. config PCI_LABEL From d94d9b9221ea6165f4af84c07a92479a4a46d5a8 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:24 -0500 Subject: [PATCH 26/39] x86/mm/init: Handle the special case of device private pages in add_pages(), to not increase max_pfn and trigger dma_addressing_limited() bounce buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Balbir Singh commit 7170130e4c72ce0caa0cb42a1627c635cc262821 As Bert Karwatzki reported, the following recent commit causes a performance regression on AMD iGPU and dGPU systems: 7ffb791423c7 ("x86/kaslr: Reduce KASLR entropy on most x86 systems") It exposed a bug with nokaslr and zone device interaction. The root cause of the bug is that, the GPU driver registers a zone device private memory region. When KASLR is disabled or the above commit is applied, the direct_map_physmem_end is set to much higher than 10 TiB typically to the 64TiB address. When zone device private memory is added to the system via add_pages(), it bumps up the max_pfn to the same value. 
This causes dma_addressing_limited() to return true, since the device cannot address memory all the way up to max_pfn. This caused a regression for games played on the iGPU, as it resulted in the DMA32 zone being used for GPU allocations. Fix this by not bumping up max_pfn on x86 systems, when pgmap is passed into add_pages(). The presence of pgmap is used to determine if device private memory is being added via add_pages(). More details: devm_request_mem_region() and request_free_mem_region() request for device private memory. iomem_resource is passed as the base resource with start and end parameters. iomem_resource's end depends on several factors, including the platform and virtualization. On x86 for example on bare metal, this value is set to boot_cpu_data.x86_phys_bits. boot_cpu_data.x86_phys_bits can change depending on support for MKTME. By default it is set to the same as log2(direct_map_physmem_end) which is 46 to 52 bits depending on the number of levels in the page table. The allocation routines used iomem_resource's end and direct_map_physmem_end to figure out where to allocate the region. [ arch/powerpc is also impacted by this problem, but this patch does not fix the issue for PowerPC. ] Testing: 1. Tested on a virtual machine with test_hmm for zone device inseration 2. A previous version of this patch was tested by Bert, please see: https://lore.kernel.org/lkml/d87680bab997fdc9fb4e638983132af235d9a03a.camel@web.de/ [ mingo: Clarified the comments and the changelog. ] Reported-by: Bert Karwatzki Tested-by: Bert Karwatzki Fixes: 7ffb791423c7 ("x86/kaslr: Reduce KASLR entropy on most x86 systems") Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar Cc: Brian Gerst Cc: Juergen Gross Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Andrew Morton Cc: Christoph Hellwig Cc: Pierre-Eric Pelloux-Prayer Cc: Alex Deucher Cc: Christian König Cc: David Airlie Cc: Simona Vetter Link: https://lore.kernel.org/r/20250401000752.249348-1-balbirs@nvidia.com (cherry picked from commit 7170130e4c72ce0caa0cb42a1627c635cc262821) Signed-off-by: Jonathan Maple --- arch/x86/mm/init_64.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 01ea7c6df3036..17c89dad4f7ff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -967,9 +967,18 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); - /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start_pfn << PAGE_SHIFT, - nr_pages << PAGE_SHIFT); + /* + * Special case: add_pages() is called by memremap_pages() for adding device + * private pages. Do not bump up max_pfn in the device private path, + * because max_pfn changes affect dma_addressing_limited(). + * + * dma_addressing_limited() returning true when max_pfn is the device's + * addressable memory can force device drivers to use bounce buffers + * and impact their performance negatively: + */ + if (!params->pgmap) + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); return ret; } From 6365fdf2f1fbb5c079ab655ce687a2f72db51867 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:24 -0500 Subject: [PATCH 27/39] s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Heiko Carstens commit 64e2f60f355e556337fcffe80b9bcff1b22c9c42 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. 
Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/64e2f60f.failed As reported by Luiz Capitulino enabling HVO on s390 leads to reproducible crashes. The problem is that kernel page tables are modified without flushing corresponding TLB entries. Even if it looks like the empty flush_tlb_all() implementation on s390 is the problem, it is actually a different problem: on s390 it is not allowed to replace an active/valid page table entry with another valid page table entry without the detour over an invalid entry. A direct replacement may lead to random crashes and/or data corruption. In order to invalidate an entry special instructions have to be used (e.g. ipte or idte). Alternatively there are also special instructions available which allow to replace a valid entry with a different valid entry (e.g. crdte or cspg). Given that the HVO code currently does not provide the hooks to allow for an implementation which is compliant with the s390 architecture requirements, disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP again, which is basically a revert of the original patch which enabled it. 
Reported-by: Luiz Capitulino Closes: https://lore.kernel.org/all/20251028153930.37107-1-luizcap@redhat.com/ Fixes: 00a34d5a99c0 ("s390: select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Cc: stable@vger.kernel.org Tested-by: Luiz Capitulino Reviewed-by: Gerald Schaefer Reviewed-by: David Hildenbrand Signed-off-by: Heiko Carstens (cherry picked from commit 64e2f60f355e556337fcffe80b9bcff1b22c9c42) Signed-off-by: Jonathan Maple # Conflicts: # arch/s390/Kconfig --- .../64e2f60f.failed | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/64e2f60f.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/64e2f60f.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/64e2f60f.failed new file mode 100644 index 0000000000000..27929c39bbc39 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/64e2f60f.failed @@ -0,0 +1,63 @@ +s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Heiko Carstens +commit 64e2f60f355e556337fcffe80b9bcff1b22c9c42 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/64e2f60f.failed + +As reported by Luiz Capitulino enabling HVO on s390 leads to reproducible +crashes. The problem is that kernel page tables are modified without +flushing corresponding TLB entries. + +Even if it looks like the empty flush_tlb_all() implementation on s390 is +the problem, it is actually a different problem: on s390 it is not allowed +to replace an active/valid page table entry with another valid page table +entry without the detour over an invalid entry. A direct replacement may +lead to random crashes and/or data corruption. + +In order to invalidate an entry special instructions have to be used +(e.g. ipte or idte). 
Alternatively there are also special instructions +available which allow to replace a valid entry with a different valid +entry (e.g. crdte or cspg). + +Given that the HVO code currently does not provide the hooks to allow for +an implementation which is compliant with the s390 architecture +requirements, disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP again, which is +basically a revert of the original patch which enabled it. + + Reported-by: Luiz Capitulino +Closes: https://lore.kernel.org/all/20251028153930.37107-1-luizcap@redhat.com/ +Fixes: 00a34d5a99c0 ("s390: select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP") + Cc: stable@vger.kernel.org + Tested-by: Luiz Capitulino + Reviewed-by: Gerald Schaefer + Reviewed-by: David Hildenbrand + Signed-off-by: Heiko Carstens +(cherry picked from commit 64e2f60f355e556337fcffe80b9bcff1b22c9c42) + Signed-off-by: Jonathan Maple + +# Conflicts: +# arch/s390/Kconfig +diff --cc arch/s390/Kconfig +index 990c93235da3,df22b10d9141..000000000000 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@@ -140,9 -155,10 +140,13 @@@ config S39 + select ARCH_WANTS_NO_INSTR + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_IPC_PARSE_VERSION + - select ARCH_WANT_IRQS_OFF_ACTIVATE_MM + select ARCH_WANT_KERNEL_PMD_MKWRITE + select ARCH_WANT_LD_ORPHAN_WARN +++<<<<<<< HEAD + + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP +++======= ++ select ARCH_WANTS_THP_SWAP +++>>>>>>> 64e2f60f355e (s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP) + select BUILDTIME_TABLE_SORT + select CLONE_BACKWARDS2 + select DCACHE_WORD_ACCESS if !KMSAN +* Unmerged path arch/s390/Kconfig From 7464dea0cd0adfb723cbf0c0a1610c42fc8428d6 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:25 -0500 Subject: [PATCH 28/39] xfs: rearrange code in xfs_inode_item_precommit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Dave Chinner commit bc7d684fea18cc48c3630d2b7f1789000ff2df5b Empty-Commit: Cherry-Pick Conflicts during 
history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/bc7d684f.failed There are similar extsize checks and updates done inside and outside the inode item lock, which could all be done under a single top level logic branch outside the ili_lock. The COW extsize fixup can potentially miss updating the XFS_ILOG_CORE in ili_fsync_fields, so moving this code up above the ili_fsync_fields update could also be considered a fix. Further, to make the next change a bit cleaner, move where we calculate the on-disk flag mask to after we attach the cluster buffer to the the inode log item. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino (cherry picked from commit bc7d684fea18cc48c3630d2b7f1789000ff2df5b) Signed-off-by: Jonathan Maple # Conflicts: # fs/xfs/xfs_inode_item.c --- .../bc7d684f.failed | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/bc7d684f.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/bc7d684f.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/bc7d684f.failed new file mode 100644 index 0000000000000..20f762a64afa4 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/bc7d684f.failed @@ -0,0 +1,89 @@ +xfs: rearrange code in xfs_inode_item_precommit + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Dave Chinner +commit bc7d684fea18cc48c3630d2b7f1789000ff2df5b +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/bc7d684f.failed + +There are similar extsize checks and updates done inside and outside +the inode item lock, which could all be done under a single top +level logic branch outside the ili_lock. 
The COW extsize fixup can +potentially miss updating the XFS_ILOG_CORE in ili_fsync_fields, so +moving this code up above the ili_fsync_fields update could also be +considered a fix. + +Further, to make the next change a bit cleaner, move where we +calculate the on-disk flag mask to after we attach the cluster +buffer to the the inode log item. + + Signed-off-by: Dave Chinner + Reviewed-by: Christoph Hellwig + Reviewed-by: Darrick J. Wong + Signed-off-by: Carlos Maiolino +(cherry picked from commit bc7d684fea18cc48c3630d2b7f1789000ff2df5b) + Signed-off-by: Jonathan Maple + +# Conflicts: +# fs/xfs/xfs_inode_item.c +diff --cc fs/xfs/xfs_inode_item.c +index b509cbd191f4,678ca95793e0..000000000000 +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@@ -131,32 -131,28 +131,35 @@@ xfs_inode_item_precommit + } + + /* +- * Inode verifiers do not check that the extent size hint is an integer +- * multiple of the rt extent size on a directory with both rtinherit +- * and extszinherit flags set. If we're logging a directory that is +- * misconfigured in this way, clear the hint. ++ * Inode verifiers do not check that the extent size hints are an ++ * integer multiple of the rt extent size on a directory with ++ * rtinherit flags set. If we're logging a directory that is ++ * misconfigured in this way, clear the bad hints. 
+ */ +- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && +- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && +- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { +- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | +- XFS_DIFLAG_EXTSZINHERIT); +- ip->i_extsize = 0; +- flags |= XFS_ILOG_CORE; ++ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { ++ if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && ++ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { ++ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | ++ XFS_DIFLAG_EXTSZINHERIT); ++ ip->i_extsize = 0; ++ flags |= XFS_ILOG_CORE; ++ } ++ if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && ++ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { ++ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; ++ ip->i_cowextsize = 0; ++ flags |= XFS_ILOG_CORE; ++ } + } + +- /* +- * Record the specific change for fdatasync optimisation. This allows +- * fdatasync to skip log forces for inodes that are only timestamp +- * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it +- * to XFS_ILOG_CORE so that the actual on-disk dirty tracking +- * (ili_fields) correctly tracks that the version has changed. +- */ + spin_lock(&iip->ili_lock); +++<<<<<<< HEAD + + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + + if (flags & XFS_ILOG_IVERSION) + + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + +++======= +++>>>>>>> bc7d684fea18 (xfs: rearrange code in xfs_inode_item_precommit) + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; +* Unmerged path fs/xfs/xfs_inode_item.c From e5d84d2b9f719b206ae293d7bb73d6c318a7506b Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:25 -0500 Subject: [PATCH 29/39] xfs: rework datasync tracking and execution jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Dave Chinner commit c91d38b57f2c4784d885c874b2a1234a01361afd Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. 
Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/c91d38b5.failed Jan Kara reported that the shared ILOCK held across the journal flush during fdatasync operations slows down O_DSYNC DIO on unwritten extents significantly. The underlying issue is that unwritten extent conversion needs the ILOCK exclusive, whilst the datasync operation after the extent conversion holds it shared. Hence we cannot be flushing the journal for one IO completion whilst at the same time doing unwritten extent conversion on another IO completion on the same inode. This means that IO completions lock-step, and IO performance is dependent on the journal flush latency. Jan demonstrated that reducing the ifdatasync lock hold time can improve O_DSYNC DIO to unwritten extents performance by 2.5x. Discussion on that patch found issues with the method, and we came to the conclusion that separately tracking datasync flush sequences was the best approach to solving the problem. The fsync code uses the ILOCK to serialise against concurrent modifications in the transaction commit phase. In a transaction commit, there are several disjoint updates to inode log item state that need to be considered atomically by the fsync code. These operations are all done under ILOCK_EXCL context: 1. ili_fsync_flags is updated in ->iop_precommit 2. i_pincount is updated in ->iop_pin before it is added to the CIL 3. ili_commit_seq is updated in ->iop_committing, after it has been added to the CIL In fsync, we need to: 1. check that the inode is dirty in the journal (ipincount) 2. check that ili_fsync_flags is set 3. grab the ili_commit_seq if a journal flush is needed 4. clear the ili_fsync_flags to ensure that new modifications that require fsync are tracked in ->iop_precommit correctly The serialisation of ipincount/ili_commit_seq is needed to ensure that we don't try to unnecessarily flush the journal. 
The serialisation of ili_fsync_flags being set in ->iop_precommit and cleared in fsync post journal flush is required for correctness. Hence holding the ILOCK_SHARED in xfs_file_fsync() performs all this serialisation for us. Ideally, we want to remove the need to hold the ILOCK_SHARED in xfs_file_fsync() for best performance. We start with the observation that fsync/fdatasync() only need to wait for operations that have been completed. Hence operations that are still being committed have not completed and datasync operations do not need to wait for them. This means we can use a single point in time in the commit process to signal "this modification is complete". This is what ->iop_committing is supposed to provide - it is the point at which the object is unlocked after the modification has been recorded in the CIL. Hence we could use ili_commit_seq to determine if we should flush the journal. In theory, we can already do this. However, in practice this will expose an internal global CIL lock to the IO path. The ipincount() checks optimise away the need to take this lock - if the inode is not pinned, then it is not in the CIL and we don't need to check if a journal flush at ili_commit_seq needs to be performed. The reason this is needed is that the ili_commit_seq is never cleared. Once it is set, it remains set even once the journal has been committed and the object has been unpinned. Hence we have to look that journal internal commit sequence state to determine if ili_commit_seq needs to be acted on or not. We can solve this by clearing ili_commit_seq when the inode is unpinned. If we clear it atomically with the last unpin going away, then we are guaranteed that new modifications will order correctly as they add a new pin counts and we won't clear a sequence number for an active modification in the CIL. 
Further, we can then allow the per-transaction flag state to propagate into ->iop_committing (instead of clearing it in ->iop_precommit) and that will allow us to determine if the modification needs a full fsync or just a datasync, and so we can record a separate datasync sequence number (Jan's idea!) and then use that in the fdatasync path instead of the full fsync sequence number. With this infrastructure in place, we no longer need the ILOCK_SHARED in the fsync path. All serialisation is done against the commit sequence numbers - if the sequence number is set, then we have to flush the journal. If it is not set, then we have nothing to do. This greatly simplifies the fsync implementation.... Signed-off-by: Dave Chinner Tested-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino (cherry picked from commit c91d38b57f2c4784d885c874b2a1234a01361afd) Signed-off-by: Jonathan Maple # Conflicts: # fs/xfs/xfs_inode_item.c --- .../c91d38b5.failed | 356 ++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/c91d38b5.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/c91d38b5.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/c91d38b5.failed new file mode 100644 index 0000000000000..0ee1fe85acec5 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/c91d38b5.failed @@ -0,0 +1,356 @@ +xfs: rework datasync tracking and execution + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Dave Chinner +commit c91d38b57f2c4784d885c874b2a1234a01361afd +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/c91d38b5.failed + +Jan Kara reported that the shared ILOCK held across the journal +flush during fdatasync operations slows down O_DSYNC DIO on +unwritten extents significantly. 
The underlying issue is that +unwritten extent conversion needs the ILOCK exclusive, whilst the +datasync operation after the extent conversion holds it shared. + +Hence we cannot be flushing the journal for one IO completion whilst +at the same time doing unwritten extent conversion on another IO +completion on the same inode. This means that IO completions +lock-step, and IO performance is dependent on the journal flush +latency. + +Jan demonstrated that reducing the ifdatasync lock hold time can +improve O_DSYNC DIO to unwritten extents performance by 2.5x. +Discussion on that patch found issues with the method, and we +came to the conclusion that separately tracking datasync flush +sequences was the best approach to solving the problem. + +The fsync code uses the ILOCK to serialise against concurrent +modifications in the transaction commit phase. In a transaction +commit, there are several disjoint updates to inode log item state +that need to be considered atomically by the fsync code. These +operations are all done under ILOCK_EXCL context: + +1. ili_fsync_flags is updated in ->iop_precommit +2. i_pincount is updated in ->iop_pin before it is added to the CIL +3. ili_commit_seq is updated in ->iop_committing, after it has been + added to the CIL + +In fsync, we need to: + +1. check that the inode is dirty in the journal (ipincount) +2. check that ili_fsync_flags is set +3. grab the ili_commit_seq if a journal flush is needed +4. clear the ili_fsync_flags to ensure that new modifications that +require fsync are tracked in ->iop_precommit correctly + +The serialisation of ipincount/ili_commit_seq is needed +to ensure that we don't try to unnecessarily flush the journal. + +The serialisation of ili_fsync_flags being set in +->iop_precommit and cleared in fsync post journal flush is +required for correctness. + +Hence holding the ILOCK_SHARED in xfs_file_fsync() performs all this +serialisation for us. 
Ideally, we want to remove the need to hold +the ILOCK_SHARED in xfs_file_fsync() for best performance. + +We start with the observation that fsync/fdatasync() only need to +wait for operations that have been completed. Hence operations that +are still being committed have not completed and datasync operations +do not need to wait for them. + +This means we can use a single point in time in the commit process +to signal "this modification is complete". This is what +->iop_committing is supposed to provide - it is the point at which +the object is unlocked after the modification has been recorded in +the CIL. Hence we could use ili_commit_seq to determine if we should +flush the journal. + +In theory, we can already do this. However, in practice this will +expose an internal global CIL lock to the IO path. The ipincount() +checks optimise away the need to take this lock - if the inode is +not pinned, then it is not in the CIL and we don't need to check if +a journal flush at ili_commit_seq needs to be performed. + +The reason this is needed is that the ili_commit_seq is never +cleared. Once it is set, it remains set even once the journal has +been committed and the object has been unpinned. Hence we have to +look that journal internal commit sequence state to determine if +ili_commit_seq needs to be acted on or not. + +We can solve this by clearing ili_commit_seq when the inode is +unpinned. If we clear it atomically with the last unpin going away, +then we are guaranteed that new modifications will order correctly +as they add a new pin counts and we won't clear a sequence number +for an active modification in the CIL. + +Further, we can then allow the per-transaction flag state to +propagate into ->iop_committing (instead of clearing it in +->iop_precommit) and that will allow us to determine if the +modification needs a full fsync or just a datasync, and so we can +record a separate datasync sequence number (Jan's idea!) 
and then +use that in the fdatasync path instead of the full fsync sequence +number. + +With this infrastructure in place, we no longer need the +ILOCK_SHARED in the fsync path. All serialisation is done against +the commit sequence numbers - if the sequence number is set, then we +have to flush the journal. If it is not set, then we have nothing to +do. This greatly simplifies the fsync implementation.... + + Signed-off-by: Dave Chinner + Tested-by: Jan Kara + Reviewed-by: Christoph Hellwig + Signed-off-by: Carlos Maiolino +(cherry picked from commit c91d38b57f2c4784d885c874b2a1234a01361afd) + Signed-off-by: Jonathan Maple + +# Conflicts: +# fs/xfs/xfs_inode_item.c +diff --cc fs/xfs/xfs_inode_item.c +index b509cbd191f4,1bd411a1114c..000000000000 +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@@ -191,6 -187,20 +191,23 @@@ xfs_inode_item_precommit + } + + /* +++<<<<<<< HEAD +++======= ++ * Store the dirty flags back into the inode item as this state is used ++ * later on in xfs_inode_item_committing() to determine whether the ++ * transaction is relevant to fsync state or not. ++ */ ++ iip->ili_dirty_flags = flags; ++ ++ /* ++ * Convert the flags on-disk fields that have been modified in the ++ * transaction so that ili_fields tracks the changes correctly. ++ */ ++ if (flags & XFS_ILOG_IVERSION) ++ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); ++ ++ /* +++>>>>>>> c91d38b57f2c (xfs: rework datasync tracking and execution) + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. 
See the big comment +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index b19916b11fd5..220dc674aa02 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -73,52 +73,47 @@ xfs_dir_fsync( + return xfs_log_force_inode(ip); + } + +-static xfs_csn_t +-xfs_fsync_seq( +- struct xfs_inode *ip, +- bool datasync) +-{ +- if (!xfs_ipincount(ip)) +- return 0; +- if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) +- return 0; +- return ip->i_itemp->ili_commit_seq; +-} +- + /* +- * All metadata updates are logged, which means that we just have to flush the +- * log up to the latest LSN that touched the inode. ++ * All metadata updates are logged, which means that we just have to push the ++ * journal to the required sequence number than holds the updates. We track ++ * datasync commits separately to full sync commits, and hence only need to ++ * select the correct sequence number for the log force here. + * +- * If we have concurrent fsync/fdatasync() calls, we need them to all block on +- * the log force before we clear the ili_fsync_fields field. This ensures that +- * we don't get a racing sync operation that does not wait for the metadata to +- * hit the journal before returning. If we race with clearing ili_fsync_fields, +- * then all that will happen is the log force will do nothing as the lsn will +- * already be on disk. We can't race with setting ili_fsync_fields because that +- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock +- * shared until after the ili_fsync_fields is cleared. ++ * We don't have to serialise against concurrent modifications, as we do not ++ * have to wait for modifications that have not yet completed. We define a ++ * transaction commit as completing when the commit sequence number is updated, ++ * hence if the sequence number has not updated, the sync operation has been ++ * run before the commit completed and we don't have to wait for it. 
++ * ++ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain ++ * set on the log item until - at least - the journal flush completes. In ++ * reality, they are only cleared when the inode is fully unpinned (i.e. ++ * persistent in the journal and not dirty in the CIL), and so we rely on ++ * xfs_log_force_seq() either skipping sequences that have been persisted or ++ * waiting on sequences that are still in flight to correctly order concurrent ++ * sync operations. + */ +-static int ++static int + xfs_fsync_flush_log( + struct xfs_inode *ip, + bool datasync, + int *log_flushed) + { +- int error = 0; +- xfs_csn_t seq; ++ struct xfs_inode_log_item *iip = ip->i_itemp; ++ xfs_csn_t seq = 0; + +- xfs_ilock(ip, XFS_ILOCK_SHARED); +- seq = xfs_fsync_seq(ip, datasync); +- if (seq) { +- error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, +- log_flushed); ++ spin_lock(&iip->ili_lock); ++ if (datasync) ++ seq = iip->ili_datasync_seq; ++ else ++ seq = iip->ili_commit_seq; ++ spin_unlock(&iip->ili_lock); + +- spin_lock(&ip->i_itemp->ili_lock); +- ip->i_itemp->ili_fsync_fields = 0; +- spin_unlock(&ip->i_itemp->ili_lock); +- } +- xfs_iunlock(ip, XFS_ILOCK_SHARED); +- return error; ++ if (!seq) ++ return 0; ++ ++ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, ++ log_flushed); + } + + STATIC int +@@ -156,12 +151,10 @@ xfs_file_fsync( + error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + + /* +- * Any inode that has dirty modifications in the log is pinned. The +- * racy check here for a pinned inode will not catch modifications +- * that happen concurrently to the fsync call, but fsync semantics +- * only require to sync previously completed I/O. ++ * If the inode has a inode log item attached, it may need the journal ++ * flushed to persist any changes the log item might be tracking. 
+ */ +- if (xfs_ipincount(ip)) { ++ if (ip->i_itemp) { + err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); + if (err2 && !error) + error = err2; +diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c +index 19dcb569a3e7..b84684577b0f 100644 +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1642,7 +1642,6 @@ xfs_ifree_mark_inode_stale( + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; +- iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + ASSERT(iip->ili_last_fields); + +@@ -1808,12 +1807,20 @@ static void + xfs_iunpin( + struct xfs_inode *ip) + { +- xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ++ struct xfs_inode_log_item *iip = ip->i_itemp; ++ xfs_csn_t seq = 0; + + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); ++ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ++ ++ spin_lock(&iip->ili_lock); ++ seq = iip->ili_commit_seq; ++ spin_unlock(&iip->ili_lock); ++ if (!seq) ++ return; + + /* Give the log a push to start the unpinning I/O */ +- xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); ++ xfs_log_force_seq(ip->i_mount, seq, 0, NULL); + + } + +@@ -2472,7 +2479,6 @@ xfs_iflush( + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; +- iip->ili_fsync_fields = 0; + set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); + spin_unlock(&iip->ili_lock); + +@@ -2631,12 +2637,15 @@ int + xfs_log_force_inode( + struct xfs_inode *ip) + { ++ struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; + +- xfs_ilock(ip, XFS_ILOCK_SHARED); +- if (xfs_ipincount(ip)) +- seq = ip->i_itemp->ili_commit_seq; +- xfs_iunlock(ip, XFS_ILOCK_SHARED); ++ if (!iip) ++ return 0; ++ ++ spin_lock(&iip->ili_lock); ++ seq = iip->ili_commit_seq; ++ spin_unlock(&iip->ili_lock); + + if (!seq) + return 0; +* Unmerged path fs/xfs/xfs_inode_item.c +diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h +index 377e06007804..7a6d179aad4c 100644 +--- 
a/fs/xfs/xfs_inode_item.h ++++ b/fs/xfs/xfs_inode_item.h +@@ -32,9 +32,17 @@ struct xfs_inode_log_item { + spinlock_t ili_lock; /* flush state lock */ + unsigned int ili_last_fields; /* fields when flushed */ + unsigned int ili_fields; /* fields to be logged */ +- unsigned int ili_fsync_fields; /* logged since last fsync */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ ++ ++ /* ++ * We record the sequence number for every inode modification, as ++ * well as those that only require fdatasync operations for data ++ * integrity. This allows optimisation of the O_DSYNC/fdatasync path ++ * without needing to track what modifications the journal is currently ++ * carrying for the inode. These are protected by the above ili_lock. ++ */ + xfs_csn_t ili_commit_seq; /* last transaction commit */ ++ xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ + }; + + static inline int xfs_inode_clean(struct xfs_inode *ip) +diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c +index 86da16f54be9..b9ffe609376a 100644 +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -133,9 +133,18 @@ xfs_bmbt_to_iomap( + iomap->bdev = target->bt_bdev; + iomap->flags = iomap_flags; + +- if (xfs_ipincount(ip) && +- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) +- iomap->flags |= IOMAP_F_DIRTY; ++ /* ++ * If the inode is dirty for datasync purposes, let iomap know so it ++ * doesn't elide the IO completion journal flushes on O_DSYNC IO. 
++ */ ++ if (ip->i_itemp) { ++ struct xfs_inode_log_item *iip = ip->i_itemp; ++ ++ spin_lock(&iip->ili_lock); ++ if (iip->ili_datasync_seq) ++ iomap->flags |= IOMAP_F_DIRTY; ++ spin_unlock(&iip->ili_lock); ++ } + + iomap->validity_cookie = sequence_cookie; + iomap->folio_ops = &xfs_iomap_folio_ops; From fd2c5f3e6eba7297e56b259725314154afffb7ec Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:25 -0500 Subject: [PATCH 30/39] scsi: st: Don't modify unknown block number in MTIOCGET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Kai Mäkisara commit 5bb2d6179d1a8039236237e1e94cfbda3be1ed9e Struct mtget field mt_blkno -1 means it is unknown. Don't add anything to it. Signed-off-by: Kai Mäkisara Link: https://bugzilla.kernel.org/show_bug.cgi?id=219419#c14 Link: https://lore.kernel.org/r/20241106095723.63254-2-Kai.Makisara@kolumbus.fi Reviewed-by: John Meneghini Tested-by: John Meneghini Signed-off-by: Martin K. 
Petersen (cherry picked from commit 5bb2d6179d1a8039236237e1e94cfbda3be1ed9e) Signed-off-by: Jonathan Maple --- drivers/scsi/st.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 49386e8e141b1..7bcb9f83d4f6e 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3792,7 +3792,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) ((STp->density << MT_ST_DENSITY_SHIFT) & MT_ST_DENSITY_MASK); mt_status.mt_blkno = STps->drv_block; mt_status.mt_fileno = STps->drv_file; - if (STp->block_size != 0) { + if (STp->block_size != 0 && mt_status.mt_blkno >= 0) { if (STps->rw == ST_WRITING) mt_status.mt_blkno += (STp->buffer)->buffer_bytes / STp->block_size; From f766dee539fd5ae3a7457085340cc86cc4ea06d1 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:26 -0500 Subject: [PATCH 31/39] scsi: st: Add MTIOCGET and MTLOAD to ioctls allowed after device reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Kai Mäkisara commit 0b120edb37dc9dd8ca82893d386922eb6b16f860 Most drives rewind the tape when the device is reset. Reading and writing are not allowed until something is done to make the tape position match the user's expectation (e.g., rewind the tape). Add MTIOCGET and MTLOAD to operations allowed after reset. MTIOCGET is modified to not touch the tape if pos_unknown is non-zero. The tape location is known after MTLOAD. Signed-off-by: Kai Mäkisara Link: https://bugzilla.kernel.org/show_bug.cgi?id=219419#c14 Link: https://lore.kernel.org/r/20241106095723.63254-3-Kai.Makisara@kolumbus.fi Reviewed-by: John Meneghini Tested-by: John Meneghini Signed-off-by: Martin K. 
Petersen (cherry picked from commit 0b120edb37dc9dd8ca82893d386922eb6b16f860) Signed-off-by: Jonathan Maple --- drivers/scsi/st.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 7bcb9f83d4f6e..fc9b686b3faa4 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3528,6 +3528,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) int i, cmd_nr, cmd_type, bt; int retval = 0; unsigned int blk; + bool cmd_mtiocget; struct scsi_tape *STp = file->private_data; struct st_modedef *STm; struct st_partstat *STps; @@ -3641,6 +3642,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) */ if (mtc.mt_op != MTREW && mtc.mt_op != MTOFFL && + mtc.mt_op != MTLOAD && mtc.mt_op != MTRETEN && mtc.mt_op != MTERASE && mtc.mt_op != MTSEEK && @@ -3768,17 +3770,28 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) goto out; } + cmd_mtiocget = cmd_type == _IOC_TYPE(MTIOCGET) && cmd_nr == _IOC_NR(MTIOCGET); + if ((i = flush_buffer(STp, 0)) < 0) { - retval = i; - goto out; - } - if (STp->can_partitions && - (i = switch_partition(STp)) < 0) { - retval = i; - goto out; + if (cmd_mtiocget && STp->pos_unknown) { + /* flush fails -> modify status accordingly */ + reset_state(STp); + STp->pos_unknown = 1; + } else { /* return error */ + retval = i; + goto out; + } + } else { /* flush_buffer succeeds */ + if (STp->can_partitions) { + i = switch_partition(STp); + if (i < 0) { + retval = i; + goto out; + } + } } - if (cmd_type == _IOC_TYPE(MTIOCGET) && cmd_nr == _IOC_NR(MTIOCGET)) { + if (cmd_mtiocget) { struct mtget mt_status; if (_IOC_SIZE(cmd_in) != sizeof(struct mtget)) { From 43125b64150ef2a31a596281ed3a9ad15d1ca743 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:26 -0500 Subject: [PATCH 32/39] scsi: st: New session only when Unit Attention for new tape MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Kai Mäkisara commit a4550b28c8c853e7241ecf30b4f1d9c6bc631fda Currently the code starts new tape session when any Unit Attention (UA) is seen when opening the device. This leads to incorrectly clearing pos_unknown when the UA is for reset. Set new session only when the UA is for a new tape. Signed-off-by: Kai Mäkisara Link: https://lore.kernel.org/r/20241106095723.63254-4-Kai.Makisara@kolumbus.fi Reviewed-by: John Meneghini Tested-by: John Meneghini Signed-off-by: Martin K. Petersen (cherry picked from commit a4550b28c8c853e7241ecf30b4f1d9c6bc631fda) Signed-off-by: Jonathan Maple --- drivers/scsi/st.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index fc9b686b3faa4..dc5e4c32289cf 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -1002,7 +1002,10 @@ static int test_ready(struct scsi_tape *STp, int do_wait) scode = cmdstatp->sense_hdr.sense_key; if (scode == UNIT_ATTENTION) { /* New media? */ - new_session = 1; + if (cmdstatp->sense_hdr.asc == 0x28) { /* New media */ + new_session = 1; + DEBC_printk(STp, "New tape session."); + } if (attentions < MAX_ATTENTIONS) { attentions++; continue; From 4b183bc0bd293f1cd1883e8fb0d4cf20b822160b Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:26 -0500 Subject: [PATCH 33/39] scsi: st: Don't set pos_unknown just after device recognition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Kai Mäkisara commit 98b37881b7492ae9048ad48260cc8a6ee9eb39fd Commit 9604eea5bd3a ("scsi: st: Add third party poweron reset handling") in v6.6 added new code to handle the Power On/Reset Unit Attention (POR UA) sense data. This was in addition to the existing method. 
When this Unit Attention is received, the driver blocks attempts to read, write and some other operations because the reset may have rewinded the tape. Because of the added code, also the initial POR UA resulted in blocking operations, including those that are used to set the driver options after the device is recognized. Also, reading and writing are refused, whereas they succeeded before this commit. Add code to not set pos_unknown to block operations if the POR UA is received from the first test_ready() call after the st device has been created. This restores the behavior before v6.6. Signed-off-by: Kai Mäkisara Link: https://lore.kernel.org/r/20241216113755.30415-1-Kai.Makisara@kolumbus.fi Fixes: 9604eea5bd3a ("scsi: st: Add third party poweron reset handling") CC: stable@vger.kernel.org Closes: https://lore.kernel.org/linux-scsi/2201CF73-4795-4D3B-9A79-6EE5215CF58D@kolumbus.fi/ Signed-off-by: Martin K. Petersen (cherry picked from commit 98b37881b7492ae9048ad48260cc8a6ee9eb39fd) Signed-off-by: Jonathan Maple --- drivers/scsi/st.c | 6 ++++++ drivers/scsi/st.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index dc5e4c32289cf..74a6830b7ed8e 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -1048,6 +1048,11 @@ static int test_ready(struct scsi_tape *STp, int do_wait) retval = new_session ? 
CHKRES_NEW_SESSION : CHKRES_READY; break; } + if (STp->first_tur) { + /* Don't set pos_unknown right after device recognition */ + STp->pos_unknown = 0; + STp->first_tur = 0; + } if (SRpnt != NULL) st_release_request(SRpnt); @@ -4364,6 +4369,7 @@ static int st_probe(struct device *dev) blk_queue_rq_timeout(tpnt->device->request_queue, ST_TIMEOUT); tpnt->long_timeout = ST_LONG_TIMEOUT; tpnt->try_dio = try_direct_io; + tpnt->first_tur = 1; for (i = 0; i < ST_NBR_MODES; i++) { STm = &(tpnt->modes[i]); diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h index 47b0e31b7828c..0d7c4b8c2c8a8 100644 --- a/drivers/scsi/st.h +++ b/drivers/scsi/st.h @@ -171,6 +171,7 @@ struct scsi_tape { unsigned char rew_at_close; /* rewind necessary at close */ unsigned char inited; unsigned char cleaning_req; /* cleaning requested? */ + unsigned char first_tur; /* first TEST UNIT READY */ int block_size; int changed_blksize; int min_block; From baa4384112b2285301fe49c0bc56762ad1034942 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:26 -0500 Subject: [PATCH 34/39] scsi: st: Separate st-unique ioctl handling from SCSI common ioctl handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author David Jeffery commit b37d70c0df85e217a868ecdf535500ff926427ae The st ioctl function currently interleaves code for handling various st specific ioctls with parts of code needed for handling ioctls common to all SCSI devices. Separate out st's code for the common ioctls into a more manageable, separate function. Signed-off-by: David Jeffery Tested-by: Laurence Oberman Acked-by: Kai Mäkisara Reviewed-by: John Meneghini Tested-by: John Meneghini Link: https://patch.msgid.link/20251104154709.6436-1-djeffery@redhat.com Signed-off-by: Martin K. 
Petersen (cherry picked from commit b37d70c0df85e217a868ecdf535500ff926427ae) Signed-off-by: Jonathan Maple --- drivers/scsi/st.c | 85 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 74a6830b7ed8e..505850f009ca5 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3526,8 +3526,60 @@ static int partition_tape(struct scsi_tape *STp, int size) out: return result; } - +/* + * Handles any extra state needed for ioctls which are not st-specific. + * Called with the scsi_tape lock held, released before return + */ +static long st_common_ioctl(struct scsi_tape *STp, struct st_modedef *STm, + struct file *file, unsigned int cmd_in, + unsigned long arg) +{ + int i, retval = 0; + + if (!STm->defined) { + retval = -ENXIO; + goto out; + } + + if ((i = flush_buffer(STp, 0)) < 0) { + retval = i; + goto out; + } else { /* flush_buffer succeeds */ + if (STp->can_partitions) { + i = switch_partition(STp); + if (i < 0) { + retval = i; + goto out; + } + } + } + mutex_unlock(&STp->lock); + + switch (cmd_in) { + case SG_IO: + case SCSI_IOCTL_SEND_COMMAND: + case CDROM_SEND_PACKET: + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + break; + default: + break; + } + + retval = scsi_ioctl(STp->device, file->f_mode & FMODE_WRITE, + cmd_in, (void __user *)arg); + if (!retval && cmd_in == SCSI_IOCTL_STOP_UNIT) { + /* unload */ + STp->rew_at_close = 0; + STp->ready = ST_NO_TAPE; + } + + return retval; +out: + mutex_unlock(&STp->lock); + return retval; +} /* The ioctl command */ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) @@ -3565,6 +3617,15 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) if (retval) goto out; + switch (cmd_in) { + case MTIOCPOS: + case MTIOCGET: + case MTIOCTOP: + break; + default: + return st_common_ioctl(STp, STm, file, cmd_in, arg); + } + cmd_type = _IOC_TYPE(cmd_in); cmd_nr = _IOC_NR(cmd_in); 
@@ -3876,29 +3937,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) } mt_pos.mt_blkno = blk; retval = put_user_mtpos(p, &mt_pos); - goto out; } - mutex_unlock(&STp->lock); - - switch (cmd_in) { - case SG_IO: - case SCSI_IOCTL_SEND_COMMAND: - case CDROM_SEND_PACKET: - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - break; - default: - break; - } - - retval = scsi_ioctl(STp->device, file->f_mode & FMODE_WRITE, cmd_in, p); - if (!retval && cmd_in == SCSI_IOCTL_STOP_UNIT) { - /* unload */ - STp->rew_at_close = 0; - STp->ready = ST_NO_TAPE; - } - return retval; - out: mutex_unlock(&STp->lock); return retval; From bf74aac97f8f78be381dc28d8f3ff32674e77098 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:27 -0500 Subject: [PATCH 35/39] scsi: st: Skip buffer flush for information ioctls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author David Jeffery commit d27418aaf8bcb21f3f9b54a57427a0ae4f025bf7 With commit 9604eea5bd3a ("scsi: st: Add third party poweron reset handling") some customer tape applications fail from being unable to complete ioctls to verify ID information for the device when there has been any type of reset event to their tape devices. The st driver currently will fail all standard SCSI ioctls if a call to flush_buffer() fails in st_ioctl(). This causes ioctls which otherwise have no effect on tape state to succeed or fail based on events unrelated to the requested ioctl. This makes SCSI information ioctls unreliable after a reset even if no buffering is in use. With a reset setting the pos_unknown field, flush_buffer() will report failure and fail all ioctls. So any application expecting to use ioctls to check the identify the device will be unable to do so in such a state. 
For SCSI information ioctls, avoid the need for a buffer flush and allow the ioctls to execute regardless of buffer state. Signed-off-by: David Jeffery Tested-by: Laurence Oberman Acked-by: Kai Mäkisara Reviewed-by: John Meneghini Tested-by: John Meneghini Link: https://patch.msgid.link/20251104154709.6436-2-djeffery@redhat.com Signed-off-by: Martin K. Petersen (cherry picked from commit d27418aaf8bcb21f3f9b54a57427a0ae4f025bf7) Signed-off-by: Jonathan Maple --- drivers/scsi/st.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 505850f009ca5..168f25e4aaa38 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3542,30 +3542,34 @@ static long st_common_ioctl(struct scsi_tape *STp, struct st_modedef *STm, goto out; } - if ((i = flush_buffer(STp, 0)) < 0) { - retval = i; - goto out; - } else { /* flush_buffer succeeds */ - if (STp->can_partitions) { - i = switch_partition(STp); - if (i < 0) { - retval = i; - goto out; - } - } - } - mutex_unlock(&STp->lock); - switch (cmd_in) { + case SCSI_IOCTL_GET_IDLUN: + case SCSI_IOCTL_GET_BUS_NUMBER: + case SCSI_IOCTL_GET_PCI: + break; case SG_IO: case SCSI_IOCTL_SEND_COMMAND: case CDROM_SEND_PACKET: - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - break; + if (!capable(CAP_SYS_RAWIO)) { + retval = -EPERM; + goto out; + } + fallthrough; default: - break; + if ((i = flush_buffer(STp, 0)) < 0) { + retval = i; + goto out; + } else { /* flush_buffer succeeds */ + if (STp->can_partitions) { + i = switch_partition(STp); + if (i < 0) { + retval = i; + goto out; + } + } + } } + mutex_unlock(&STp->lock); retval = scsi_ioctl(STp->device, file->f_mode & FMODE_WRITE, cmd_in, (void __user *)arg); From 99c5d19c1eb2ee38bc297ac77bc1b2d919582594 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:27 -0500 Subject: [PATCH 36/39] i40e: improve VF MAC filters accounting jira KERNEL-572 Rebuild_History Non-Buildable 
kernel-6.12.0-124.31.1.el10_1 commit-author Lukasz Czapnik commit b99dd77076bd3fddac6f7f1cbfa081c38fde17f5 When adding new VM MAC, driver checks only *active* filters in vsi->mac_filter_hash. Each MAC, even in non-active state is using resources. To determine number of MACs VM uses, count VSI filters in *any* state. Add i40e_count_all_filters() to simply count all filters, and rename i40e_count_filters() to i40e_count_active_filters() to avoid ambiguity. Fixes: cfb1d572c986 ("i40e: Add ensurance of MacVlan resources for every trusted VF") Cc: stable@vger.kernel.org Signed-off-by: Lukasz Czapnik Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen (cherry picked from commit b99dd77076bd3fddac6f7f1cbfa081c38fde17f5) Signed-off-by: Jonathan Maple --- drivers/net/ethernet/intel/i40e/i40e.h | 3 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 26 ++++++-- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 65 ++++++++----------- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index c67963bfe14ed..11be9d4890b8c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1277,7 +1277,8 @@ struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); int i40e_del_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi); -int i40e_count_filters(struct i40e_vsi *vsi); +int i40e_count_all_filters(struct i40e_vsi *vsi); +int i40e_count_active_filters(struct i40e_vsi *vsi); struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, const u8 *macaddr); void i40e_vlan_stripping_enable(struct i40e_vsi *vsi); static inline bool i40e_is_sw_dcb(struct i40e_pf *pf) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 120d68654e3f7..6717e81c3c039 
100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1241,12 +1241,30 @@ void i40e_update_stats(struct i40e_vsi *vsi) } /** - * i40e_count_filters - counts VSI mac filters + * i40e_count_all_filters - counts VSI MAC filters * @vsi: the VSI to be searched * - * Returns count of mac filters - **/ -int i40e_count_filters(struct i40e_vsi *vsi) + * Return: count of MAC filters in any state. + */ +int i40e_count_all_filters(struct i40e_vsi *vsi) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + int bkt, cnt = 0; + + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) + cnt++; + + return cnt; +} + +/** + * i40e_count_active_filters - counts VSI MAC filters + * @vsi: the VSI to be searched + * + * Return: count of active MAC filters. + */ +int i40e_count_active_filters(struct i40e_vsi *vsi) { struct i40e_mac_filter *f; struct hlist_node *h; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 85f93eb0aea2b..7b8027cab8ba7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2865,24 +2865,6 @@ static int i40e_vc_get_stats_msg(struct i40e_vf *vf, u8 *msg) (u8 *)&stats, sizeof(stats)); } -/** - * i40e_can_vf_change_mac - * @vf: pointer to the VF info - * - * Return true if the VF is allowed to change its MAC filters, false otherwise - */ -static bool i40e_can_vf_change_mac(struct i40e_vf *vf) -{ - /* If the VF MAC address has been set administratively (via the - * ndo_set_vf_mac command), then deny permission to the VF to - * add/delete unicast MAC addresses, unless the VF is trusted - */ - if (vf->pf_set_mac && !vf->trusted) - return false; - - return true; -} - #define I40E_MAX_MACVLAN_PER_HW 3072 #define I40E_MAX_MACVLAN_PER_PF(num_ports) (I40E_MAX_MACVLAN_PER_HW / \ (num_ports)) @@ -2921,8 +2903,10 @@ static inline int i40e_check_vf_permission(struct i40e_vf 
*vf, struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; struct i40e_hw *hw = &pf->hw; - int mac2add_cnt = 0; - int i; + int i, mac_add_max, mac_add_cnt = 0; + bool vf_trusted; + + vf_trusted = test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); for (i = 0; i < al->num_elements; i++) { struct i40e_mac_filter *f; @@ -2942,9 +2926,8 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, * The VF may request to set the MAC address filter already * assigned to it so do not return an error in that case. */ - if (!i40e_can_vf_change_mac(vf) && - !is_multicast_ether_addr(addr) && - !ether_addr_equal(addr, vf->default_lan_addr.addr)) { + if (!vf_trusted && !is_multicast_ether_addr(addr) && + vf->pf_set_mac && !ether_addr_equal(addr, vf->default_lan_addr.addr)) { dev_err(&pf->pdev->dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); return -EPERM; @@ -2953,29 +2936,33 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, /*count filters that really will be added*/ f = i40e_find_mac(vsi, addr); if (!f) - ++mac2add_cnt; + ++mac_add_cnt; } /* If this VF is not privileged, then we can't add more than a limited - * number of addresses. Check to make sure that the additions do not - * push us over the limit. - */ - if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { - if ((i40e_count_filters(vsi) + mac2add_cnt) > - I40E_VC_MAX_MAC_ADDR_PER_VF) { - dev_err(&pf->pdev->dev, - "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); - return -EPERM; - } - /* If this VF is trusted, it can use more resources than untrusted. + * number of addresses. + * + * If this VF is trusted, it can use more resources than untrusted. * However to ensure that every trusted VF has appropriate number of * resources, divide whole pool of resources per port and then across * all VFs. 
*/ - } else { - if ((i40e_count_filters(vsi) + mac2add_cnt) > - I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, - hw->num_ports)) { + if (!vf_trusted) + mac_add_max = I40E_VC_MAX_MAC_ADDR_PER_VF; + else + mac_add_max = I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, hw->num_ports); + + /* VF can replace all its filters in one step, in this case mac_add_max + * will be added as active and another mac_add_max will be in + * a to-be-removed state. Account for that. + */ + if ((i40e_count_active_filters(vsi) + mac_add_cnt) > mac_add_max || + (i40e_count_all_filters(vsi) + mac_add_cnt) > 2 * mac_add_max) { + if (!vf_trusted) { + dev_err(&pf->pdev->dev, + "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); + return -EPERM; + } else { dev_err(&pf->pdev->dev, "Cannot add more MAC addresses, trusted VF exhausted it's resources\n"); return -EPERM; From 7a5308c6b709989d482ba84474dfb60f6849ead5 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:28 -0500 Subject: [PATCH 37/39] devlink: Add new "max_mac_per_vf" generic device param jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Mohammad Heib commit 9352d40c8bcd2ef29366d2c38b163c0b115039ed Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9352d40c.failed Add a new device generic parameter to controls the maximum number of MAC filters allowed per VF. 
For example, to limit a VF to 3 MAC addresses: $ devlink dev param set pci/0000:3b:00.0 name max_mac_per_vf \ value 3 \ cmode runtime Signed-off-by: Mohammad Heib Reviewed-by: Simon Horman Signed-off-by: Jacob Keller Signed-off-by: Tony Nguyen (cherry picked from commit 9352d40c8bcd2ef29366d2c38b163c0b115039ed) Signed-off-by: Jonathan Maple # Conflicts: # Documentation/networking/devlink/devlink-params.rst # include/net/devlink.h # net/devlink/param.c --- .../9352d40c.failed | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9352d40c.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9352d40c.failed b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9352d40c.failed new file mode 100644 index 0000000000000..7483ac932e02a --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9352d40c.failed @@ -0,0 +1,120 @@ +devlink: Add new "max_mac_per_vf" generic device param + +jira KERNEL-572 +Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 +commit-author Mohammad Heib +commit 9352d40c8bcd2ef29366d2c38b163c0b115039ed +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/9352d40c.failed + +Add a new device generic parameter to controls the maximum +number of MAC filters allowed per VF. 
+ +For example, to limit a VF to 3 MAC addresses: + $ devlink dev param set pci/0000:3b:00.0 name max_mac_per_vf \ + value 3 \ + cmode runtime + + Signed-off-by: Mohammad Heib + Reviewed-by: Simon Horman + Signed-off-by: Jacob Keller + Signed-off-by: Tony Nguyen +(cherry picked from commit 9352d40c8bcd2ef29366d2c38b163c0b115039ed) + Signed-off-by: Jonathan Maple + +# Conflicts: +# Documentation/networking/devlink/devlink-params.rst +# include/net/devlink.h +# net/devlink/param.c +diff --cc Documentation/networking/devlink/devlink-params.rst +index 211b58177e12,c0597d456641..000000000000 +--- a/Documentation/networking/devlink/devlink-params.rst ++++ b/Documentation/networking/devlink/devlink-params.rst +@@@ -143,3 -143,15 +143,18 @@@ own name + * - ``clock_id`` + - u64 + - Clock ID used by the device for registering DPLL devices and pins. +++<<<<<<< HEAD +++======= ++ * - ``total_vfs`` ++ - u32 ++ - The max number of Virtual Functions (VFs) exposed by the PF. ++ after reboot/pci reset, 'sriov_totalvfs' entry under the device's sysfs ++ directory will report this value. ++ * - ``num_doorbells`` ++ - u32 ++ - Controls the number of doorbells used by the device. ++ * - ``max_mac_per_vf`` ++ - u32 ++ - Controls the maximum number of MAC address filters that can be assigned ++ to a Virtual Function (VF). 
+++>>>>>>> 9352d40c8bcd (devlink: Add new "max_mac_per_vf" generic device param) +diff --cc include/net/devlink.h +index c0824ca087f4,d01046ef0577..000000000000 +--- a/include/net/devlink.h ++++ b/include/net/devlink.h +@@@ -556,6 -530,9 +556,12 @@@ enum devlink_param_generic_id + DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, +++<<<<<<< HEAD +++======= ++ DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, ++ DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, ++ DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, +++>>>>>>> 9352d40c8bcd (devlink: Add new "max_mac_per_vf" generic device param) + + /* add new param generic ids above here*/ + __DEVLINK_PARAM_GENERIC_ID_MAX, +@@@ -620,6 -597,15 +626,18 @@@ + #define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" + #define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 + +++<<<<<<< HEAD +++======= ++ #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" ++ #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 ++ ++ #define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" ++ #define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 ++ ++ #define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf" ++ #define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE DEVLINK_PARAM_TYPE_U32 ++ +++>>>>>>> 9352d40c8bcd (devlink: Add new "max_mac_per_vf" generic device param) + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ + { \ + .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ +diff --cc net/devlink/param.c +index 41dcc86cfd94,6b233b13b69a..000000000000 +--- a/net/devlink/param.c ++++ b/net/devlink/param.c +@@@ -102,6 -102,21 +102,24 @@@ static const struct devlink_param devli + .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, + .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, + }, +++<<<<<<< HEAD +++======= ++ { ++ .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, ++ .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, ++ .type = 
DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, ++ }, ++ { ++ .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, ++ .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, ++ .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, ++ }, ++ { ++ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, ++ .name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME, ++ .type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE, ++ }, +++>>>>>>> 9352d40c8bcd (devlink: Add new "max_mac_per_vf" generic device param) + }; + + static int devlink_param_generic_verify(const struct devlink_param *param) +* Unmerged path Documentation/networking/devlink/devlink-params.rst +* Unmerged path include/net/devlink.h +* Unmerged path net/devlink/param.c From 9ccc5db4ffcc92e13bb59e1c810746cc567fc78b Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:28 -0500 Subject: [PATCH 38/39] i40e: support generic devlink param "max_mac_per_vf" jira KERNEL-572 Rebuild_History Non-Buildable kernel-6.12.0-124.31.1.el10_1 commit-author Mohammad Heib commit 2c031d4c772f3a9191d04d57a3403ad6a56375c7 Currently the i40e driver enforces its own internally calculated per-VF MAC filter limit, derived from the number of allocated VFs and available hardware resources. This limit is not configurable by the administrator, which makes it difficult to control how many MAC addresses each VF may use. This patch adds support for the new generic devlink runtime parameter "max_mac_per_vf" which provides administrators with a way to cap the number of MAC addresses a VF can use: - When the parameter is set to 0 (default), the driver continues to use its internally calculated limit. - When set to a non-zero value, the driver applies this value as a strict cap for VFs, overriding the internal calculation. Important notes: - The configured value is a theoretical maximum. Hardware limits may still prevent additional MAC addresses from being added, even if the parameter allows it. 
- Since MAC filters are a shared hardware resource across all VFs, setting a high value may cause resource contention and starve other VFs. - This change gives administrators predictable and flexible control over VF resource allocation, while still respecting hardware limitations. - Previous discussion about this change: https://lore.kernel.org/netdev/20250805134042.2604897-2-dhill@redhat.com https://lore.kernel.org/netdev/20250823094952.182181-1-mheib@redhat.com Signed-off-by: Mohammad Heib Reviewed-by: Jacob Keller Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Jacob Keller Signed-off-by: Tony Nguyen (cherry picked from commit 2c031d4c772f3a9191d04d57a3403ad6a56375c7) Signed-off-by: Jonathan Maple --- Documentation/networking/devlink/i40e.rst | 34 ++++++++++++ drivers/net/ethernet/intel/i40e/i40e.h | 4 ++ .../net/ethernet/intel/i40e/i40e_devlink.c | 54 ++++++++++++++++++- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 31 ++++++++--- 4 files changed, 113 insertions(+), 10 deletions(-) diff --git a/Documentation/networking/devlink/i40e.rst b/Documentation/networking/devlink/i40e.rst index d3cb5bb5197e9..51c887f0dc833 100644 --- a/Documentation/networking/devlink/i40e.rst +++ b/Documentation/networking/devlink/i40e.rst @@ -7,6 +7,40 @@ i40e devlink support This document describes the devlink features implemented by the ``i40e`` device driver. +Parameters +========== + +.. list-table:: Generic parameters implemented + :widths: 5 5 90 + + * - Name + - Mode + - Notes + * - ``max_mac_per_vf`` + - runtime + - Controls the maximum number of MAC addresses a VF can use + on i40e devices. + + By default (``0``), the driver enforces its internally calculated per-VF + MAC filter limit, which is based on the number of allocated VFS. + + If set to a non-zero value, this parameter acts as a strict cap: + the driver will use the user-provided value instead of its internal + calculation. 
+ + **Important notes:** + + - This value **must be set before enabling SR-IOV**. + Attempting to change it while SR-IOV is enabled will return an error. + - MAC filters are a **shared hardware resource** across all VFs. + Setting a high value may cause other VFs to be starved of filters. + - This value is an **administrative policy**. The hardware may return + errors when its absolute limit is reached, regardless of the value + set here. + + The default value is ``0`` (internal calculation is used). + + Info versions ============= diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 11be9d4890b8c..83b94f369e3b0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -573,6 +573,10 @@ struct i40e_pf { struct i40e_vf *vf; int num_alloc_vfs; /* actual number of VFs allocated */ u32 vf_aq_requests; + /* If set to non-zero, the device uses this value + * as maximum number of MAC filters per VF. + */ + u32 max_mac_per_vf; u32 arq_overflows; /* Not fatal, possibly indicative of problems */ struct ratelimit_state mdd_message_rate_limit; /* DCBx/DCBNL capability for PF that indicates diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.c b/drivers/net/ethernet/intel/i40e/i40e_devlink.c index cc4e9e2addb75..bc205e3077c7f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_devlink.c +++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.c @@ -5,6 +5,41 @@ #include "i40e.h" #include "i40e_devlink.h" +static int i40e_max_mac_per_vf_set(struct devlink *devlink, + u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) +{ + struct i40e_pf *pf = devlink_priv(devlink); + + if (pf->num_alloc_vfs > 0) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot change max_mac_per_vf while SR-IOV is enabled"); + return -EBUSY; + } + + pf->max_mac_per_vf = ctx->val.vu32; + return 0; +} + +static int i40e_max_mac_per_vf_get(struct devlink *devlink, + u32 id, + struct devlink_param_gset_ctx *ctx) 
+{ + struct i40e_pf *pf = devlink_priv(devlink); + + ctx->val.vu32 = pf->max_mac_per_vf; + return 0; +} + +static const struct devlink_param i40e_dl_params[] = { + DEVLINK_PARAM_GENERIC(MAX_MAC_PER_VF, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + i40e_max_mac_per_vf_get, + i40e_max_mac_per_vf_set, + NULL), +}; + static void i40e_info_get_dsn(struct i40e_pf *pf, char *buf, size_t len) { u8 dsn[8]; @@ -165,7 +200,18 @@ void i40e_free_pf(struct i40e_pf *pf) **/ void i40e_devlink_register(struct i40e_pf *pf) { - devlink_register(priv_to_devlink(pf)); + struct devlink *dl = priv_to_devlink(pf); + struct device *dev = &pf->pdev->dev; + int err; + + err = devlink_params_register(dl, i40e_dl_params, + ARRAY_SIZE(i40e_dl_params)); + if (err) + dev_err(dev, + "devlink params register failed with error %d", err); + + devlink_register(dl); + } /** @@ -176,7 +222,11 @@ void i40e_devlink_register(struct i40e_pf *pf) **/ void i40e_devlink_unregister(struct i40e_pf *pf) { - devlink_unregister(priv_to_devlink(pf)); + struct devlink *dl = priv_to_devlink(pf); + + devlink_unregister(dl); + devlink_params_unregister(dl, i40e_dl_params, + ARRAY_SIZE(i40e_dl_params)); } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 7b8027cab8ba7..950549649995b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2938,33 +2938,48 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, if (!f) ++mac_add_cnt; } - - /* If this VF is not privileged, then we can't add more than a limited - * number of addresses. + /* Determine the maximum number of MAC addresses this VF may use. + * + * - For untrusted VFs: use a fixed small limit. + * + * - For trusted VFs: limit is calculated by dividing total MAC + * filter pool across all VFs/ports. * - * If this VF is trusted, it can use more resources than untrusted. 
- * However to ensure that every trusted VF has appropriate number of - * resources, divide whole pool of resources per port and then across - * all VFs. + * - User can override this by devlink param "max_mac_per_vf". + * If set its value is used as a strict cap for both trusted and + * untrusted VFs. + * Note: + * even when overridden, this is a theoretical maximum; hardware + * may reject additional MACs if the absolute HW limit is reached. */ if (!vf_trusted) mac_add_max = I40E_VC_MAX_MAC_ADDR_PER_VF; else mac_add_max = I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, hw->num_ports); + if (pf->max_mac_per_vf > 0) + mac_add_max = pf->max_mac_per_vf; + /* VF can replace all its filters in one step, in this case mac_add_max * will be added as active and another mac_add_max will be in * a to-be-removed state. Account for that. */ if ((i40e_count_active_filters(vsi) + mac_add_cnt) > mac_add_max || (i40e_count_all_filters(vsi) + mac_add_cnt) > 2 * mac_add_max) { + if (pf->max_mac_per_vf == mac_add_max && mac_add_max > 0) { + dev_err(&pf->pdev->dev, + "Cannot add more MAC addresses: VF reached its maximum allowed limit (%d)\n", + mac_add_max); + return -EPERM; + } if (!vf_trusted) { dev_err(&pf->pdev->dev, "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); return -EPERM; } else { dev_err(&pf->pdev->dev, - "Cannot add more MAC addresses, trusted VF exhausted it's resources\n"); + "Cannot add more MAC addresses: trusted VF reached its maximum allowed limit (%d)\n", + mac_add_max); return -EPERM; } } From 248c2b40639d6a6b1ab9e4c36bf413a35a07737c Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 6 Feb 2026 03:01:40 -0500 Subject: [PATCH 39/39] Rebuild rocky10_1 with kernel-6.12.0-124.31.1.el10_1 Rebuild_History BUILDABLE Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% Number of commits in upstream range v6.12~1..kernel-mainline: 93416 Number of commits in rpm: 43 Number of commits matched with 
upstream: 39 (90.70%) Number of commits in upstream but not in rpm: 93377 Number of commits NOT found in upstream: 4 (9.30%) Rebuilding Kernel on Branch rocky10_1_rebuild_kernel-6.12.0-124.31.1.el10_1 for kernel-6.12.0-124.31.1.el10_1 Clean Cherry Picks: 26 (66.67%) Empty Cherry Picks: 12 (30.77%) _______________________________ Full Details Located here: ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/rebuild.details.txt Includes: * git commit header above * Empty Commits with upstream SHA * RPM ChangeLog Entries that could not be matched Individual Empty Commit failures contained in the same containing directory. The git message for empty commits will have the path for the failed commit. File names are the first 8 characters of the upstream SHA --- ...1.el10_1 => COPYING-6.12.0-124.31.1.el10_1 | 0 .../networking/devlink/devlink-params.rst | 4 + Makefile.rhelver | 2 +- arch/arm64/kvm/sys_regs.c | 20 +++- arch/s390/Kconfig | 1 - arch/s390/mm/hugetlbpage.c | 11 ++ arch/x86/kernel/alternative.c | 3 + .../rebuild.details.txt | 32 +++++ .../kernel-6.12.0-aarch64-64k-debug.config | 4 +- configs/kernel-6.12.0-aarch64-64k.config | 4 +- configs/kernel-6.12.0-aarch64-debug.config | 4 +- .../kernel-6.12.0-aarch64-rt-64k-debug.config | 4 +- configs/kernel-6.12.0-aarch64-rt-64k.config | 4 +- configs/kernel-6.12.0-aarch64-rt-debug.config | 4 +- configs/kernel-6.12.0-aarch64-rt.config | 4 +- configs/kernel-6.12.0-aarch64.config | 4 +- configs/kernel-6.12.0-ppc64le-debug.config | 4 +- configs/kernel-6.12.0-ppc64le.config | 4 +- configs/kernel-6.12.0-riscv64-debug.config | 4 +- configs/kernel-6.12.0-riscv64.config | 4 +- configs/kernel-6.12.0-s390x-debug.config | 7 +- configs/kernel-6.12.0-s390x-zfcpdump.config | 5 +- configs/kernel-6.12.0-s390x.config | 7 +- configs/kernel-6.12.0-x86_64-debug.config | 4 +- configs/kernel-6.12.0-x86_64-rt-debug.config | 4 +- configs/kernel-6.12.0-x86_64-rt.config | 4 +- configs/kernel-6.12.0-x86_64.config | 4 +- fs/xfs/xfs_file.c | 75 ++++++------ 
fs/xfs/xfs_inode.c | 25 ++-- fs/xfs/xfs_inode_item.c | 110 +++++++++++++----- fs/xfs/xfs_inode_item.h | 10 +- fs/xfs/xfs_iomap.c | 15 ++- include/linux/io_uring_types.h | 2 + include/net/devlink.h | 4 + include/net/dst.h | 40 ++++++- include/net/ip.h | 6 +- include/net/ip6_route.h | 2 +- include/net/route.h | 2 +- io_uring/msg_ring.c | 4 +- kernel/events/uprobes.c | 2 +- net/core/dst.c | 4 +- net/core/sock.c | 16 ++- net/devlink/param.c | 5 + net/ipv4/ip_output.c | 15 ++- net/ipv4/route.c | 4 +- net/ipv6/ip6_output.c | 65 ++++++----- .../kabi-module/kabi_x86_64/__kabi__alt_instr | 2 + redhat/kernel.changelog-10.1 | 46 ++++++++ uki-addons.sbat | 4 +- uki.sbat | 4 +- 50 files changed, 431 insertions(+), 187 deletions(-) rename COPYING-6.12.0-124.29.1.el10_1 => COPYING-6.12.0-124.31.1.el10_1 (100%) create mode 100644 ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/rebuild.details.txt create mode 100644 redhat/kabi/kabi-module/kabi_x86_64/__kabi__alt_instr diff --git a/COPYING-6.12.0-124.29.1.el10_1 b/COPYING-6.12.0-124.31.1.el10_1 similarity index 100% rename from COPYING-6.12.0-124.29.1.el10_1 rename to COPYING-6.12.0-124.31.1.el10_1 diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index 211b58177e121..74a35f3b7c9af 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -143,3 +143,7 @@ own name. * - ``clock_id`` - u64 - Clock ID used by the device for registering DPLL devices and pins. + * - ``max_mac_per_vf`` + - u32 + - Controls the maximum number of MAC address filters that can be assigned + to a Virtual Function (VF). diff --git a/Makefile.rhelver b/Makefile.rhelver index add841228a689..e4427791af80a 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 1 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. 
-RHEL_RELEASE = 124.29.1 +RHEL_RELEASE = 124.31.1 # # RHEL_REBASE_NUM diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index dc91904d3e476..64db64079492a 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1639,6 +1639,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + val &= ~ID_AA64MMFR2_EL1_NV; break; case SYS_ID_AA64MMFR3_EL1: val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | @@ -2005,6 +2006,22 @@ static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK; + + /* + * We made the mistake to expose the now deprecated NV field, + * so allow userspace to write it, but silently ignore it. + */ + if ((hw_val & nv_mask) == (user_val & nv_mask)) + user_val &= ~nv_mask; + + return set_id_reg(vcpu, rd, user_val); +} + static int set_ctr_el0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { @@ -2890,7 +2907,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits)), - ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 | + ID_FILTERED(ID_AA64MMFR2_EL1, + id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 | ID_AA64MMFR2_EL1_EVT | ID_AA64MMFR2_EL1_FWB | ID_AA64MMFR2_EL1_IDS | diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 990c93235da3e..9a055a37fbf67 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -142,7 +142,6 @@ config S390 select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_KERNEL_PMD_MKWRITE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DCACHE_WORD_ACCESS if !KMSAN diff --git 
a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d9ce199953de9..b6a7ac8097ca5 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -15,6 +15,17 @@ #include #include #include +#include + +/* + * RHEL-only: Since the 'hugetlb_optimize_vmemmap_key' static key is part + * of the kABI, we need stub definitions to avoid breaking the build + * when CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=n. + */ +#ifndef CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP +DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +#endif /* * If the bit selected by single-bit bitmask "a" is set within "x", move diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 839205375da07..a5b7027479783 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2741,3 +2741,6 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void * text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } + +struct alt_instr __kabi__alt_instr[0]; +EXPORT_SYMBOL_GPL(__kabi__alt_instr); diff --git a/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/rebuild.details.txt b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/rebuild.details.txt new file mode 100644 index 0000000000000..4b35a8d6ed4f8 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-124.31.1.el10_1/rebuild.details.txt @@ -0,0 +1,32 @@ +Rebuild_History BUILDABLE +Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% +Number of commits in upstream range v6.12~1..kernel-mainline: 93416 +Number of commits in rpm: 43 +Number of commits matched with upstream: 39 (90.70%) +Number of commits in upstream but not in rpm: 93377 +Number of commits NOT found in upstream: 4 (9.30%) + +Rebuilding Kernel on Branch rocky10_1_rebuild_kernel-6.12.0-124.31.1.el10_1 for kernel-6.12.0-124.31.1.el10_1 +Clean Cherry Picks: 26 (66.67%) +Empty Cherry Picks: 12 (30.77%) +_______________________________ + 
+__EMPTY COMMITS__________________________ +88fe14253e181878c2ddb51a298ae8c468a63010 net: dst: add four helpers to annotate data-races around dst->dev +1dbf1d590d10a6d1978e8184f8dfe20af22d680a net: Add locking to protect skb->dev access in ip_output +caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50 net: dst: introduce dst->dev_rcu +11709573cc4e48dc34c80fc7ab9ce5b159e29695 ipv6: use RCU in ip6_output() +9085e56501d93af9f2d7bd16f7fcfacdde47b99c ipv6: use RCU in ip6_xmit() +99a2ace61b211b0be861b07fbaa062fca4b58879 net: use dst_dev_rcu() in sk_setup_caps() +fc582cd26e888b0652bc1494f252329453fd3b23 io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU +b583ef82b671c9a752fbe3e95bd4c1c51eab764d uprobes: Fix race in uprobe_free_utask +64e2f60f355e556337fcffe80b9bcff1b22c9c42 s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP +bc7d684fea18cc48c3630d2b7f1789000ff2df5b xfs: rearrange code in xfs_inode_item_precommit +c91d38b57f2c4784d885c874b2a1234a01361afd xfs: rework datasync tracking and execution +9352d40c8bcd2ef29366d2c38b163c0b115039ed devlink: Add new "max_mac_per_vf" generic device param + +__CHANGES NOT IN UPSTREAM________________ +Add partial riscv64 support for build root' +Provide basic VisionFive 2 support' +Patch MMU for riscv64' +s390: mm: add stub for hugetlb_optimize_vmemmap_key diff --git a/configs/kernel-6.12.0-aarch64-64k-debug.config b/configs/kernel-6.12.0-aarch64-64k-debug.config index 89c63ec931d57..976f7ddcc1fa1 100644 --- a/configs/kernel-6.12.0-aarch64-64k-debug.config +++ b/configs/kernel-6.12.0-aarch64-64k-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-64k.config b/configs/kernel-6.12.0-aarch64-64k.config index 2c31913a36ffd..fc1423961a733 
100644 --- a/configs/kernel-6.12.0-aarch64-64k.config +++ b/configs/kernel-6.12.0-aarch64-64k.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-debug.config b/configs/kernel-6.12.0-aarch64-debug.config index 897ec5ee1ca2a..eba77102fc2a5 100644 --- a/configs/kernel-6.12.0-aarch64-debug.config +++ b/configs/kernel-6.12.0-aarch64-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt-64k-debug.config b/configs/kernel-6.12.0-aarch64-rt-64k-debug.config index f9a446a620d3b..36a9ea924e736 100644 --- a/configs/kernel-6.12.0-aarch64-rt-64k-debug.config +++ b/configs/kernel-6.12.0-aarch64-rt-64k-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt-64k.config b/configs/kernel-6.12.0-aarch64-rt-64k.config index da134a98df4b9..a94f969fd853d 100644 --- a/configs/kernel-6.12.0-aarch64-rt-64k.config +++ b/configs/kernel-6.12.0-aarch64-rt-64k.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 
CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt-debug.config b/configs/kernel-6.12.0-aarch64-rt-debug.config index a54e5e6e98e19..a8003d73aed0e 100644 --- a/configs/kernel-6.12.0-aarch64-rt-debug.config +++ b/configs/kernel-6.12.0-aarch64-rt-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt.config b/configs/kernel-6.12.0-aarch64-rt.config index 1cd5fe06c54cf..dc5ad05892136 100644 --- a/configs/kernel-6.12.0-aarch64-rt.config +++ b/configs/kernel-6.12.0-aarch64-rt.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64.config b/configs/kernel-6.12.0-aarch64.config index d128a41b44560..2d579b0dbaeef 100644 --- a/configs/kernel-6.12.0-aarch64.config +++ b/configs/kernel-6.12.0-aarch64.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-ppc64le-debug.config b/configs/kernel-6.12.0-ppc64le-debug.config index ccfa3eca8218d..ce940a7303a4f 100644 --- a/configs/kernel-6.12.0-ppc64le-debug.config +++ b/configs/kernel-6.12.0-ppc64le-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y 
CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-ppc64le.config b/configs/kernel-6.12.0-ppc64le.config index bd9863680037a..ec814991e2f15 100644 --- a/configs/kernel-6.12.0-ppc64le.config +++ b/configs/kernel-6.12.0-ppc64le.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-riscv64-debug.config b/configs/kernel-6.12.0-riscv64-debug.config index 7e32eee0b294c..f674761392764 100644 --- a/configs/kernel-6.12.0-riscv64-debug.config +++ b/configs/kernel-6.12.0-riscv64-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-riscv64.config b/configs/kernel-6.12.0-riscv64.config index 48594fde160c2..71b88b3cf4fd4 100644 --- a/configs/kernel-6.12.0-riscv64.config +++ b/configs/kernel-6.12.0-riscv64.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-s390x-debug.config b/configs/kernel-6.12.0-s390x-debug.config index d68176a104b7a..d3d4295cf6585 100644 --- 
a/configs/kernel-6.12.0-s390x-debug.config +++ b/configs/kernel-6.12.0-s390x-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y @@ -717,7 +717,6 @@ CONFIG_SPARSEMEM=y CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=y CONFIG_HAVE_MEMBLOCK_PHYS_MAP=y CONFIG_HAVE_GUP_FAST=y CONFIG_NUMA_KEEP_MEMINFO=y @@ -3277,9 +3276,7 @@ CONFIG_TMPFS_INODE64=y CONFIG_TMPFS_QUOTA=y CONFIG_ARCH_SUPPORTS_HUGETLBFS=y CONFIG_HUGETLBFS=y -# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y # end of Pseudo filesystems diff --git a/configs/kernel-6.12.0-s390x-zfcpdump.config b/configs/kernel-6.12.0-s390x-zfcpdump.config index dedb67d608ad3..effc3271fcd5d 100644 --- a/configs/kernel-6.12.0-s390x-zfcpdump.config +++ b/configs/kernel-6.12.0-s390x-zfcpdump.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y @@ -600,7 +600,6 @@ CONFIG_SPARSEMEM=y CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=y CONFIG_HAVE_MEMBLOCK_PHYS_MAP=y CONFIG_HAVE_GUP_FAST=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y diff --git a/configs/kernel-6.12.0-s390x.config b/configs/kernel-6.12.0-s390x.config index 8d435b5d4830a..d37af0a3b5589 100644 --- a/configs/kernel-6.12.0-s390x.config +++ b/configs/kernel-6.12.0-s390x.config @@ -12,8 
+12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y @@ -740,7 +740,6 @@ CONFIG_SPARSEMEM=y CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=y CONFIG_HAVE_MEMBLOCK_PHYS_MAP=y CONFIG_HAVE_GUP_FAST=y CONFIG_NUMA_KEEP_MEMINFO=y @@ -3302,9 +3301,7 @@ CONFIG_TMPFS_INODE64=y CONFIG_TMPFS_QUOTA=y CONFIG_ARCH_SUPPORTS_HUGETLBFS=y CONFIG_HUGETLBFS=y -# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y # end of Pseudo filesystems diff --git a/configs/kernel-6.12.0-x86_64-debug.config b/configs/kernel-6.12.0-x86_64-debug.config index 94d09a557b775..fa3044c4ab3ba 100644 --- a/configs/kernel-6.12.0-x86_64-debug.config +++ b/configs/kernel-6.12.0-x86_64-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64-rt-debug.config b/configs/kernel-6.12.0-x86_64-rt-debug.config index ebf9dd7915a3b..4df603849d15e 100644 --- a/configs/kernel-6.12.0-x86_64-rt-debug.config +++ b/configs/kernel-6.12.0-x86_64-rt-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git 
a/configs/kernel-6.12.0-x86_64-rt.config b/configs/kernel-6.12.0-x86_64-rt.config index ce4a536a71663..b01bf080db1b3 100644 --- a/configs/kernel-6.12.0-x86_64-rt.config +++ b/configs/kernel-6.12.0-x86_64-rt.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64.config b/configs/kernel-6.12.0-x86_64.config index 00afa86d9d1ea..c3fa4c155bd38 100644 --- a/configs/kernel-6.12.0-x86_64.config +++ b/configs/kernel-6.12.0-x86_64.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=107600 -CONFIG_RUSTC_LLVM_VERSION=170006 +CONFIG_RUSTC_VERSION=0 +CONFIG_RUSTC_LLVM_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b19916b11fd56..220dc674aa02a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -73,52 +73,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number than holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. 
* - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field. This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. 
*/ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -156,12 +151,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has a inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking. 
*/ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 19dcb569a3e7f..b84684577b0f7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1642,7 +1642,6 @@ xfs_ifree_mark_inode_stale( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1808,12 +1807,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2472,7 +2479,6 @@ xfs_iflush( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2631,12 +2637,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b509cbd191f4e..aa09d5a6458db 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -131,31 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check 
that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. 
- */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); if (!iip->ili_item.li_buf) { struct xfs_buf *bp; @@ -190,6 +187,20 @@ xfs_inode_item_precommit( xfs_trans_brelse(tp, bp); } + /* + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. + */ + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines @@ -200,12 +211,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; return 0; } @@ -707,13 +712,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. 
+ */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -833,12 +849,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then + * check if the changes will impact a datasync (O_DSYNC) journal flush. If the + * changes will require a datasync flush, then we also record the sequence in + * ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode being + * unpinned (i.e. pin count goes to zero), and so it will only be set when the + * inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + * + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. 
+ */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1042,7 +1091,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 377e060078044..7a6d179aad4c2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock. + */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 86da16f54be9d..b9ffe609376ad 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -133,9 +133,18 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. 
+ */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; iomap->folio_ops = &xfs_iomap_folio_ops; diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c252e98aee7c2..8ab57274a21ec 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -663,6 +663,8 @@ struct io_kiocb { u64 extra1; u64 extra2; } big_cqe; + /* for private io_kiocb freeing */ + RH_KABI_EXTEND(struct rcu_head rcu_head) }; struct io_overflow_cqe { diff --git a/include/net/devlink.h b/include/net/devlink.h index c0824ca087f45..a42553e233b7d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -556,6 +556,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -620,6 +621,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" #define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf" +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/include/net/dst.h b/include/net/dst.h index 08647c99d79c9..1767f39a1f006 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -24,7 +24,10 @@ struct sk_buff; struct dst_entry { - struct net_device *dev; + RH_KABI_REPLACE(struct net_device *dev, union { + struct net_device *dev; + struct net_device __rcu *dev_rcu; + }) struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -561,6 +564,41 @@ static inline void 
skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +static inline struct net_device *dst_dev(const struct dst_entry *dst) +{ + return READ_ONCE(dst->dev); +} + +static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) +{ + return rcu_dereference(dst->dev_rcu); +} + +static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) +{ + return dev_net_rcu(dst_dev_rcu(dst)); +} + +static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) +{ + return dst_dev(skb_dst(skb)); +} + +static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) +{ + return dst_dev_rcu(skb_dst(skb)); +} + +static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) +{ + return dev_net(skb_dst_dev(skb)); +} + +static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) +{ + return dev_net_rcu(skb_dst_dev_rcu(skb)); +} + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); diff --git a/include/net/ip.h b/include/net/ip.h index 30596104cb60d..6eccd054eeebf 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -466,12 +466,14 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); + const struct net_device *dev; unsigned int mtu, res; struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + dev = dst_dev_rcu(dst); + net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -485,7 +487,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) diff --git a/include/net/ip6_route.h 
b/include/net/ip6_route.h index 6dbdf60b342f6..ede44cde7fe58 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst->dev); + idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/include/net/route.h b/include/net/route.h index 8d2de5eea1268..407aad7e9e52e 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -373,7 +373,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7fd9badcfaf81..d3ef5992eac91 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) spin_unlock(&ctx->msg_lock); } if (req) - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -91,7 +91,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, { req->task = READ_ONCE(ctx->submitter_task); if (!req->task) { - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); return -EOWNERDEAD; } req->cqe.user_data = user_data; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b728a0108ec37..736a1a9e26558 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1772,6 +1772,7 @@ void uprobe_free_utask(struct task_struct *t) if (!utask) return; + t->utask = NULL; if (utask->active_uprobe) put_uprobe(utask->active_uprobe); @@ -1781,7 +1782,6 @@ void uprobe_free_utask(struct task_struct *t) xol_free_insn_slot(t); kfree(utask); - t->utask = NULL; } /* diff --git a/net/core/dst.c b/net/core/dst.c index 
795ca07e28a4e..bb07c54203569 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; - dst->dev = blackhole_netdev; + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } @@ -263,7 +263,7 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst->dev->mtu; + return mtu ? : dst_dev(dst)->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); diff --git a/net/core/sock.c b/net/core/sock.c index 4379447ccd02e..234ac85edf780 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2522,7 +2522,7 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); -static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; @@ -2532,8 +2532,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : - READ_ONCE(dst->dev->gso_ipv4_max_size); + max_size = is_ipv6 ? 
READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2542,9 +2542,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk->sk_route_caps = dst->dev->features; + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) @@ -2556,13 +2559,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); diff --git a/net/devlink/param.c b/net/devlink/param.c index 41dcc86cfd944..62fd789ae01c7 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -102,6 +102,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, + .name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0065b1996c947..a1d599345641f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -426,15 +426,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net 
*net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; + struct net_device *dev, *indev = skb->dev; + int ret_val; + rcu_read_lock(); + dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip_finish_output, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); + rcu_read_unlock(); + return ret_val; } EXPORT_SYMBOL(ip_output); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c4ffbf26c17bb..2ae940719371b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1021,7 +1021,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1307,7 +1307,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5a364b3521153..707e0df951ac0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -70,15 +70,12 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. 
*/ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive because we hold rcu_read_lock(). */ skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); return -ENOMEM; } - rcu_read_unlock(); } hdr = ipv6_hdr(skb); @@ -123,7 +120,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); @@ -131,7 +127,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { - rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; @@ -139,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); - rcu_read_unlock(); return ret; } @@ -232,22 +226,30 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev, *indev = skb->dev; + struct inet6_dev *idev; + int ret; skb->protocol = htons(ETH_P_IPV6); + rcu_read_lock(); + dev = dst_dev_rcu(dst); + idev = ip6_dst_idev(dst); skb->dev = dev; if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, 
indev, dev, - ip6_finish_output, - !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_output); @@ -267,35 +269,36 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { - struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); + struct net *net = sock_net(sk); unsigned int head_room; + struct net_device *dev; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; - int hlimit = -1; + int ret, hlimit = -1; u32 mtu; + rcu_read_lock(); + + dev = dst_dev_rcu(dst); head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive while we hold rcu_read_lock(). */ skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); - return -ENOBUFS; + ret = -ENOBUFS; + goto unlock; } - rcu_read_unlock(); } if (opt) { @@ -357,17 +360,21 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); - if (unlikely(!skb)) - return 0; + if (unlikely(!skb)) { + ret = 0; + goto unlock; + } /* hooks should never assume socket lock is held. 
* we promote our socket to non const */ - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dev, - dst_output); + ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dev, + dst_output); + goto unlock; } + ret = -EMSGSIZE; skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const @@ -376,7 +383,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); - return -EMSGSIZE; +unlock: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_xmit); diff --git a/redhat/kabi/kabi-module/kabi_x86_64/__kabi__alt_instr b/redhat/kabi/kabi-module/kabi_x86_64/__kabi__alt_instr new file mode 100644 index 0000000000000..aae86e18ae909 --- /dev/null +++ b/redhat/kabi/kabi-module/kabi_x86_64/__kabi__alt_instr @@ -0,0 +1,2 @@ +#1- +0xe08433f8 __kabi__alt_instr vmlinux EXPORT_SYMBOL_GPL diff --git a/redhat/kernel.changelog-10.1 b/redhat/kernel.changelog-10.1 index 5028b7f7f4fb8..f0e60c613d7e4 100644 --- a/redhat/kernel.changelog-10.1 +++ b/redhat/kernel.changelog-10.1 @@ -1,3 +1,49 @@ +* Thu Jan 22 2026 CKI KWF Bot [6.12.0-124.31.1.el10_1] +- i40e: support generic devlink param "max_mac_per_vf" (Mohammad Heib) [RHEL-121647] +- devlink: Add new "max_mac_per_vf" generic device param (Mohammad Heib) [RHEL-121647] +- i40e: improve VF MAC filters accounting (Mohammad Heib) [RHEL-121647] +- KVM: arm64: Hide ID_AA64MMFR2_EL1.NV from guest and userspace (Donald Dutile) [RHEL-134763] +- scsi: st: Skip buffer flush for information ioctls (Ewan D. Milne) [RHEL-136289] +- scsi: st: Separate st-unique ioctl handling from SCSI common ioctl handling (Ewan D. Milne) [RHEL-136289] +- scsi: st: Don't set pos_unknown just after device recognition (Ewan D. Milne) [RHEL-136289] +- scsi: st: New session only when Unit Attention for new tape (Ewan D. 
Milne) [RHEL-136289] +- scsi: st: Add MTIOCGET and MTLOAD to ioctls allowed after device reset (Ewan D. Milne) [RHEL-136289] +- scsi: st: Don't modify unknown block number in MTIOCGET (Ewan D. Milne) [RHEL-136289] +- xfs: rework datasync tracking and execution (CKI Backport Bot) [RHEL-126599] +- xfs: rearrange code in xfs_inode_item_precommit (CKI Backport Bot) [RHEL-126599] +- s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP (Luiz Capitulino) [RHEL-133336] +- s390: mm: add stub for hugetlb_optimize_vmemmap_key (Luiz Capitulino) [RHEL-133336] +- x86/mm/init: Handle the special case of device private pages in add_pages(), to not increase max_pfn and trigger dma_addressing_limited() bounce buffers (Ricardo Robaina) [RHEL-129452] +- x86/kaslr: Reduce KASLR entropy on most x86 systems (Ricardo Robaina) [RHEL-129452] +- x86/boot/compressed: Remove unused header includes from kaslr.c (Ricardo Robaina) [RHEL-129452] +- RDMA/core: Fix "KASAN: slab-use-after-free Read in ib_register_device" problem (CKI Backport Bot) [RHEL-134363] {CVE-2025-38022} +- uprobes: Fix race in uprobe_free_utask (Jay Shin) [RHEL-133456] +- ASoC: Intel: bytcr_rt5640: Fix invalid quirk input mapping (CKI Backport Bot) [RHEL-129115] {CVE-2025-40154} +Resolves: RHEL-121647, RHEL-122759, RHEL-126599, RHEL-129115, RHEL-129452, RHEL-133336, RHEL-133456, RHEL-134363, RHEL-134763, RHEL-136289 + +* Wed Jan 21 2026 CKI KWF Bot [6.12.0-124.30.1.el10_1] +- io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU (Jeff Moyer) [RHEL-129623] {CVE-2025-38453} +- net: atlantic: fix fragment overflow handling in RX path (CKI Backport Bot) [RHEL-139490] {CVE-2025-68301} +- Bluetooth: hci_sock: Prevent race in socket write iter and sock bind (CKI Backport Bot) [RHEL-139465] {CVE-2025-68305} +- vsock: Ignore signal/timeout on connect() if already established (CKI Backport Bot) [RHEL-139287] {CVE-2025-40248} +- net: use dst_dev_rcu() in sk_setup_caps() (Hangbin Liu) [RHEL-129087] {CVE-2025-40170} +- ipv6: use RCU 
in ip6_xmit() (Hangbin Liu) [RHEL-129026] {CVE-2025-40135} +- ipv6: use RCU in ip6_output() (Hangbin Liu) [RHEL-128991] {CVE-2025-40158} +- net: dst: introduce dst->dev_rcu (Hangbin Liu) [RHEL-129026] +- net: Add locking to protect skb->dev access in ip_output (Hangbin Liu) [RHEL-129026] +- net: dst: add four helpers to annotate data-races around dst->dev (Hangbin Liu) [RHEL-129026] +- eventpoll: don't decrement ep refcount while still holding the ep mutex (CKI Backport Bot) [RHEL-138041] {CVE-2025-38349} +- fs/proc: fix uaf in proc_readdir_de() (CKI Backport Bot) [RHEL-137101] {CVE-2025-40271} +- Bluetooth: MGMT: Fix OOB access in parse_adv_monitor_pattern() (CKI Backport Bot) [RHEL-136972] {CVE-2025-40294} +- Bluetooth: hci_event: validate skb length for unknown CC opcode (CKI Backport Bot) [RHEL-136951] {CVE-2025-40301} +- net/sched: mqprio: fix stack out-of-bounds write in tc entry parsing (CKI Backport Bot) [RHEL-136836] {CVE-2025-38568} +- Bluetooth: hci_sync: fix race in hci_cmd_sync_dequeue_once (CKI Backport Bot) [RHEL-136259] {CVE-2025-40318} +- devlink: rate: Unset parent pointer in devl_rate_nodes_destroy (CKI Backport Bot) [RHEL-134926] {CVE-2025-40251} +- mptcp: fix race condition in mptcp_schedule_work() (CKI Backport Bot) [RHEL-134451] {CVE-2025-40258} +- irqchip/gic-v2m: Prevent use after free of gicv2m_get_fwnode() (CKI Backport Bot) [RHEL-131989] {CVE-2025-37819} +- drm/xe: Fix vm_bind_ioctl double free bug (Anusha Srivatsa) [RHEL-122312] {CVE-2025-38731} +Resolves: RHEL-122312, RHEL-128991, RHEL-129026, RHEL-129087, RHEL-129623, RHEL-131989, RHEL-134451, RHEL-134926, RHEL-136259, RHEL-136836, RHEL-136951, RHEL-136972, RHEL-137101, RHEL-138041, RHEL-139287, RHEL-139465, RHEL-139490 + * Sat Jan 10 2026 CKI KWF Bot [6.12.0-124.29.1.el10_1] - gitlab-ci: use rhel10.1 builder image (Michael Hofmann) - mm/vmalloc: fix data race in show_numa_info() (Waiman Long) [RHEL-137997] {CVE-2025-38383} diff --git a/uki-addons.sbat b/uki-addons.sbat index 
2c7cc053d3361..6a50af964495e 100644 --- a/uki-addons.sbat +++ b/uki-addons.sbat @@ -1,3 +1,3 @@ sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md -kernel-uki-virt-addons.rhel,1,Red Hat,kernel-uki-virt-addons,6.12.0-124.29.1.el10_1.x86_64,mailto:secalert@redhat.com -kernel-uki-virt-addons.rocky,1,RESF,kernel-uki-virt-addons,6.12.0-124.29.1.el10_1.x86_64,mailto:security@rockylinux.org +kernel-uki-virt-addons.rhel,1,Red Hat,kernel-uki-virt-addons,6.12.0-124.31.1.el10_1.x86_64,mailto:secalert@redhat.com +kernel-uki-virt-addons.rocky,1,RESF,kernel-uki-virt-addons,6.12.0-124.31.1.el10_1.x86_64,mailto:security@rockylinux.org diff --git a/uki.sbat b/uki.sbat index 96a828721eb06..a3f2d6af4cd48 100644 --- a/uki.sbat +++ b/uki.sbat @@ -1,3 +1,3 @@ sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md -kernel-uki-virt.rhel,1,Red Hat,kernel-uki-virt,6.12.0-124.29.1.el10_1.x86_64,mailto:secalert@redhat.com -kernel-uki-virt.rocky,1,RESF,kernel-uki-virt,6.12.0-124.29.1.el10_1.x86_64,mailto:security@rockylinux.org +kernel-uki-virt.rhel,1,Red Hat,kernel-uki-virt,6.12.0-124.31.1.el10_1.x86_64,mailto:secalert@redhat.com +kernel-uki-virt.rocky,1,RESF,kernel-uki-virt,6.12.0-124.31.1.el10_1.x86_64,mailto:security@rockylinux.org