From 11fa4cfe7134f44f2cdac4b25636fc3291096979 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 8 Nov 2024 08:07:37 -0500
Subject: KVM: x86: switch hugepage recovery thread to vhost_task

kvm_vm_create_worker_thread() is meant to be used for kthreads that
can consume significant amounts of CPU time on behalf of a VM or in
response to how the VM behaves (for example how it accesses its memory).
Therefore it wants to charge the CPU time consumed by that work to
the VM's container.

However, because of these threads, cgroups which have kvm instances
inside never complete freezing. This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $fish_pid > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 --nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezing happens in the signal delivery path, but
kvm_vm_worker_thread() never calls into the signal delivery path even
while joined to non-root cgroups, so it never gets frozen. Because the
cgroup freezer determines whether a given cgroup is frozen by comparing
the number of frozen threads to the total number of threads in the
cgroup, the cgroup never becomes frozen and users waiting for the state
transition may hang indefinitely.

Since the worker kthread is tied to a user process, it's better if it
behaves as similarly to user tasks as possible, including being subject
to SIGSTOP and SIGCONT. In fact, vhost_task is all that
kvm_vm_create_worker_thread() wanted to be and more: not only does it
inherit the userspace process's cgroups, it has other niceties like
being parented properly in the process tree. Use it instead of the
homegrown alternative.

(Commit message based on emails from Tejun.)

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Luca Boccassi <bluca@debian.org>
Tested-by: Luca Boccassi <bluca@debian.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/Kconfig            |   1 +
 arch/x86/kvm/mmu/mmu.c          |  67 +++++++++++----------
 include/linux/kvm_host.h        |   6 --
 virt/kvm/kvm_main.c             | 103 --------------------------------
 5 files changed, 39 insertions(+), 142 deletions(-)
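
For reference, this is the shape of the vhost_task API the patch moves
to. This is a sketch, not the header itself: the signatures are assumed
from include/linux/sched/vhost_task.h as of this series, and the
comments paraphrase its contract:

	struct vhost_task;

	/*
	 * Create a task, initially stopped, that repeatedly calls fn(arg).
	 * The task inherits the caller's cgroups and is parented to the
	 * caller in the process tree; handle_sigkill(arg) is invoked if
	 * the task gets killed before vhost_task_stop().
	 */
	struct vhost_task *vhost_task_create(bool (*fn)(void *),
					     void (*handle_sigkill)(void *),
					     void *arg, const char *name);
	void vhost_task_start(struct vhost_task *vtsk); /* begin calling fn */
	void vhost_task_wake(struct vhost_task *vtsk);  /* rouse a sleeping task */
	void vhost_task_stop(struct vhost_task *vtsk);  /* stop and wait for exit */
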
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>

 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1445,7 +1446,8 @@ struct kvm_arch {
 	bool sgx_provisioning_allowed;

 	struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-	struct task_struct *nx_huge_page_recovery_thread;
+	struct vhost_task *nx_huge_page_recovery_thread;
+	u64 nx_huge_page_next;

 #ifdef CONFIG_X86_64
 	/* The number of TDP MMU pages across all roots. */
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_READONLY_MEM
+	select VHOST_TASK
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7160,7 +7160,7 @@ static int set_nx_huge_pages(const char
 			kvm_mmu_zap_all_fast(kvm);
 			mutex_unlock(&kvm->slots_lock);

-			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 		}
 		mutex_unlock(&kvm_lock);
 	}
@@ -7306,7 +7306,7 @@ static int set_nx_huge_pages_recovery_pa
 		mutex_lock(&kvm_lock);

 		list_for_each_entry(kvm, &vm_list, vm_list)
-			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);

 		mutex_unlock(&kvm_lock);
 	}
@@ -7409,62 +7409,65 @@ static void kvm_recover_nx_huge_pages(st
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }

-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+#define NX_HUGE_PAGE_DISABLED (-1)
+
+static u64 get_nx_huge_page_recovery_next(void)
 {
 	bool enabled;
 	uint period;

 	enabled = calc_nx_huge_pages_recovery_period(&period);

-	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-		       : MAX_SCHEDULE_TIMEOUT;
+	return enabled ? get_jiffies_64() + msecs_to_jiffies(period)
+		       : NX_HUGE_PAGE_DISABLED;
 }

-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-	u64 start_time;
-	long remaining_time;
-
-	while (true) {
-		start_time = get_jiffies_64();
-		remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		while (!kthread_should_stop() && remaining_time > 0) {
-			schedule_timeout(remaining_time);
-			remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-			set_current_state(TASK_INTERRUPTIBLE);
-		}
+}

-		set_current_state(TASK_RUNNING);
+static bool kvm_nx_huge_page_recovery_worker(void *data)
+{
+	struct kvm *kvm = data;
+	long remaining_time;

-		if (kthread_should_stop())
-			return 0;
+	if (kvm->arch.nx_huge_page_next == NX_HUGE_PAGE_DISABLED)
+		return false;

-		kvm_recover_nx_huge_pages(kvm);
+	remaining_time = kvm->arch.nx_huge_page_next - get_jiffies_64();
+	if (remaining_time > 0) {
+		schedule_timeout(remaining_time);
+		/* check for signals and come back */
+		return true;
 	}
+
+	__set_current_state(TASK_RUNNING);
+	kvm_recover_nx_huge_pages(kvm);
+	kvm->arch.nx_huge_page_next = get_nx_huge_page_recovery_next();
+	return true;
 }

 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-	int err;
-
 	if (nx_hugepage_mitigation_hard_disabled)
 		return 0;

-	err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-					  "kvm-nx-lpage-recovery",
-					  &kvm->arch.nx_huge_page_recovery_thread);
-	if (!err)
-		kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+	kvm->arch.nx_huge_page_next = get_nx_huge_page_recovery_next();
+	kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+		kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+		kvm, "kvm-nx-lpage-recovery");

-	return err;
+	if (!kvm->arch.nx_huge_page_recovery_thread)
+		return -ENOMEM;
+
+	vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+	return 0;
 }

 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
 	if (kvm->arch.nx_huge_page_recovery_thread)
-		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+		vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }

 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2370,12 +2370,6 @@ static inline int kvm_arch_vcpu_run_pid_
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */

-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-				uintptr_t data, const char *name,
-				struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6573,106 +6573,3 @@ void kvm_exit(void)
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-	struct kvm *kvm;
-	struct task_struct *parent;
-	struct completion init_done;
-	kvm_vm_thread_fn_t thread_fn;
-	uintptr_t data;
-	int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-	/*
-	 * The init_context is allocated on the stack of the parent thread, so
-	 * we have to locally copy anything that is needed beyond initialization
-	 */
-	struct kvm_vm_worker_thread_context *init_context = context;
-	struct task_struct *parent;
-	struct kvm *kvm = init_context->kvm;
-	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-	uintptr_t data = init_context->data;
-	int err;
-
-	err = kthread_park(current);
-	/* kthread_park(current) is never supposed to return an error */
-	WARN_ON(err != 0);
-	if (err)
-		goto init_complete;
-
-	err = cgroup_attach_task_all(init_context->parent, current);
-	if (err) {
-		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-			__func__, err);
-		goto init_complete;
-	}
-
-	set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-	init_context->err = err;
-	complete(&init_context->init_done);
-	init_context = NULL;
-
-	if (err)
-		goto out;
-
-	/* Wait to be woken up by the spawner before proceeding. */
-	kthread_parkme();
-
-	if (!kthread_should_stop())
-		err = thread_fn(kvm, data);
-
-out:
-	/*
-	 * Move kthread back to its original cgroup to prevent it lingering in
-	 * the cgroup of the VM process, after the latter finishes its
-	 * execution.
-	 *
-	 * kthread_stop() waits on the 'exited' completion condition which is
-	 * set in exit_mm(), via mm_release(), in do_exit(). However, the
-	 * kthread is removed from the cgroup in the cgroup_exit() which is
-	 * called after the exit_mm(). This causes the kthread_stop() to return
-	 * before the kthread actually quits the cgroup.
-	 */
-	rcu_read_lock();
-	parent = rcu_dereference(current->real_parent);
-	get_task_struct(parent);
-	rcu_read_unlock();
-	cgroup_attach_task_all(parent, current);
-	put_task_struct(parent);
-
-	return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-				uintptr_t data, const char *name,
-				struct task_struct **thread_ptr)
-{
-	struct kvm_vm_worker_thread_context init_context = {};
-	struct task_struct *thread;
-
-	*thread_ptr = NULL;
-	init_context.kvm = kvm;
-	init_context.parent = current;
-	init_context.thread_fn = thread_fn;
-	init_context.data = data;
-	init_completion(&init_context.init_done);
-
-	thread = kthread_run(kvm_vm_worker_thread, &init_context,
-			     "%s-%d", name, task_pid_nr(current));
-	if (IS_ERR(thread))
-		return PTR_ERR(thread);
-
-	/* kthread_run is never supposed to return NULL */
-	WARN_ON(thread == NULL);
-
-	wait_for_completion(&init_context.init_done);
-
-	if (!init_context.err)
-		*thread_ptr = thread;
-
-	return init_context.err;
-}
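
A note on the worker contract assumed above: the vhost task core puts
the task into TASK_INTERRUPTIBLE before every call into fn, which is why
kvm_nx_huge_page_recovery_worker() can call schedule_timeout() directly
and must move itself back to TASK_RUNNING before doing the actual
recovery work. Returning true asks to be called again after a signal
check; returning false parks the task until vhost_task_wake(). Roughly,
as a conceptual sketch of kernel/vhost_task.c at the time of this series
(a paraphrase, not a quote):

	for (;;) {
		if (signal_pending(current)) {
			struct ksignal ksig;

			/* cgroup freezing and SIGKILL are handled here */
			if (get_signal(&ksig))
				break;
		}
		set_current_state(TASK_INTERRUPTIBLE);
		if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) {
			__set_current_state(TASK_RUNNING);
			break;		/* vhost_task_stop() was called */
		}
		if (!vtsk->fn(vtsk->data))
			schedule();	/* idle until vhost_task_wake() */
	}

Because the task now passes through the signal delivery path, the cgroup
freezer can trap it there, which is what fixes the reproducer in the
commit message.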