From 11fa4cfe7134f44f2cdac4b25636fc3291096979 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 8 Nov 2024 08:07:37 -0500
Subject: KVM: x86: switch hugepage recovery thread to vhost_task

kvm_vm_create_worker_thread() is meant to be used for kthreads that
can consume significant amounts of CPU time on behalf of a VM or in
response to how the VM behaves (for example how it accesses its memory).
Therefore it wants to charge the CPU time consumed by that work to
the VM's container.

However, because of these threads, cgroups which have kvm instances
inside never complete freezing. This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $fish_pid > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 --nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezer acts in the signal delivery path, but the
kvm_vm_worker_thread() kthreads never call into the signal delivery
path despite joining non-root cgroups, so they never get frozen.
Because the cgroup freezer determines whether a given cgroup is frozen
by comparing the number of frozen threads to the total number of
threads in the cgroup, the cgroup never becomes frozen and users
waiting for the state transition may hang indefinitely.

Since the worker kthread is tied to a user process, it's better if
it behaves similarly to user tasks as much as possible, including
being stoppable and resumable with SIGSTOP and SIGCONT. In fact,
vhost_task is all that kvm_vm_create_worker_thread() wanted to be
and more: not only does it inherit the userspace process's cgroups,
it has other niceties like being parented properly in the process
tree. Use it instead of the homegrown alternative.
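
For reference, the vhost_task API that the patch switches to is
declared in <linux/sched/vhost_task.h>. In sketch form (prototypes
paraphrased from that header, with the worker contract spelled out
in the comment):

  /*
   * The task invokes fn in a loop: returning true asks to be called
   * again, returning false puts the task to sleep until the next
   * vhost_task_wake().  handle_sigkill is invoked once if the task
   * receives a fatal signal, e.g. when the owning process dies.
   */
  struct vhost_task *vhost_task_create(bool (*fn)(void *),
                                       void (*handle_sigkill)(void *),
                                       void *arg, const char *name);
  void vhost_task_start(struct vhost_task *vtsk);
  void vhost_task_wake(struct vhost_task *vtsk);
  void vhost_task_stop(struct vhost_task *vtsk);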

(Commit message based on emails from Tejun).

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Luca Boccassi <bluca@debian.org>
Tested-by: Luca Boccassi <bluca@debian.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/Kconfig            |   1 +
 arch/x86/kvm/mmu/mmu.c          |  67 +++++++++++----------
 include/linux/kvm_host.h        |   6 --
 virt/kvm/kvm_main.c             | 103 --------------------------------
 5 files changed, 39 insertions(+), 142 deletions(-)

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>
 
 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1445,7 +1446,8 @@ struct kvm_arch {
 	bool sgx_provisioning_allowed;
 
 	struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-	struct task_struct *nx_huge_page_recovery_thread;
+	struct vhost_task *nx_huge_page_recovery_thread;
+	u64 nx_huge_page_next;
 
 #ifdef CONFIG_X86_64
 	/* The number of TDP MMU pages across all roots. */
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_READONLY_MEM
+	select VHOST_TASK
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7160,7 +7160,7 @@ static int set_nx_huge_pages(const char
 			kvm_mmu_zap_all_fast(kvm);
 			mutex_unlock(&kvm->slots_lock);
 
-			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 		}
 		mutex_unlock(&kvm_lock);
 	}
@@ -7306,7 +7306,7 @@ static int set_nx_huge_pages_recovery_pa
 	mutex_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list)
-		wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+		vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 
 	mutex_unlock(&kvm_lock);
 }
@@ -7409,62 +7409,65 @@ static void kvm_recover_nx_huge_pages(st
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+#define NX_HUGE_PAGE_DISABLED (-1)
+
+static u64 get_nx_huge_page_recovery_next(void)
 {
 	bool enabled;
 	uint period;
 
 	enabled = calc_nx_huge_pages_recovery_period(&period);
 
-	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-		       : MAX_SCHEDULE_TIMEOUT;
+	return enabled ? get_jiffies_64() + msecs_to_jiffies(period)
+		       : NX_HUGE_PAGE_DISABLED;
 }
 
-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-	u64 start_time;
-	long remaining_time;
-
-	while (true) {
-		start_time = get_jiffies_64();
-		remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		while (!kthread_should_stop() && remaining_time > 0) {
-			schedule_timeout(remaining_time);
-			remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-			set_current_state(TASK_INTERRUPTIBLE);
-		}
+}
 
-		set_current_state(TASK_RUNNING);
+static bool kvm_nx_huge_page_recovery_worker(void *data)
+{
+	struct kvm *kvm = data;
+	long remaining_time;
 
-		if (kthread_should_stop())
-			return 0;
+	if (kvm->arch.nx_huge_page_next == NX_HUGE_PAGE_DISABLED)
+		return false;
 
-		kvm_recover_nx_huge_pages(kvm);
+	remaining_time = kvm->arch.nx_huge_page_next - get_jiffies_64();
+	if (remaining_time > 0) {
+		schedule_timeout(remaining_time);
+		/* check for signals and come back */
+		return true;
 	}
+
+	__set_current_state(TASK_RUNNING);
+	kvm_recover_nx_huge_pages(kvm);
+	kvm->arch.nx_huge_page_next = get_nx_huge_page_recovery_next();
+	return true;
 }
 
 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-	int err;
-
 	if (nx_hugepage_mitigation_hard_disabled)
 		return 0;
 
-	err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-					  "kvm-nx-lpage-recovery",
-					  &kvm->arch.nx_huge_page_recovery_thread);
-	if (!err)
-		kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+	kvm->arch.nx_huge_page_next = get_nx_huge_page_recovery_next();
+	kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+		kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+		kvm, "kvm-nx-lpage-recovery");
 
-	return err;
+	if (!kvm->arch.nx_huge_page_recovery_thread)
+		return -ENOMEM;
+
+	vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+	return 0;
 }
 
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
 	if (kvm->arch.nx_huge_page_recovery_thread)
-		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+		vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2370,12 +2370,6 @@ static inline int kvm_arch_vcpu_run_pid_
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
 
-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-				uintptr_t data, const char *name,
-				struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6573,106 +6573,3 @@ void kvm_exit(void)
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-	struct kvm *kvm;
-	struct task_struct *parent;
-	struct completion init_done;
-	kvm_vm_thread_fn_t thread_fn;
-	uintptr_t data;
-	int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-	/*
-	 * The init_context is allocated on the stack of the parent thread, so
-	 * we have to locally copy anything that is needed beyond initialization
-	 */
-	struct kvm_vm_worker_thread_context *init_context = context;
-	struct task_struct *parent;
-	struct kvm *kvm = init_context->kvm;
-	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-	uintptr_t data = init_context->data;
-	int err;
-
-	err = kthread_park(current);
-	/* kthread_park(current) is never supposed to return an error */
-	WARN_ON(err != 0);
-	if (err)
-		goto init_complete;
-
-	err = cgroup_attach_task_all(init_context->parent, current);
-	if (err) {
-		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-			__func__, err);
-		goto init_complete;
-	}
-
-	set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-	init_context->err = err;
-	complete(&init_context->init_done);
-	init_context = NULL;
-
-	if (err)
-		goto out;
-
-	/* Wait to be woken up by the spawner before proceeding. */
-	kthread_parkme();
-
-	if (!kthread_should_stop())
-		err = thread_fn(kvm, data);
-
-out:
-	/*
-	 * Move kthread back to its original cgroup to prevent it lingering in
-	 * the cgroup of the VM process, after the latter finishes its
-	 * execution.
-	 *
-	 * kthread_stop() waits on the 'exited' completion condition which is
-	 * set in exit_mm(), via mm_release(), in do_exit(). However, the
-	 * kthread is removed from the cgroup in the cgroup_exit() which is
-	 * called after the exit_mm(). This causes the kthread_stop() to return
-	 * before the kthread actually quits the cgroup.
-	 */
-	rcu_read_lock();
-	parent = rcu_dereference(current->real_parent);
-	get_task_struct(parent);
-	rcu_read_unlock();
-	cgroup_attach_task_all(parent, current);
-	put_task_struct(parent);
-
-	return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-				uintptr_t data, const char *name,
-				struct task_struct **thread_ptr)
-{
-	struct kvm_vm_worker_thread_context init_context = {};
-	struct task_struct *thread;
-
-	*thread_ptr = NULL;
-	init_context.kvm = kvm;
-	init_context.parent = current;
-	init_context.thread_fn = thread_fn;
-	init_context.data = data;
-	init_completion(&init_context.init_done);
-
-	thread = kthread_run(kvm_vm_worker_thread, &init_context,
-			     "%s-%d", name, task_pid_nr(current));
-	if (IS_ERR(thread))
-		return PTR_ERR(thread);
-
-	/* kthread_run is never supposed to return NULL */
-	WARN_ON(thread == NULL);
-
-	wait_for_completion(&init_context.init_done);
-
-	if (!init_context.err)
-		*thread_ptr = thread;
-
-	return init_context.err;
-}
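
A note on the worker contract used above: vhost_task invokes the worker
function repeatedly, so kvm_nx_huge_page_recovery_worker() does one step
per call and uses its return value to ask for another invocation. A
hypothetical worker (my_ctx, my_worker, and the work-tracking field are
invented for illustration) would follow the same shape:

  struct my_ctx {
          bool work_pending;          /* hypothetical work-tracking flag */
  };

  static bool my_worker(void *data)
  {
          struct my_ctx *ctx = data;  /* the arg given to vhost_task_create() */

          if (!ctx->work_pending)
                  return false;       /* idle: sleep until vhost_task_wake() */

          /* ... do one bounded chunk of work ... */
          ctx->work_pending = false;
          return true;                /* ask to be invoked again */
  }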