VM returning memory patchset
From 995119a1dc3161e13a9039625a33f9b630fcb1ef Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 27 Feb 2017 11:46:40 -0800
Subject: [PATCH 0/4] Return unused memory to the host system
This series addresses a common issue in the cloud context: preventing
any virtual machine running on a host system from holding on to unused
resources, and particularly unused memory.

A ballooning mechanism exists, but it is not efficient enough: it waits
for the memory limits of the system to be hit, then the host tries to
predict how much memory can be asked back from a guest, and finally
issues a hypercall to retrieve this memory.

The guest knows precisely the amount of memory it can return, which is
why the solution proposed by this series relies on the guest notifying
the host about the memory that can be released. It implements a new
hypercall which allows a guest to notify the host that some memory
blocks are no longer used and can be discarded. The guest issues this
hypercall when it sees large blocks (>=512k) merged in the buddy
allocator.

The host implementation of this new hypercall relies on the
MADV_DONTNEED or MADV_FREE advice, delaying the memory release if that
memory is not needed by the host, and therefore saving future
hypercalls in case the guest needs to use that memory again.
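For illustration only (this program is not part of the series), the
userspace analogue below shows the MADV_DONTNEED semantics the
host-side handler builds on: the advised range stays mapped, and the
next access simply faults in fresh zero-filled pages, so no extra
hypercall is needed when the guest touches that memory again.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 19;   /* 512k, the smallest block the guest reports */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0xaa, len);                 /* memory is in use */
        madvise(buf, len, MADV_DONTNEED);       /* kernel may drop the pages now */
        printf("%d\n", buf[0]);                 /* prints 0: faulted back in fresh */

        munmap(buf, len);
        return 0;
}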
Idle VMs hold on to cache that they are not using. They gain no
performance advantage from keeping it, and this deprives the rest of
the system of memory it could use effectively.

The /proc/sys/vm/drop_caches sysctl allows releasing all of the
system's cache, but it is too aggressive: if a previously idle VM
suddenly wakes up, it won't find any cached data, which lowers system
performance.

That is why this series introduces a new sysctl which is essentially a
more fine-grained /proc/sys/vm/drop_caches. It allows userspace to
drop a specified amount of cache instead of the whole thing. Combined
with a proper userspace application (see the sketch below), it makes
it possible to gradually reduce the amount of cache on the system when
no workload needs it.
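As a rough illustration of the intended usage (not part of the series;
the amount written here is arbitrary), a userspace agent simply writes
the number of MiB to drop into the sysctl added by patch 4:

#include <stdio.h>

int main(void)
{
        /* Equivalent to: echo 128 > /proc/sys/vm/shrink_caches_mb */
        FILE *f = fopen("/proc/sys/vm/shrink_caches_mb", "w");

        if (!f)
                return 1;
        fprintf(f, "128\n");
        fclose(f);
        return 0;
}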
Sebastien Boeuf (4):
  mm: Export do_madvise()
  x86: kvm: Notify host to release pages
  x86: Return memory from guest to host kernel
  sysctl: vm: Fine-grained cache shrinking

 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
 arch/x86/kernel/kvm.c           | 10 ++++++++++
 arch/x86/kvm/x86.c              | 17 +++++++++++++++++
 fs/drop_caches.c                | 25 +++++++++++++++++++++++++
 include/linux/mm-arch-hooks.h   |  8 ++++++++
 include/linux/mm.h              | 11 +++++++++++
 include/uapi/linux/kvm_para.h   |  3 +++
 kernel/sysctl.c                 | 15 +++++++++++++++
 mm/Makefile                     |  2 +-
 mm/kvm.c                        | 25 +++++++++++++++++++++++++
 mm/madvise.c                    | 25 +++++++++++++++++++++----
 mm/page_alloc.c                 |  2 ++
 mm/vmscan.c                     |  2 --
 13 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 mm/kvm.c

--
2.7.4
From 7c9d5a3736f2daa9d3d3732ca8593e8e17fb1452 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:03:52 -0800
Subject: [PATCH 1/4] mm: Export do_madvise()
Combined with the right flags, the madvise() system call allows memory
to be freed more smartly and more efficiently than a simple free()
would. The issue is that it is not available to kernel modules that
could need it.

In order to fix this lack of support, this patch exports do_madvise()
so as to make it available to the rest of the kernel. The already
existing madvise() system call is unchanged and now relies on this new
do_madvise() function.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 include/linux/mm.h |  2 ++
 mm/madvise.c       | 25 +++++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)
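A minimal sketch (not part of this patch) of how an in-kernel caller
might use the newly exported helper; the function name is hypothetical
and it assumes 'start' and 'len' describe a valid, page-aligned range
of the calling task's own address space:

#include <linux/mm.h>
#include <linux/mman.h>

/* Advise the kernel that a range of the current task's address space
 * can be dropped; the pages are faulted back in on the next access. */
static int example_release_range(unsigned long start, size_t len)
{
        return do_madvise(start, len, MADV_DONTNEED);
}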
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0d65dd7..4eeaf11 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2514,5 +2514,7 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index dc5927c..b57f884 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -658,9 +658,7 @@ madvise_behavior_valid(int behavior)
}
/*
- * The madvise(2) system call.
- *
- * Applications can use madvise() to advise the kernel how it should
+ * Kernel modules can use do_madvise() to advise the kernel how it should
* handle paging I/O in this VM area. The idea is to help the kernel
* use appropriate read-ahead and caching techniques. The information
* provided is advisory only, and can be safely disregarded by the
@@ -713,7 +711,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+int do_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -807,3 +805,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return error;
}
+EXPORT_SYMBOL_GPL(do_madvise);
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() system call to advise the kernel how
+ * it should handle paging I/O in this VM area. The idea is to help
+ * the kernel use appropriate read-ahead and caching techniques. The
+ * information provided is advisory only, and can be safely disregarded
+ * by the kernel without affecting the correct operation of the application.
+ *
+ * behavior values are the same than the ones defined in madvise()
+ *
+ * return values are the same than the ones defined in madvise()
+ */
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+ return do_madvise(start, len_in, behavior);
+}
--
2.7.4
From 331df67207a23107104ff232ea2097f557250431 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:08:55 -0800
Subject: [PATCH 2/4] x86: kvm: Notify host to release pages
In the context of a hypervisor managing several virtual machines, we
want those virtual machines to give the memory they have used back to
the host when they don't need it anymore.

This patch introduces a new hypercall, KVM_HC_RETURN_MEM, allowing the
guest kernel to notify the host kernel when such an event occurs.
Relying on the do_madvise() function that the previous patch exported,
the host issues a call to this function when it receives the new
hypercall.

Using do_madvise() with the MADV_DONTNEED advice allows the guest to
get a page back without going through another hypercall. Instead, it
can simply start using that memory again, as it will get faulted back
in as a fresh new page. That is why do_madvise() is more efficient
than using vm_unmap() to return some memory to the host.

This patch also introduces a new sysctl, kvm_madv_instant_free,
allowing the user to select the MADV_FREE advice instead of
MADV_DONTNEED. MADV_FREE performs better than MADV_DONTNEED because it
does not zero the pages when the memory has not actually been freed by
the kernel. This can happen when the kernel had no need to take this
memory back, meaning it kept those pages in the right state to be
re-used by the same application. MADV_FREE being a very recent advice,
introduced in kernel 4.5, it is only enabled through the sysctl when
the user wants to use it.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 arch/x86/kvm/x86.c            | 17 +++++++++++++++++
 include/linux/mm.h            |  5 +++++
 include/uapi/linux/kvm_para.h |  3 +++
 kernel/sysctl.c               |  7 +++++++
 mm/Makefile                   |  2 +-
 mm/kvm.c                      | 25 +++++++++++++++++++++++++
 6 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 mm/kvm.c
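For illustration only (not part of the patch), the userspace analogue
below shows why MADV_FREE can be cheaper than MADV_DONTNEED: unless the
kernel actually reclaimed the pages in the meantime, the old contents
are still there and nothing had to be zeroed or faulted back in.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 19;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0xaa, len);
#ifdef MADV_FREE
        madvise(buf, len, MADV_FREE);   /* lazily freeable, needs kernel >= 4.5 */
        /* Typically still prints 0xaa: the data survives until the kernel is
         * under enough memory pressure to actually reclaim the pages. */
        printf("0x%02x\n", (unsigned char)buf[0]);
#endif
        munmap(buf, len);
        return 0;
}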
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2a4b11..2f34d6e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,7 @@
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
@@ -6197,6 +6198,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
+static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len)
+{
+ unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+
+ if (len > KVM_MAX_RET_MEM_SIZE)
+ return KVM_EPERM;
+
+ if (kvm_is_error_hva(start + len))
+ return KVM_EFAULT;
+
+ return do_madvise(start, len, kvm_ret_mem_advice);
+}
+
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
vcpu->arch.apicv_active = false;
@@ -6248,6 +6262,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
#endif
+ case KVM_HC_RETURN_MEM:
+ ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1);
+ break;
default:
ret = -KVM_ENOSYS;
break;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4eeaf11..0627906 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2363,6 +2363,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+extern int sysctl_kvm_madv_instant_free;
+extern int kvm_ret_mem_advice;
+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
#endif
void drop_slab(void);
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index fed506a..030eecf2 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -25,6 +25,9 @@
#define KVM_HC_MIPS_EXIT_VM 7
#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
#define KVM_HC_CLOCK_PAIRING 9
+#define KVM_HC_RETURN_MEM 10
+
+#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB
/*
* hypercalls use architecture specific
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bb260ce..73dc325 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1384,6 +1384,13 @@ static struct ctl_table vm_table[] = {
.extra1 = &one,
.extra2 = &four,
},
+ {
+ .procname = "kvm_madv_instant_free",
+ .data = &sysctl_kvm_madv_instant_free,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = kvm_madv_instant_free_sysctl_handler,
+ },
#ifdef CONFIG_COMPACTION
{
.procname = "compact_memory",
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a8..28d95be 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o swap_slots.o \
interval_tree.o list_lru.o workingset.o \
- debug.o $(mmu-y)
+ debug.o kvm.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/kvm.c b/mm/kvm.c
new file mode 100644
index 0000000..8945f6a
--- /dev/null
+++ b/mm/kvm.c
@@ -0,0 +1,25 @@
+#include <linux/mman.h>
+
+int sysctl_kvm_madv_instant_free;
+
+int kvm_ret_mem_advice = MADV_DONTNEED;
+EXPORT_SYMBOL_GPL(kvm_ret_mem_advice);
+
+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+#ifdef MADV_FREE
+ if (sysctl_kvm_madv_instant_free > 0)
+ kvm_ret_mem_advice = MADV_FREE;
+ else
+ kvm_ret_mem_advice = MADV_DONTNEED;
+#endif
+
+ return 0;
+}
--
2.7.4
From e590bbdf83e715e7b95a40697c0d24317c757eaa Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:26:13 -0800
Subject: [PATCH 3/4] x86: Return memory from guest to host kernel
All virtual machines need memory to perform their various tasks, but
this memory is not released to the host once it is no longer used. We
have to wait for the termination of the virtual machine to get this
memory back on the host.

The ballooning mechanism is close but not designed for the same
purpose: when we hit the memory limits of the system, the host
predicts how much memory can be asked back from a guest, and it issues
a hypercall to retrieve this memory.

The solution proposed here is different because it does not wait for
the host to need memory before returning it, and the guest knows
precisely how much memory it can return.

The host side is notified about such a return through the new
KVM_HC_RETURN_MEM hypercall. In order to avoid overloading the CPU
with too many hypercalls, we only return memory blocks of order 7
(512k blocks) and higher. This value was found by running memory tests
using multiple threads allocating and freeing large amounts of memory.
Those tests were run for different order values, and 7 was the best
tradeoff between the number of hypercalls issued and the amount of
memory returned to the host.

In order to limit the performance impact of this code addition, we
only check whether a block is of order 7 or higher, which costs no
more than an additional function call and a branch.

Furthermore, this code has been added to the "merge" codepath of the
buddy allocator, which is not as sensitive as the "free" codepath. Not
every block going through the "free" codepath will end up in the
"merge" codepath, because some of them won't find their free buddy.
But this is a negligible amount, since the kernel does not use many
high-order blocks directly. Instead, those bigger blocks are often
broken into smaller chunks used as low-order blocks, and when those
small blocks are released, they go through the merge path.

Benchmarks such as ebizzy and will-it-scale have been run to make sure
this patch does not affect kernel performance, and no significant
differences were observed.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
 arch/x86/kernel/kvm.c           | 10 ++++++++++
 include/linux/mm-arch-hooks.h   |  8 ++++++++
 mm/page_alloc.c                 |  2 ++
 4 files changed, 42 insertions(+)
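As a quick sanity check of the sizes involved (illustration only,
assuming 4 KiB pages and the default x86 MAX_ORDER of 11), every block
reported from the merge path fits within the KVM_MAX_RET_MEM_SIZE cap
enforced on the host side by the previous patch:

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;   /* assumed 4 KiB pages */
        const unsigned long cap = 1UL << 22;    /* KVM_MAX_RET_MEM_SIZE: 4 MiB */
        unsigned int order;

        /* Orders 7 (RET_MEM_BUDDY_ORDER) through 10 (MAX_ORDER - 1) */
        for (order = 7; order <= 10; order++)
                printf("order %2u -> %4lu KiB, within cap: %s\n", order,
                       (page_size << order) >> 10,
                       (page_size << order) <= cap ? "yes" : "no");
        return 0;
}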
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index bc62e7c..4a2f6d1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
extern void kvm_disable_steal_time(void);
+void kvm_arch_return_memory(struct page *page, unsigned int order);
+
+/*
+ * This order has been found in an empirical way, running memory tests
+ * through many iterations to assess the number of hypercalls issued
+ * and the amount of memory returned. In case you change this order to
+ * 6 or 8, it should not impact your performances significantly.
+ *
+ * Smaller values lead to less memory waste, but consume more CPU on
+ * hypercalls. Larger values use less CPU, but do not as precisely
+ * inform the hypervisor of which memory is free.
+ */
+#define RET_MEM_BUDDY_ORDER 7
+
+static inline void arch_buddy_merge(struct page *page, unsigned int order)
+{
+ if (order < RET_MEM_BUDDY_ORDER)
+ return;
+
+ kvm_arch_return_memory(page, order);
+}
+#define arch_buddy_merge arch_buddy_merge
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init kvm_spinlock_init(void);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 14f65a5..ad424e9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -549,6 +549,16 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
+void kvm_arch_return_memory(struct page *page, unsigned int order)
+{
+ if (!kvm_para_available())
+ return;
+
+ kvm_hypercall2(KVM_HC_RETURN_MEM,
+ page_to_phys(page),
+ PAGE_SIZE << order);
+}
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
index 4efc3f5..26eb3a0 100644
--- a/include/linux/mm-arch-hooks.h
+++ b/include/linux/mm-arch-hooks.h
@@ -12,6 +12,7 @@
#define _LINUX_MM_ARCH_HOOKS_H
#include <asm/mm-arch-hooks.h>
+#include <asm/kvm_para.h>
#ifndef arch_remap
static inline void arch_remap(struct mm_struct *mm,
@@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm,
#define arch_remap arch_remap
#endif
+#ifndef arch_buddy_merge
+static inline void arch_buddy_merge(struct page *page, unsigned int order)
+{
+}
+#define arch_buddy_merge arch_buddy_merge
+#endif
+
#endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a7a6aac..137898e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -64,6 +64,7 @@
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
+#include <linux/mm-arch-hooks.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -856,6 +857,7 @@ static inline void __free_one_page(struct page *page,
}
done_merging:
+ arch_buddy_merge(page, order);
set_page_order(page, order);
/*
--
2.7.4
From 08a9e5c52859ca0f6ed05136d6d6550daf9a3ac1 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:32:39 -0800
Subject: [PATCH 4/4] sysctl: vm: Fine-grained cache shrinking
Lots of virtual machines are left idle for days until they are
terminated, and they can keep a large amount of memory in their cache,
meaning this memory cannot be used by other processes.

We tried to release this memory using the existing drop_caches sysctl,
but it led to the complete loss of a cache that could still have been
useful if the idle VM woke up: the process then cannot find any cached
data, and rebuilding it from scratch directly hurts performance.

Instead, the solution we want is based on gradually shrinking the
system cache over time. This patch adds a new sysctl, shrink_caches_mb,
allowing userspace applications to tell the kernel it should shrink
the system cache by up to the specified amount (in MiB).

An application called "memshrinker" uses this new mechanism. It runs
in the background and periodically releases a specified amount of
cache. This amount is based on the cache remaining on the system, and
the period is computed to follow a shrinking model. The result is a
lot of memory saved for the other processes running on the system. A
rough sketch of such an agent is shown after the diffstat below.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 fs/drop_caches.c   | 25 +++++++++++++++++++++++++
 include/linux/mm.h |  4 ++++
 kernel/sysctl.c    |  8 ++++++++
 mm/vmscan.c        |  2 --
 4 files changed, 37 insertions(+), 2 deletions(-)
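Illustration only, not part of the series: the original memshrinker
tool is not included here, so the policy below (drop a tenth of the
remaining page cache every minute) is purely an assumption about how
such an agent could drive the new sysctl.

#include <stdio.h>
#include <unistd.h>

/* Parse the "Cached:" line of /proc/meminfo and return it in MiB. */
static long cached_mb(void)
{
        char line[128];
        long kb = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "Cached: %ld kB", &kb) == 1)
                        break;
        fclose(f);
        return kb > 0 ? kb / 1024 : -1;
}

int main(void)
{
        for (;;) {
                long mb = cached_mb();

                if (mb > 0) {
                        FILE *f = fopen("/proc/sys/vm/shrink_caches_mb", "w");

                        if (f) {
                                fprintf(f, "%ld\n", mb / 10);
                                fclose(f);
                        }
                }
                sleep(60);
        }
        return 0;
}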
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b..f564dfc 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,10 +8,12 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include <linux/swap.h>
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
int sysctl_drop_caches;
+int sysctl_shrink_caches_mb;
static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
@@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
}
return 0;
}
+
+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int ret;
+ unsigned long nr_to_reclaim, page_reclaimed;
+
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+ nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE;
+ if (write) {
+ page_reclaimed = shrink_all_memory(nr_to_reclaim);
+ if (page_reclaimed > 0)
+ lru_add_drain_all();
+
+ if (page_reclaimed != nr_to_reclaim)
+ return page_reclaimed;
+ }
+
+ return 0;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0627906..0e27e60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2368,6 +2368,10 @@ extern int kvm_ret_mem_advice;
int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
+extern int sysctl_shrink_caches_mb;
+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
#endif
void drop_slab(void);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 73dc325..bcfeb8d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1391,6 +1391,14 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = kvm_madv_instant_free_sysctl_handler,
},
+ {
+ .procname = "shrink_caches_mb",
+ .data = &sysctl_shrink_caches_mb,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = shrink_caches_sysctl_handler,
+ .extra1 = &one,
+ },
#ifdef CONFIG_COMPACTION
{
.procname = "compact_memory",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 70aa739..53c2265 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3522,7 +3522,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
@@ -3561,7 +3560,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
return nr_reclaimed;
}
-#endif /* CONFIG_HIBERNATION */
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
--
2.7.4