VM returning memory patchset
From 995119a1dc3161e13a9039625a33f9b630fcb1ef Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 27 Feb 2017 11:46:40 -0800
Subject: [PATCH 0/4] Return unused memory to the host system
This series addresses a common issue in the cloud context: preventing
any virtual machine running on a host system from holding on to unused
resources, and particularly unused memory.

A ballooning mechanism exists, but it is not efficient enough: it waits
for the memory limits of the system to be hit, then the host tries to
predict how much memory can be asked back from a guest, and finally
issues a hypercall to retrieve this memory.

The guest knows precisely the amount of memory it can return, which is
why the solution proposed by this series relies on the guest notifying
the host about the memory that can be released. It implements a new
hypercall which allows a guest to notify the host that some memory
blocks are no longer used and can be discarded. The guest issues this
hypercall when it sees large blocks (>=512k) merged in the buddy
allocator.

The host implementation of this new hypercall relies on the
MADV_DONTNEED or MADV_FREE advice, delaying the memory release if that
memory is not needed by the host, and therefore saving future
hypercalls in case the guest needs to use that memory again.
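For illustration only (this program is not part of the series), the
userspace analogue below shows the MADV_DONTNEED semantics the
host-side handler builds on: the advised range stays mapped, and the
next access simply faults in fresh zero-filled pages, so no extra
hypercall is needed when the guest touches that memory again.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 19;   /* 512k, the smallest block the guest reports */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0xaa, len);                 /* memory is in use */
        madvise(buf, len, MADV_DONTNEED);       /* kernel may drop the pages now */
        printf("%d\n", buf[0]);                 /* prints 0: faulted back in fresh */

        munmap(buf, len);
        return 0;
}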
Idle VMs hold on to cache that they are not using. They gain no
performance advantage from keeping it, and this deprives the rest of
the system of memory it could use effectively.

The /proc/sys/vm/drop_caches sysctl allows releasing all of the
system's cache, but it is too aggressive: if a previously idle VM
suddenly wakes up, it won't find any cached data, which lowers system
performance.

That is why this series introduces a new sysctl which is essentially a
more fine-grained /proc/sys/vm/drop_caches. It allows userspace to
drop a specified amount of cache instead of the whole thing. Combined
with a proper userspace application (see the sketch below), it makes
it possible to gradually reduce the amount of cache on the system when
no workload needs it.
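As a rough illustration of the intended usage (not part of the series;
the amount written here is arbitrary), a userspace agent simply writes
the number of MiB to drop into the sysctl added by patch 4:

#include <stdio.h>

int main(void)
{
        /* Equivalent to: echo 128 > /proc/sys/vm/shrink_caches_mb */
        FILE *f = fopen("/proc/sys/vm/shrink_caches_mb", "w");

        if (!f)
                return 1;
        fprintf(f, "128\n");
        fclose(f);
        return 0;
}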
Sebastien Boeuf (4):
  mm: Export do_madvise()
  x86: kvm: Notify host to release pages
  x86: Return memory from guest to host kernel
  sysctl: vm: Fine-grained cache shrinking

 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
 arch/x86/kernel/kvm.c           | 10 ++++++++++
 arch/x86/kvm/x86.c              | 17 +++++++++++++++++
 fs/drop_caches.c                | 25 +++++++++++++++++++++++++
 include/linux/mm-arch-hooks.h   |  8 ++++++++
 include/linux/mm.h              | 11 +++++++++++
 include/uapi/linux/kvm_para.h   |  3 +++
 kernel/sysctl.c                 | 15 +++++++++++++++
 mm/Makefile                     |  2 +-
 mm/kvm.c                        | 25 +++++++++++++++++++++++++
 mm/madvise.c                    | 25 +++++++++++++++++++++----
 mm/page_alloc.c                 |  2 ++
 mm/vmscan.c                     |  2 --
 13 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 mm/kvm.c

--
2.7.4
From 7c9d5a3736f2daa9d3d3732ca8593e8e17fb1452 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:03:52 -0800
Subject: [PATCH 1/4] mm: Export do_madvise()
Combined with the right flags, the madvise() system call allows memory
to be freed more smartly and more efficiently than a simple free()
would. The issue is that it is not available to kernel modules that
could need it.

In order to fix this lack of support, this patch exports do_madvise()
so as to make it available to the rest of the kernel. The already
existing madvise() system call is unchanged and now relies on this new
do_madvise() function.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 include/linux/mm.h |  2 ++
 mm/madvise.c       | 25 +++++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)
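A minimal sketch (not part of this patch) of how an in-kernel caller
might use the newly exported helper; the function name is hypothetical
and it assumes 'start' and 'len' describe a valid, page-aligned range
of the calling task's own address space:

#include <linux/mm.h>
#include <linux/mman.h>

/* Advise the kernel that a range of the current task's address space
 * can be dropped; the pages are faulted back in on the next access. */
static int example_release_range(unsigned long start, size_t len)
{
        return do_madvise(start, len, MADV_DONTNEED);
}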
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0d65dd7..4eeaf11 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2514,5 +2514,7 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index dc5927c..b57f884 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -658,9 +658,7 @@ madvise_behavior_valid(int behavior)
}
/*
- * The madvise(2) system call.
- *
- * Applications can use madvise() to advise the kernel how it should
+ * Kernel modules can use do_madvise() to advise the kernel how it should
* handle paging I/O in this VM area. The idea is to help the kernel
* use appropriate read-ahead and caching techniques. The information
* provided is advisory only, and can be safely disregarded by the
@@ -713,7 +711,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+int do_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -807,3 +805,22 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return error;
}
+EXPORT_SYMBOL_GPL(do_madvise);
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() system call to advise the kernel how
+ * it should handle paging I/O in this VM area. The idea is to help
+ * the kernel use appropriate read-ahead and caching techniques. The
+ * information provided is advisory only, and can be safely disregarded
+ * by the kernel without affecting the correct operation of the application.
+ *
+ * behavior values are the same than the ones defined in madvise()
+ *
+ * return values are the same than the ones defined in madvise()
+ */
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+ return do_madvise(start, len_in, behavior);
+}
--
2.7.4
From 331df67207a23107104ff232ea2097f557250431 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:08:55 -0800
Subject: [PATCH 2/4] x86: kvm: Notify host to release pages
In the context of a hypervisor managing several virtual machines, we
want those virtual machines to give the memory they have used back to
the host when they don't need it anymore.

This patch introduces a new hypercall, KVM_HC_RETURN_MEM, allowing the
guest kernel to notify the host kernel when such an event occurs.
Relying on the do_madvise() function that the previous patch exported,
the host issues a call to this function when it receives the new
hypercall.

Using do_madvise() with the MADV_DONTNEED advice allows the guest to
get a page back without going through another hypercall. Instead, it
can simply start using that memory again, as it will get faulted back
in as a fresh new page. That is why do_madvise() is more efficient
than using vm_unmap() to return some memory to the host.

This patch also introduces a new sysctl, kvm_madv_instant_free,
allowing the user to select the MADV_FREE advice instead of
MADV_DONTNEED. MADV_FREE performs better than MADV_DONTNEED because it
does not zero the pages when the memory has not actually been freed by
the kernel. This can happen when the kernel had no need to take this
memory back, meaning it kept those pages in the right state to be
re-used by the same application. MADV_FREE being a very recent advice,
introduced in kernel 4.5, it is only enabled through the sysctl when
the user wants to use it.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 arch/x86/kvm/x86.c            | 17 +++++++++++++++++
 include/linux/mm.h            |  5 +++++
 include/uapi/linux/kvm_para.h |  3 +++
 kernel/sysctl.c               |  7 +++++++
 mm/Makefile                   |  2 +-
 mm/kvm.c                      | 25 +++++++++++++++++++++++++
 6 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 mm/kvm.c
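For illustration only (not part of the patch), the userspace analogue
below shows why MADV_FREE can be cheaper than MADV_DONTNEED: unless the
kernel actually reclaimed the pages in the meantime, the old contents
are still there and nothing had to be zeroed or faulted back in.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 19;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0xaa, len);
#ifdef MADV_FREE
        madvise(buf, len, MADV_FREE);   /* lazily freeable, needs kernel >= 4.5 */
        /* Typically still prints 0xaa: the data survives until the kernel is
         * under enough memory pressure to actually reclaim the pages. */
        printf("0x%02x\n", (unsigned char)buf[0]);
#endif
        munmap(buf, len);
        return 0;
}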
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2a4b11..2f34d6e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,7 @@
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
@@ -6197,6 +6198,19 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
+static int kvm_pv_return_mem_op(struct kvm *kvm, gpa_t gpa, size_t len)
+{
+ unsigned long start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+
+ if (len > KVM_MAX_RET_MEM_SIZE)
+ return KVM_EPERM;
+
+ if (kvm_is_error_hva(start + len))
+ return KVM_EFAULT;
+
+ return do_madvise(start, len, kvm_ret_mem_advice);
+}
+
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
vcpu->arch.apicv_active = false;
@@ -6248,6 +6262,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
#endif
+ case KVM_HC_RETURN_MEM:
+ ret = kvm_pv_return_mem_op(vcpu->kvm, a0, a1);
+ break;
default:
ret = -KVM_ENOSYS;
break;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4eeaf11..0627906 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2363,6 +2363,11 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+extern int sysctl_kvm_madv_instant_free;
+extern int kvm_ret_mem_advice;
+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
#endif
void drop_slab(void);
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index fed506a..030eecf2 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -25,6 +25,9 @@
#define KVM_HC_MIPS_EXIT_VM 7
#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
#define KVM_HC_CLOCK_PAIRING 9
+#define KVM_HC_RETURN_MEM 10
+
+#define KVM_MAX_RET_MEM_SIZE (1 << 22) // 4MiB
/*
* hypercalls use architecture specific
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bb260ce..73dc325 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1384,6 +1384,13 @@ static struct ctl_table vm_table[] = {
.extra1 = &one,
.extra2 = &four,
},
+ {
+ .procname = "kvm_madv_instant_free",
+ .data = &sysctl_kvm_madv_instant_free,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = kvm_madv_instant_free_sysctl_handler,
+ },
#ifdef CONFIG_COMPACTION
{
.procname = "compact_memory",
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a8..28d95be 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o swap_slots.o \
interval_tree.o list_lru.o workingset.o \
- debug.o $(mmu-y)
+ debug.o kvm.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/kvm.c b/mm/kvm.c
new file mode 100644
index 0000000..8945f6a
--- /dev/null
+++ b/mm/kvm.c
@@ -0,0 +1,25 @@
+#include <linux/mman.h>
+
+int sysctl_kvm_madv_instant_free;
+
+int kvm_ret_mem_advice = MADV_DONTNEED;
+EXPORT_SYMBOL_GPL(kvm_ret_mem_advice);
+
+int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+#ifdef MADV_FREE
+ if (sysctl_kvm_madv_instant_free > 0)
+ kvm_ret_mem_advice = MADV_FREE;
+ else
+ kvm_ret_mem_advice = MADV_DONTNEED;
+#endif
+
+ return 0;
+}
--
2.7.4
From e590bbdf83e715e7b95a40697c0d24317c757eaa Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:26:13 -0800
Subject: [PATCH 3/4] x86: Return memory from guest to host kernel
All virtual machines need memory to perform their various tasks, but
this memory is not released to the host once it is no longer used. We
have to wait for the termination of the virtual machine to get this
memory back on the host.

The ballooning mechanism is close but not designed for the same
purpose: when we hit the memory limits of the system, the host
predicts how much memory can be asked back from a guest, and it issues
a hypercall to retrieve this memory.

The solution proposed here is different because it does not wait for
the host to need memory before returning it, and the guest knows
precisely how much memory it can return.

The host side is notified about such a return through the new
KVM_HC_RETURN_MEM hypercall. In order to avoid overloading the CPU
with too many hypercalls, we only return memory blocks of order 7
(512k blocks) and higher. This value was found by running memory tests
using multiple threads allocating and freeing large amounts of memory.
Those tests were run for different order values, and 7 was the best
tradeoff between the number of hypercalls issued and the amount of
memory returned to the host.

In order to limit the performance impact of this code addition, we
only check whether a block is of order 7 or higher, which costs no
more than an additional function call and a branch.

Furthermore, this code has been added to the "merge" codepath of the
buddy allocator, which is not as sensitive as the "free" codepath. Not
every block going through the "free" codepath will end up in the
"merge" codepath, because some of them won't find their free buddy.
But this is a negligible amount, since the kernel does not use many
high-order blocks directly. Instead, those bigger blocks are often
broken into smaller chunks used as low-order blocks, and when those
small blocks are released, they go through the merge path.

Benchmarks such as ebizzy and will-it-scale have been run to make sure
this patch does not affect kernel performance, and no significant
differences were observed.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++
 arch/x86/kernel/kvm.c           | 10 ++++++++++
 include/linux/mm-arch-hooks.h   |  8 ++++++++
 mm/page_alloc.c                 |  2 ++
 4 files changed, 42 insertions(+)
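As a quick sanity check of the sizes involved (illustration only,
assuming 4 KiB pages and the default x86 MAX_ORDER of 11), every block
reported from the merge path fits within the KVM_MAX_RET_MEM_SIZE cap
enforced on the host side by the previous patch:

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;   /* assumed 4 KiB pages */
        const unsigned long cap = 1UL << 22;    /* KVM_MAX_RET_MEM_SIZE: 4 MiB */
        unsigned int order;

        /* Orders 7 (RET_MEM_BUDDY_ORDER) through 10 (MAX_ORDER - 1) */
        for (order = 7; order <= 10; order++)
                printf("order %2u -> %4lu KiB, within cap: %s\n", order,
                       (page_size << order) >> 10,
                       (page_size << order) <= cap ? "yes" : "no");
        return 0;
}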
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index bc62e7c..4a2f6d1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,6 +92,28 @@ void kvm_async_pf_task_wait(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
extern void kvm_disable_steal_time(void);
+void kvm_arch_return_memory(struct page *page, unsigned int order);
+
+/*
+ * This order has been found in an empirical way, running memory tests
+ * through many iterations to assess the number of hypercalls issued
+ * and the amount of memory returned. In case you change this order to
+ * 6 or 8, it should not impact your performances significantly.
+ *
+ * Smaller values lead to less memory waste, but consume more CPU on
+ * hypercalls. Larger values use less CPU, but do not as precisely
+ * inform the hypervisor of which memory is free.
+ */
+#define RET_MEM_BUDDY_ORDER 7
+
+static inline void arch_buddy_merge(struct page *page, unsigned int order)
+{
+ if (order < RET_MEM_BUDDY_ORDER)
+ return;
+
+ kvm_arch_return_memory(page, order);
+}
+#define arch_buddy_merge arch_buddy_merge
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init kvm_spinlock_init(void);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 14f65a5..ad424e9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -549,6 +549,16 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
+void kvm_arch_return_memory(struct page *page, unsigned int order)
+{
+ if (!kvm_para_available())
+ return;
+
+ kvm_hypercall2(KVM_HC_RETURN_MEM,
+ page_to_phys(page),
+ PAGE_SIZE << order);
+}
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
index 4efc3f5..26eb3a0 100644
--- a/include/linux/mm-arch-hooks.h
+++ b/include/linux/mm-arch-hooks.h
@@ -12,6 +12,7 @@
#define _LINUX_MM_ARCH_HOOKS_H
#include <asm/mm-arch-hooks.h>
+#include <asm/kvm_para.h>
#ifndef arch_remap
static inline void arch_remap(struct mm_struct *mm,
@@ -22,4 +23,11 @@ static inline void arch_remap(struct mm_struct *mm,
#define arch_remap arch_remap
#endif
+#ifndef arch_buddy_merge
+static inline void arch_buddy_merge(struct page *page, unsigned int order)
+{
+}
+#define arch_buddy_merge arch_buddy_merge
+#endif
+
#endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a7a6aac..137898e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -64,6 +64,7 @@
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
+#include <linux/mm-arch-hooks.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -856,6 +857,7 @@ static inline void __free_one_page(struct page *page,
}
done_merging:
+ arch_buddy_merge(page, order);
set_page_order(page, order);
/*
--
2.7.4
From 08a9e5c52859ca0f6ed05136d6d6550daf9a3ac1 Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <[email protected]>
Date: Mon, 23 Jan 2017 15:32:39 -0800
Subject: [PATCH 4/4] sysctl: vm: Fine-grained cache shrinking
Lots of virtual machines are left idle for days until they are
terminated, and they can keep a large amount of memory in their cache,
meaning this memory cannot be used by other processes.

We tried to release this memory using the existing drop_caches sysctl,
but it led to the complete loss of a cache that could still have been
useful if the idle VM woke up: the process then cannot find any cached
data, and rebuilding it from scratch directly hurts performance.

Instead, the solution we want is based on gradually shrinking the
system cache over time. This patch adds a new sysctl, shrink_caches_mb,
allowing userspace applications to tell the kernel it should shrink
the system cache by up to the specified amount (in MiB).

An application called "memshrinker" uses this new mechanism. It runs
in the background and periodically releases a specified amount of
cache. This amount is based on the cache remaining on the system, and
the period is computed to follow a shrinking model. The result is a
lot of memory saved for the other processes running on the system. A
rough sketch of such an agent is shown after the diffstat below.

Suggested-by: Arjan van de Ven <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
---
 fs/drop_caches.c   | 25 +++++++++++++++++++++++++
 include/linux/mm.h |  4 ++++
 kernel/sysctl.c    |  8 ++++++++
 mm/vmscan.c        |  2 --
 4 files changed, 37 insertions(+), 2 deletions(-)
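Illustration only, not part of the series: the original memshrinker
tool is not included here, so the policy below (drop a tenth of the
remaining page cache every minute) is purely an assumption about how
such an agent could drive the new sysctl.

#include <stdio.h>
#include <unistd.h>

/* Parse the "Cached:" line of /proc/meminfo and return it in MiB. */
static long cached_mb(void)
{
        char line[128];
        long kb = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "Cached: %ld kB", &kb) == 1)
                        break;
        fclose(f);
        return kb > 0 ? kb / 1024 : -1;
}

int main(void)
{
        for (;;) {
                long mb = cached_mb();

                if (mb > 0) {
                        FILE *f = fopen("/proc/sys/vm/shrink_caches_mb", "w");

                        if (f) {
                                fprintf(f, "%ld\n", mb / 10);
                                fclose(f);
                        }
                }
                sleep(60);
        }
        return 0;
}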
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b..f564dfc 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,10 +8,12 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include <linux/swap.h>
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
int sysctl_drop_caches;
+int sysctl_shrink_caches_mb;
static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
@@ -67,3 +69,26 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
}
return 0;
}
+
+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int ret;
+ unsigned long nr_to_reclaim, page_reclaimed;
+
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+ nr_to_reclaim = sysctl_shrink_caches_mb * (1 << 20) / PAGE_SIZE;
+ if (write) {
+ page_reclaimed = shrink_all_memory(nr_to_reclaim);
+ if (page_reclaimed > 0)
+ lru_add_drain_all();
+
+ if (page_reclaimed != nr_to_reclaim)
+ return page_reclaimed;
+ }
+
+ return 0;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0627906..0e27e60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2368,6 +2368,10 @@ extern int kvm_ret_mem_advice;
int kvm_madv_instant_free_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
+extern int sysctl_shrink_caches_mb;
+int shrink_caches_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
#endif
void drop_slab(void);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 73dc325..bcfeb8d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1391,6 +1391,14 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = kvm_madv_instant_free_sysctl_handler,
},
+ {
+ .procname = "shrink_caches_mb",
+ .data = &sysctl_shrink_caches_mb,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = shrink_caches_sysctl_handler,
+ .extra1 = &one,
+ },
#ifdef CONFIG_COMPACTION
{
.procname = "compact_memory",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 70aa739..53c2265 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3522,7 +3522,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
@@ -3561,7 +3560,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
return nr_reclaimed;
}
-#endif /* CONFIG_HIBERNATION */
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
--
2.7.4