laoar · April 19, 2020 10:04
diff --git a/0001-mm-memcg-reduce-size-of-struct-mem_cgroup-by-using-b.patch b/0001-mm-memcg-reduce-size-of-struct-mem_cgroup-by-using-b.patch
 Subject: [PATCH 1/4] mm, memcg: reduce size of struct mem_cgroup by using bit 
 field
 
 Some members of struct mem_cgroup can only be 0 (false) or 1 (true), so
 we can define them as bit fields to reduce the size. With this patch, the
 size of struct mem_cgroup can be reduced by 64 bytes in theory, but as
 there are some MEMCG_PADDING()s, the real number may be different,
 depending on the cacheline size. Anyway, this patch reduces the size of
 struct mem_cgroup more or less.

 Cc: Aaron Lu <[email protected]>
 Signed-off-by: Yafang Shao <[email protected]>
 ---
 include/linux/memcontrol.h | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index a7a0a1a5..612a457 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -229,20 +229,26 @@ struct mem_cgroup {
        /*  
         * Should the accounting and control be hierarchical, per subtree?
         */  
 -       bool use_hierarchy;
 +       unsigned int use_hierarchy : 1;
 
        /*  
         * Should the OOM killer kill all belonging tasks, had it kill one?
         */  
 -       bool oom_group;
 +       unsigned int  oom_group : 1;
 
        /* protected by memcg_oom_lock */
 -       bool            oom_lock;
 -       int             under_oom;
 +       unsigned int oom_lock : 1;
 
 -       int     swappiness;
        /* OOM-Killer disable */
 -       int             oom_kill_disable;
 +       unsigned int oom_kill_disable : 1;
 +
 +       /* Legacy tcp memory accounting */
 +       unsigned int tcpmem_active : 1;
 +       unsigned int tcpmem_pressure : 1;
 +
 +       int under_oom;
 +
 +       int     swappiness;
 
        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
 @@ -297,9 +303,6 @@ struct mem_cgroup {
 
        unsigned long           socket_pressure;
 
 -       /* Legacy tcp memory accounting */
 -       bool                    tcpmem_active;
 -       int                     tcpmem_pressure;
 
 #ifdef CONFIG_MEMCG_KMEM
         /* Index in the kmem_cache->memcg_params.memcg_caches array */
 -- 
 1.8.3.1
diff --git a/0002-mm-memcg-introduce-MEMCG_PROT_SKIP-for-memcg-zero-us.patch b/0002-mm-memcg-introduce-MEMCG_PROT_SKIP-for-memcg-zero-us.patch
 Subject: [PATCH 2/4] mm, memcg: introduce MEMCG_PROT_SKIP for memcg zero usage
 case

 If the usage of a memcg is zero, we don't need to do useless work to scan
 it. That is a minor optimization.

 Cc: Roman Gushchin <[email protected]>
 Signed-off-by: Yafang Shao <[email protected]>
 ---
 include/linux/memcontrol.h | 1 +
 mm/memcontrol.c            | 2 +-
 mm/vmscan.c                | 6 ++++++
 3 files changed, 8 insertions(+), 1 deletion(-)

 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 612a457..1a315c7 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -54,6 +54,7 @@ enum mem_cgroup_protection {
        MEMCG_PROT_NONE,
        MEMCG_PROT_LOW,
        MEMCG_PROT_MIN,
 +       MEMCG_PROT_SKIP,        /* For zero usage case */
 };
 
 struct mem_cgroup_reclaim_cookie {
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index c5b5f74..f35fcca 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -6292,7 +6292,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 
        usage = page_counter_read(&memcg->memory);
        if (!usage)
 -               return MEMCG_PROT_NONE;
 +               return MEMCG_PROT_SKIP;
 
        emin = memcg->memory.min;
        elow = memcg->memory.low;
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 5a6445e..3c4c2da 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
 @@ -2677,6 +2677,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
                         * thresholds (see get_scan_count).
                         */  
                        break;
 +               case MEMCG_PROT_SKIP:
 +                       /*
 +                        * Skip scanning this memcg if the usage of it is
 +                        * zero.
 +                        */
 +                       continue;
                }   
 
                reclaimed = sc->nr_reclaimed;
 -- 
diff --git a/0003-mm-memcg-reset-memcg-s-memory.-min-low-for-reclaimin.patch b/0003-mm-memcg-reset-memcg-s-memory.-min-low-for-reclaimin.patch
 Subject: [PATCH 3/4] mm, memcg: reset memcg's memory.{min, low} for reclaiming
 itself

 memory.{emin, elow} are set in mem_cgroup_protected(), and their values
 won't be changed until the next recalculation in this function. After
 either or both of them are set, the next reclaimer to reclaim this memcg
 may be a different reclaimer, e.g. this memcg is also the root memcg of
 the new reclaimer, and then in mem_cgroup_protection() in get_scan_count()
 the old values will be used to calculate the scan count, which is not
 proper. We should reset them to zero in this case.

 Cc: Chris Down <[email protected]>
 Signed-off-by: Yafang Shao <[email protected]>
 ---
 mm/memcontrol.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index f35fcca..234370c 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -6287,8 +6287,17 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 
        if (!root)
                root = root_mem_cgroup;
 -       if (memcg == root)
 +       if (memcg == root) {
 +               /*
 +                * Reset memory.(emin, elow) for reclaiming the memcg
 +                * itself, so stale values are not reused later.
 +                */
 +               if (memcg != root_mem_cgroup) {
 +                       memcg->memory.emin = 0;
 +                       memcg->memory.elow = 0;
 +               }
                return MEMCG_PROT_NONE;
 +       }
 
        usage = page_counter_read(&memcg->memory);
        if (!usage)
diff --git a/0004-memcg-inode-protect-page-cache-from-freeing-inode.patch b/0004-memcg-inode-protect-page-cache-from-freeing-inode.patch
 Subject: [PATCH 4/4] memcg, inode: protect page cache from freeing inode

 On my server there are some running MEMCGs protected by memory.{min, low},
 but I found that the usage of these MEMCGs abruptly became very small,
 far less than the protection limit. That confused me, and finally I found
 that it is because of inode stealing.
 Once an inode is freed, all its belonging page caches will be dropped as
 well, no matter how many page caches it has. So if we intend to protect the
 page caches in a memcg, we must protect their host (the inode) first.
 Otherwise the memcg protection can be easily bypassed by freeing the inode,
 especially if there are big files in this memcg.
 The inherent mismatch between memcg and inode is a problem. One inode can
 be shared by different MEMCGs, but that is a very rare case. If an inode is
 shared, its belonging page caches may be charged to different MEMCGs.
 Currently there's no perfect solution to this kind of issue, but the inode
 majority-writer ownership switching can help more or less.

 Cc: Roman Gushchin <[email protected]>
 Cc: Chris Down <[email protected]>
 Signed-off-by: Yafang Shao <[email protected]>
 ---
 fs/inode.c                 |  9 +++++++++
 include/linux/memcontrol.h | 15 +++++++++++++++
 mm/memcontrol.c            | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c                |  4 ++++
 4 files changed, 74 insertions(+)

 diff --git a/fs/inode.c b/fs/inode.c
 index fef457a..b022447 100644
 --- a/fs/inode.c
 +++ b/fs/inode.c
 @@ -734,6 +734,15 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;
 
 +
 +       /* Page protection only works in reclaimer */
 +       if (inode->i_data.nrpages && current->reclaim_state) {
 +               if (mem_cgroup_inode_protected(inode)) {
 +                       spin_unlock(&inode->i_lock);
 +                       return LRU_ROTATE;
 +               }
 +       }
 +
        /*
         * Referenced or dirty inodes are still in use. Give them another pass
         * through the LRU as we canot reclaim them now.
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1a315c7..21338f0 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -247,6 +247,9 @@ struct mem_cgroup {
        unsigned int tcpmem_active : 1;
        unsigned int tcpmem_pressure : 1;
 
 +       /* Soft protection will be ignored if it's true */
 +       unsigned int in_low_reclaim : 1;
 +
        int under_oom;
 
        int     swappiness;
 @@ -363,6 +366,7 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
 
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
                                                struct mem_cgroup *memcg);
 +unsigned long mem_cgroup_inode_protected(struct inode *inode);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                          gfp_t gfp_mask, struct mem_cgroup **memcgp,
 @@ -850,6 +854,11 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
        return MEMCG_PROT_NONE;
 }
 
 +static inline unsigned long mem_cgroup_inode_protected(struct inode *inode)
 +{
 +       return 0;
 +}
 +
 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask,
                                        struct mem_cgroup **memcgp,
 @@ -926,6 +935,12 @@ static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
        return NULL;
 }
 
 +static inline struct mem_cgroup *
 +mem_cgroup_from_css(struct cgroup_subsys_state *css)
 +{
 +       return NULL;
 +}
 +
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 234370c..efb53f3 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -6355,6 +6355,52 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 }
 
 /**
 + * Once an inode is freed, all its belonging page caches will be dropped as
 + * well, even if there're lots of page caches. So if we intend to protect
 + * page caches in a memcg, we must protect their host first. Otherwise the
 + * memory usage can be dropped abruptly if there're big files in this
 + * memcg. IOW the memcg protection can be easily bypassed with freeing
 + * inode. We should prevent it.
 + * The inherent mismatch between memcg and inode is a trouble. One inode
 + * can be shared by different MEMCGs, but it is a very rare case. If
 + * an inode is shared, its belonging page caches may be charged to
 + * different MEMCGs. Currently there's no perfect solution to fix this
 + * kind of issue, but the inode majority-writer ownership switching can
 + * help it more or less.
 + */
 +unsigned long mem_cgroup_inode_protected(struct inode *inode)
 +{
 +       unsigned long cgroup_size;
 +       unsigned long protect = 0;
 +       struct bdi_writeback *wb;
 +       struct mem_cgroup *memcg;
 +
 +       wb = inode_to_wb(inode);
 +       if (!wb)
 +               goto out;
 +
 +       memcg = mem_cgroup_from_css(wb->memcg_css);
 +       if (!memcg || memcg == root_mem_cgroup)
 +               goto out;
 +
 +       protect = mem_cgroup_protection(memcg, memcg->in_low_reclaim);
 +       if (!protect)
 +               goto out;
 +
 +       cgroup_size = mem_cgroup_size(memcg);
 +       /*
 +        * Don't need to protect this inode, if the usage is still above
 +        * the limit after reclaiming this inode and its belonging page
 +        * caches.
 +        */
 +       if (inode->i_data.nrpages + protect < cgroup_size)
 +               protect = 0;
 +
 +out:
 +       return protect;
 +}
 +
 +/**
  * mem_cgroup_try_charge - try charging a page
  * @page: page to charge
  * @mm: mm context of the victim
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 3c4c2da..1cc7fc2 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
 @@ -2666,6 +2666,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
                                sc->memcg_low_skipped = 1;
                                continue;
                        }
 +                       memcg->in_low_reclaim = 1;
                        memcg_memory_event(memcg, MEMCG_LOW);
                        break;
                case MEMCG_PROT_NONE:
 @@ -2693,6 +2694,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
                shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
                            sc->priority);
 
 +               if (memcg->in_low_reclaim)
 +                       memcg->in_low_reclaim = 0;
 +
                /* Record the group's reclaim efficiency */
                vmpressure(sc->gfp_mask, memcg, false,
                           sc->nr_scanned - scanned,
 -- 
 1.8.3.1
	Subject: [PATCH 1/4] mm, memcg: reduce size of struct mem_cgroup by using bit
	field

	Some members of struct mem_cgroup can only be 0 (false) or 1 (true), so
	we can define them as bit fields to reduce the size. With this patch, the
	size of struct mem_cgroup can be reduced by 64 bytes in theory, but as
	there are some MEMCG_PADDING()s, the real number may be different,
	depending on the cacheline size. Anyway, this patch reduces the size of
	struct mem_cgroup more or less.

	Cc: Aaron Lu <[email protected]>
	Signed-off-by: Yafang Shao <[email protected]>
	---
	include/linux/memcontrol.h | 21 ++++++++++++---------
	1 file changed, 12 insertions(+), 9 deletions(-)

	diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
	index a7a0a1a5..612a457 100644
	--- a/include/linux/memcontrol.h
	+++ b/include/linux/memcontrol.h
	@@ -229,20 +229,26 @@ struct mem_cgroup {
	/*
	* Should the accounting and control be hierarchical, per subtree?
	*/
	- bool use_hierarchy;
	+ unsigned int use_hierarchy : 1;

	/*
	* Should the OOM killer kill all belonging tasks, had it kill one?
	*/
	- bool oom_group;
	+ unsigned int oom_group : 1;

	/* protected by memcg_oom_lock */
	- bool oom_lock;
	- int under_oom;
	+ unsigned int oom_lock : 1;

	- int swappiness;
	/* OOM-Killer disable */
	- int oom_kill_disable;
	+ unsigned int oom_kill_disable : 1;
	+
	+ /* Legacy tcp memory accounting */
	+ unsigned int tcpmem_active : 1;
	+ unsigned int tcpmem_pressure : 1;
	+
	+ int under_oom;
	+
	+ int swappiness;

	/* memory.events and memory.events.local */
	struct cgroup_file events_file;
	@@ -297,9 +303,6 @@ struct mem_cgroup {

	unsigned long socket_pressure;

	- /* Legacy tcp memory accounting */
	- bool tcpmem_active;
	- int tcpmem_pressure;

	#ifdef CONFIG_MEMCG_KMEM
	/* Index in the kmem_cache->memcg_params.memcg_caches array */
	--
	1.8.3.1
	Subject: [PATCH 2/4] mm, memcg: introduce MEMCG_PROT_SKIP for memcg zero usage
	case

	If the usage of a memcg is zero, we don't need to do useless work to scan
	it. That is a minor optimization.

	Cc: Roman Gushchin <[email protected]>
	Signed-off-by: Yafang Shao <[email protected]>
	---
	include/linux/memcontrol.h | 1 +
	mm/memcontrol.c | 2 +-
	mm/vmscan.c | 6 ++++++
	3 files changed, 8 insertions(+), 1 deletion(-)

	diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
	index 612a457..1a315c7 100644
	--- a/include/linux/memcontrol.h
	+++ b/include/linux/memcontrol.h
	@@ -54,6 +54,7 @@ enum mem_cgroup_protection {
	MEMCG_PROT_NONE,
	MEMCG_PROT_LOW,
	MEMCG_PROT_MIN,
	+ MEMCG_PROT_SKIP, /* For zero usage case */
	};

	struct mem_cgroup_reclaim_cookie {
	diff --git a/mm/memcontrol.c b/mm/memcontrol.c
	index c5b5f74..f35fcca 100644
	--- a/mm/memcontrol.c
	+++ b/mm/memcontrol.c
	@@ -6292,7 +6292,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,

	usage = page_counter_read(&memcg->memory);
	if (!usage)
	- return MEMCG_PROT_NONE;
	+ return MEMCG_PROT_SKIP;

	emin = memcg->memory.min;
	elow = memcg->memory.low;
	diff --git a/mm/vmscan.c b/mm/vmscan.c
	index 5a6445e..3c4c2da 100644
	--- a/mm/vmscan.c
	+++ b/mm/vmscan.c
	@@ -2677,6 +2677,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
	* thresholds (see get_scan_count).
	*/
	break;
	+ case MEMCG_PROT_SKIP:
	+ /*
	+ * Skip scanning this memcg if the usage of it is
	+ * zero.
	+ */
	+ continue;
	}

	reclaimed = sc->nr_reclaimed;
	--
	Subject: [PATCH 3/4] mm, memcg: reset memcg's memory.{min, low} for reclaiming
	itself

	memory.{emin, elow} are set in mem_cgroup_protected(), and their values
	won't be changed until the next recalculation in this function. After
	either or both of them are set, the next reclaimer to reclaim this memcg
	may be a different reclaimer, e.g. this memcg is also the root memcg of
	the new reclaimer, and then in mem_cgroup_protection() in get_scan_count()
	the old values will be used to calculate the scan count, which is not
	proper. We should reset them to zero in this case.

	Cc: Chris Down <[email protected]>
	Signed-off-by: Yafang Shao <[email protected]>
	---
	mm/memcontrol.c | 11 ++++++++++-
	1 file changed, 10 insertions(+), 1 deletion(-)

	diff --git a/mm/memcontrol.c b/mm/memcontrol.c
	index f35fcca..234370c 100644
	--- a/mm/memcontrol.c
	+++ b/mm/memcontrol.c
	@@ -6287,8 +6287,17 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,

	if (!root)
	root = root_mem_cgroup;
	- if (memcg == root)
	+ if (memcg == root) {
	+ /*
	+ * Reset memory.(emin, elow) for reclaiming the memcg
	+ * itself, so stale values are not reused later.
	+ */
	+ if (memcg != root_mem_cgroup) {
	+ memcg->memory.emin = 0;
	+ memcg->memory.elow = 0;
	+ }
	return MEMCG_PROT_NONE;
	+ }

	usage = page_counter_read(&memcg->memory);
	if (!usage)
	Subject: [PATCH 4/4] memcg, inode: protect page cache from freeing inode

	On my server there are some running MEMCGs protected by memory.{min, low},
	but I found that the usage of these MEMCGs abruptly became very small,
	far less than the protection limit. That confused me, and finally I found
	that it is because of inode stealing.
	Once an inode is freed, all its belonging page caches will be dropped as
	well, no matter how many page caches it has. So if we intend to protect the
	page caches in a memcg, we must protect their host (the inode) first.
	Otherwise the memcg protection can be easily bypassed by freeing the inode,
	especially if there are big files in this memcg.
	The inherent mismatch between memcg and inode is a problem. One inode can
	be shared by different MEMCGs, but that is a very rare case. If an inode is
	shared, its belonging page caches may be charged to different MEMCGs.
	Currently there's no perfect solution to this kind of issue, but the inode
	majority-writer ownership switching can help more or less.

	Cc: Roman Gushchin <[email protected]>
	Cc: Chris Down <[email protected]>
	Signed-off-by: Yafang Shao <[email protected]>
	---
	fs/inode.c | 9 +++++++++
	include/linux/memcontrol.h | 15 +++++++++++++++
	mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
	mm/vmscan.c | 4 ++++
	4 files changed, 74 insertions(+)

	diff --git a/fs/inode.c b/fs/inode.c
	index fef457a..b022447 100644
	--- a/fs/inode.c
	+++ b/fs/inode.c
	@@ -734,6 +734,15 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
	if (!spin_trylock(&inode->i_lock))
	return LRU_SKIP;

	+
	+ /* Page protection only works in reclaimer */
	+ if (inode->i_data.nrpages && current->reclaim_state) {
	+ if (mem_cgroup_inode_protected(inode)) {
	+ spin_unlock(&inode->i_lock);
	+ return LRU_ROTATE;
	+ }
	+ }
	+
	/*
	* Referenced or dirty inodes are still in use. Give them another pass
	* through the LRU as we canot reclaim them now.
	diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
	index 1a315c7..21338f0 100644
	--- a/include/linux/memcontrol.h
	+++ b/include/linux/memcontrol.h
	@@ -247,6 +247,9 @@ struct mem_cgroup {
	unsigned int tcpmem_active : 1;
	unsigned int tcpmem_pressure : 1;

	+ /* Soft protection will be ignored if it's true */
	+ unsigned int in_low_reclaim : 1;
	+
	int under_oom;

	int swappiness;
	@@ -363,6 +366,7 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,

	enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
	struct mem_cgroup *memcg);
	+unsigned long mem_cgroup_inode_protected(struct inode *inode);

	int mem_cgroup_try_charge(struct page page, struct mm_struct mm,
	gfp_t gfp_mask, struct mem_cgroup **memcgp,
	@@ -850,6 +854,11 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
	return MEMCG_PROT_NONE;
	}

	+static inline unsigned long mem_cgroup_inode_protected(struct inode *inode)
	+{
	+ return 0;
	+}
	+
	static inline int mem_cgroup_try_charge(struct page page, struct mm_struct mm,
	gfp_t gfp_mask,
	struct mem_cgroup **memcgp,
	@@ -926,6 +935,12 @@ static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
	return NULL;
	}

	+static inline struct mem_cgroup *
	+mem_cgroup_from_css(struct cgroup_subsys_state *css)
	+{
	+ return NULL;
	+}
	+
	static inline void mem_cgroup_put(struct mem_cgroup *memcg)
	{
	}
	diff --git a/mm/memcontrol.c b/mm/memcontrol.c
	index 234370c..efb53f3 100644
	--- a/mm/memcontrol.c
	+++ b/mm/memcontrol.c
	@@ -6355,6 +6355,52 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
	}

	/**
	+ * Once an inode is freed, all its belonging page caches will be dropped as
	+ * well, even if there're lots of page caches. So if we intend to protect
	+ * page caches in a memcg, we must protect their host first. Otherwise the
	+ * memory usage can be dropped abruptly if there're big files in this
	+ * memcg. IOW the memcg protection can be easily bypassed with freeing
	+ * inode. We should prevent it.
	+ * The inherent mismatch between memcg and inode is a trouble. One inode
	+ * can be shared by different MEMCGs, but it is a very rare case. If
	+ * an inode is shared, its belonging page caches may be charged to
	+ * different MEMCGs. Currently there's no perfect solution to fix this
	+ * kind of issue, but the inode majority-writer ownership switching can
	+ * help it more or less.
	+ */
	+unsigned long mem_cgroup_inode_protected(struct inode *inode)
	+{
	+ unsigned long cgroup_size;
	+ unsigned long protect = 0;
	+ struct bdi_writeback *wb;
	+ struct mem_cgroup *memcg;
	+
	+ wb = inode_to_wb(inode);
	+ if (!wb)
	+ goto out;
	+
	+ memcg = mem_cgroup_from_css(wb->memcg_css);
	+ if (!memcg || memcg == root_mem_cgroup)
	+ goto out;
	+
	+ protect = mem_cgroup_protection(memcg, memcg->in_low_reclaim);
	+ if (!protect)
	+ goto out;
	+
	+ cgroup_size = mem_cgroup_size(memcg);
	+ /*
	+ * Don't need to protect this inode, if the usage is still above
	+ * the limit after reclaiming this inode and its belonging page
	+ * caches.
	+ */
	+ if (inode->i_data.nrpages + protect < cgroup_size)
	+ protect = 0;
	+
	+out:
	+ return protect;
	+}
	+
	+/**
	* mem_cgroup_try_charge - try charging a page
	* @page: page to charge
	* @mm: mm context of the victim
	diff --git a/mm/vmscan.c b/mm/vmscan.c
	index 3c4c2da..1cc7fc2 100644
	--- a/mm/vmscan.c
	+++ b/mm/vmscan.c
	@@ -2666,6 +2666,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
	sc->memcg_low_skipped = 1;
	continue;
	}
	+ memcg->in_low_reclaim = 1;
	memcg_memory_event(memcg, MEMCG_LOW);
	break;
	case MEMCG_PROT_NONE:
	@@ -2693,6 +2694,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
	sc->priority);

	+ if (memcg->in_low_reclaim)
	+ memcg->in_low_reclaim = 0;
	+
	/* Record the group's reclaim efficiency */
	vmpressure(sc->gfp_mask, memcg, false,
	sc->nr_scanned - scanned,
	--
	1.8.3.1