From e75fa7b22e766aa6c7b87aecb30d552fb1d2b386 Mon Sep 17 00:00:00 2001
From: Yafang Shao <[email protected]>
Date: Mon, 16 Dec 2019 05:09:31 -0500
Subject: [PATCH 2/5] mm, memcg: introduce MEMCG_PROT_SKIP for memcg zero usage
case
If the usage of a memcg is zero, there is no need to waste work scanning
it. This is a minor optimization.
Cc: Roman Gushchin <[email protected]>
Signed-off-by: Yafang Shao <[email protected]>
---
include/linux/memcontrol.h | 1 +
mm/memcontrol.c | 2 +-
mm/vmscan.c | 6 ++++++
3 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 612a457..1a315c7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,6 +54,7 @@ enum mem_cgroup_protection {
MEMCG_PROT_NONE,
MEMCG_PROT_LOW,
MEMCG_PROT_MIN,
+ MEMCG_PROT_SKIP, /* For zero usage case */
};
struct mem_cgroup_reclaim_cookie {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5b5f74..f35fcca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6292,7 +6292,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
usage = page_counter_read(&memcg->memory);
if (!usage)
- return MEMCG_PROT_NONE;
+ return MEMCG_PROT_SKIP;
emin = memcg->memory.min;
elow = memcg->memory.low;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a6445e..3c4c2da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2677,6 +2677,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
* thresholds (see get_scan_count).
*/
break;
+ case MEMCG_PROT_SKIP:
+ /*
+ * Skip scanning this memcg because its usage is
+ * zero.
+ */
+ continue;
}
reclaimed = sc->nr_reclaimed;
--
1.8.3.1
From 69407b40ecd4d50988270fa28d43ae37cf714bf4 Mon Sep 17 00:00:00 2001
From: Yafang Shao <[email protected]>
Date: Mon, 16 Dec 2019 05:46:30 -0500
Subject: [PATCH 3/5] mm, memcg: reset memcg's memory.{min, low} for reclaiming
itself
memory.{emin, elow} are set in mem_cgroup_protected(), and their values are
not changed until the next recalculation in this function. After either or
both of them are set, the next reclaimer of this memcg may be a different
one, e.g. this memcg may also be the root memcg of the new reclaimer, in
which case mem_cgroup_protection() in get_scan_count() will use the stale
values to calculate the scan count. That is not correct, so we should reset
them to zero in this case.
Here's an example of this issue.
        root_mem_cgroup
             /
            A    memory.max=1024M memory.min=512M memory.current=800M
Once kswapd is woken up, it will try to scan all MEMCGs, including A, and
it will set A's memory.emin to 512M.
After that, A may reach its hard limit (memory.max) and then start memcg
reclaim. Because A is the root of this reclaim, its memory.emin is not
recalculated, so the stale value of 512M is used in mem_cgroup_protection()
in get_scan_count() to compute the scan count. That is not correct.
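To illustrate the effect, a rough sketch, assuming the proportional
protection scaling used by get_scan_count() at the time of this series,
i.e. roughly scan = lruvec_size - lruvec_size * protection / cgroup_size:

    stale value kept:  protection = emin = 512M, cgroup_size ~= 800M
                       scan ~= lruvec_size * (1 - 512/800) ~= 0.36 * lruvec_size
    after the reset:   protection = 0
                       scan  = lruvec_size

So without the reset, A reclaims against itself with only about a third of
the intended scan pressure, even though it hit its own hard limit.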
Cc: Chris Down <[email protected]>
Signed-off-by: Yafang Shao <[email protected]>
---
mm/memcontrol.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f35fcca..2e78931 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6287,8 +6287,17 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
if (!root)
root = root_mem_cgroup;
- if (memcg == root)
+ if (memcg == root) {
+ /*
+ * Reset memory.{emin, elow} when reclaiming the memcg
+ * itself.
+ */
+ if (memcg != root_mem_cgroup) {
+ memcg->memory.emin = 0;
+ memcg->memory.elow = 0;
+ }
return MEMCG_PROT_NONE;
+ }
usage = page_counter_read(&memcg->memory);
if (!usage)
--
1.8.3.1
From e22b54ba8e49c05134d22c24353b40e61db64b12 Mon Sep 17 00:00:00 2001
From: Yafang Shao <[email protected]>
Date: Fri, 20 Dec 2019 02:41:40 -0500
Subject: [PATCH 4/5] mm: make memcg visible to lru walker isolation function
The lru walker isolation function may use the memcg to do something, e.g.
the inode isolation function will use the memcg to do inode protection in a
follow-up patch. So make the memcg visible to the lru walker isolation
function.
One thing to emphasize about this patch is that it replaces
for_each_memcg_cache_index() with for_each_mem_cgroup() in
list_lru_walk_node(). There is a gap between these two macros:
for_each_mem_cgroup() depends on CONFIG_MEMCG, while the other depends on
CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
CONFIG_MEMCG_KMEM is not configured, this replacement is safe.
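For reference, a minimal usage sketch of the iterator as exported to
memcontrol.h by this patch. The reference-counting rule comes from the
comment moved along with the macros; the two predicates are placeholders,
not real kernel functions:

	struct mem_cgroup *memcg;

	for_each_mem_cgroup(memcg) {
		if (!interesting(memcg))	/* placeholder predicate */
			continue;
		if (done(memcg)) {		/* placeholder predicate */
			/* Exiting the loop early: drop the iterator's css reference. */
			mem_cgroup_iter_break(NULL, memcg);
			break;
		}
	}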
Cc: Dave Chinner <[email protected]>
Signed-off-by: Yafang Shao <[email protected]>
---
include/linux/memcontrol.h | 24 ++++++++++++++++++++++++
mm/list_lru.c | 14 +++++++-------
mm/memcontrol.c | 15 ---------------
3 files changed, 31 insertions(+), 22 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1a315c7..e1c5544 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -449,6 +449,23 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
int mem_cgroup_scan_tasks(struct mem_cgroup *,
int (*)(struct task_struct *, void *), void *);
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
+
+
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
@@ -949,6 +966,12 @@ static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return 0;
}
+#define for_each_mem_cgroup(iter) \
+ for (; NULL; )
+
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (; NULL; )
+
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
return 0;
@@ -1388,6 +1411,7 @@ void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
#define for_each_memcg_cache_index(_idx) \
for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
+
static inline bool memcg_kmem_enabled(void)
{
return static_branch_unlikely(&memcg_kmem_enabled_key);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 0f1f6b0..337895e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -207,11 +207,11 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
+__list_lru_walk_one(struct list_lru_node *nlru, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
-
+ int memcg_idx = memcg_cache_id(memcg);
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
@@ -273,7 +273,7 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
unsigned long ret;
spin_lock(&nlru->lock);
- ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ ret = __list_lru_walk_one(nlru, memcg, isolate, cb_arg,
nr_to_walk);
spin_unlock(&nlru->lock);
return ret;
@@ -289,7 +289,7 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
unsigned long ret;
spin_lock_irq(&nlru->lock);
- ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ ret = __list_lru_walk_one(nlru, memcg, isolate, cb_arg,
nr_to_walk);
spin_unlock_irq(&nlru->lock);
return ret;
@@ -299,17 +299,17 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
+ struct mem_cgroup *memcg;
long isolated = 0;
- int memcg_idx;
isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
nr_to_walk);
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
- for_each_memcg_cache_index(memcg_idx) {
+ for_each_mem_cgroup(memcg) {
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
- isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolated += __list_lru_walk_one(nlru, memcg,
isolate, cb_arg,
nr_to_walk);
spin_unlock(&nlru->lock);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e78931..2fc2bf4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -222,21 +222,6 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
-/*
- * Iteration constructs for visiting all cgroups (under a tree). If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root) \
- for (iter = mem_cgroup_iter(root, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter) \
- for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(NULL, iter, NULL))
-
static inline bool should_force_charge(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
--
1.8.3.1
From 150abc1af5c95d05ba82b83a13bb93703b16f402 Mon Sep 17 00:00:00 2001
From: Yafang Shao <[email protected]>
Date: Fri, 20 Dec 2019 04:08:29 -0500
Subject: [PATCH 5/5] memcg, inode: protect page cache from freeing inode
On my server there are some running MEMCGs protected by memory.{min, low},
but I found that the usage of these MEMCGs abruptly became very small, far
below the protection limit. It confused me, and finally I found that it was
caused by inode stealing.
Once an inode is freed, all of its page cache will be dropped as well, no
matter how much page cache it has. So if we intend to protect the page
cache in a memcg, we must protect its host (the inode) first. Otherwise the
memcg protection can easily be bypassed by freeing the inode, especially if
there are big files in the memcg.
Suppose we have a memcg whose stats are as below:
    memory.current = 1024M
    memory.min     = 512M
and in this memcg there is an inode with 800M of page cache.
Once this memcg is scanned by kswapd or another regular reclaimer:
    kswapd                                <<<< It can be any of the regular reclaimers.
      shrink_node_memcgs
        switch (mem_cgroup_protected())   <<<< Not protected
        case MEMCG_PROT_NONE:             <<<< Will scan this memcg
            break;
        shrink_lruvec()                   <<<< Reclaims the page cache
        shrink_slab()                     <<<< May free this inode and drop all of its
                                               page cache (800M).
The inherent mismatch between memcg and inode is a problem. One inode can
be shared by different MEMCGs, but that is a very rare case. If an inode is
shared, its page cache may be charged to different MEMCGs. Currently there
is no perfect solution for this kind of issue, but the inode majority-writer
ownership switching can help more or less.
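For the example above, the check added in memcg_can_reclaim_inode() below
keeps that inode on the LRU. A rough sketch with the numbers from the
example (the code compares page counts; sizes are used here for
readability):

    protection  = memory.min        = 512M
    cgroup_size = mem_cgroup_size() = 1024M
    inode->i_data.nrpages           = 800M worth of page cache

    800M + 512M > 1024M  =>  not reclaimable, inode_lru_isolate() returns LRU_ROTATE

The inode is only allowed to be freed when the memcg would still be at or
above its protection after dropping all of the inode's page cache.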
Cc: Roman Gushchin <[email protected]>
Cc: Chris Down <[email protected]>
Cc: Dave Chinner <[email protected]>
Signed-off-by: Yafang Shao <[email protected]>
---
fs/inode.c | 22 ++++++++++++++++++++--
include/linux/memcontrol.h | 11 +++++++++++
mm/memcontrol.c | 35 +++++++++++++++++++++++++++++++++++
mm/vmscan.c | 5 +++++
4 files changed, 71 insertions(+), 2 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index fef457a..dfd9140 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -54,6 +54,13 @@
* inode_hash_lock
*/
+struct inode_head {
+ struct list_head *freeable;
+#ifdef CONFIG_MEMCG_KMEM
+ struct mem_cgroup *memcg;
+#endif
+};
+
static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
@@ -724,8 +731,10 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
static enum lru_status inode_lru_isolate(struct list_head *item,
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
- struct list_head *freeable = arg;
+ struct inode_head *ihead = (struct inode_head *)arg;
+ struct list_head *freeable = ihead->freeable;
struct inode *inode = container_of(item, struct inode, i_lru);
+ struct mem_cgroup *memcg = ihead->memcg;
/*
* we are inverting the lru lock/inode->i_lock here, so use a trylock.
@@ -734,6 +743,12 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
+ if (memcg && inode->i_data.nrpages &&
+ !(memcg_can_reclaim_inode(memcg, inode))) {
+ spin_unlock(&inode->i_lock);
+ return LRU_ROTATE;
+ }
+
/*
* Referenced or dirty inodes are still in use. Give them another pass
* through the LRU as we canot reclaim them now.
@@ -789,11 +804,14 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
*/
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
+ struct inode_head ihead;
LIST_HEAD(freeable);
long freed;
+ ihead.freeable = &freeable;
+ ihead.memcg = sc->memcg;
freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
- inode_lru_isolate, &freeable);
+ inode_lru_isolate, &ihead);
dispose_list(&freeable);
return freed;
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e1c5544..0552131 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -247,6 +247,9 @@ struct mem_cgroup {
unsigned int tcpmem_active : 1;
unsigned int tcpmem_pressure : 1;
+ /* Soft protection will be ignored if it's true */
+ unsigned int in_low_reclaim : 1;
+
int under_oom;
int swappiness;
@@ -363,6 +366,8 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
+bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
+ struct inode *inode);
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -867,6 +872,12 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
return MEMCG_PROT_NONE;
}
+static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
+ struct inode *inode)
+{
+ return true;
+}
+
static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask,
struct mem_cgroup **memcgp,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2fc2bf4..6756fd3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6340,6 +6340,41 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
}
/**
+ * Once an inode is freed, all of its page cache will be dropped as
+ * well, even if that is a lot of page cache. So if we intend to protect
+ * the page cache in a memcg, we must protect its host (the inode) first.
+ * Otherwise the memcg protection can easily be bypassed by freeing the
+ * inode, especially if there are big files in the memcg.
+ * The inherent mismatch between memcg and inode is a problem. One inode
+ * can be shared by different MEMCGs, but that is a very rare case. If
+ * an inode is shared, its page cache may be charged to different
+ * MEMCGs. Currently there is no perfect solution for this kind of
+ * issue, but the inode majority-writer ownership switching can help
+ * more or less.
+ */
+bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
+ struct inode *inode)
+{
+ unsigned long cgroup_size;
+ unsigned long protection;
+ bool reclaimable = true;
+
+ protection = mem_cgroup_protection(memcg, memcg->in_low_reclaim);
+ if (!protection)
+ goto out;
+
+ /*
+ * Don't protect this inode if the usage of this memcg would still
+ * be above the protection after reclaiming this inode and all of
+ * its page cache.
+ */
+ cgroup_size = mem_cgroup_size(memcg);
+ if (inode->i_data.nrpages + protection > cgroup_size)
+ reclaimable = false;
+out:
+ return reclaimable;
+}
+
+/**
* mem_cgroup_try_charge - try charging a page
* @page: page to charge
* @mm: mm context of the victim
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3c4c2da..ecc5c1d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2666,6 +2666,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
sc->memcg_low_skipped = 1;
continue;
}
+
+ memcg->in_low_reclaim = 1;
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
@@ -2693,6 +2695,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
sc->priority);
+ if (memcg->in_low_reclaim)
+ memcg->in_low_reclaim = 0;
+
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
--
1.8.3.1
From bb831e95980190a65a9dd700e503758dc877c39f Mon Sep 17 00:00:00 2001
From: Yafang Shao <[email protected]>
Date: Mon, 16 Dec 2019 04:21:30 -0500
Subject: [PATCH 1/5] mm, memcg: reduce size of struct mem_cgroup by using bit
field
There are some members in struct mem_cgroup that can only be 0 (false) or
1 (true), so we can define them as bit fields to reduce the size of the
struct. With this patch, the size of struct mem_cgroup can be reduced by 64
bytes in theory, but as there are some MEMCG_PADDING()s, the real number
may differ, depending on the cacheline size. In any case, this patch
reduces the size of struct mem_cgroup more or less.
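As a standalone illustration of the mechanism (plain userspace C, not
kernel code; the exact saving depends on the ABI, and in struct mem_cgroup
also on the MEMCG_PADDING() cachelines):

#include <stdbool.h>
#include <stdio.h>

/* One storage unit per flag, as before this patch. */
struct flags_plain {
	bool use_hierarchy;
	bool oom_group;
	bool oom_lock;
	int oom_kill_disable;
	bool tcpmem_active;
	int tcpmem_pressure;
};

/* The same flags packed into single bits, as after this patch. */
struct flags_packed {
	unsigned int use_hierarchy : 1;
	unsigned int oom_group : 1;
	unsigned int oom_lock : 1;
	unsigned int oom_kill_disable : 1;
	unsigned int tcpmem_active : 1;
	unsigned int tcpmem_pressure : 1;
};

int main(void)
{
	/* Typically prints 16 vs 4 on x86-64: the six bit fields share one int. */
	printf("plain: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct flags_plain), sizeof(struct flags_packed));
	return 0;
}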
Signed-off-by: Yafang Shao <[email protected]>
---
include/linux/memcontrol.h | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a7a0a1a5..612a457 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -229,20 +229,26 @@ struct mem_cgroup {
/*
* Should the accounting and control be hierarchical, per subtree?
*/
- bool use_hierarchy;
+ unsigned int use_hierarchy : 1;
/*
* Should the OOM killer kill all belonging tasks, had it kill one?
*/
- bool oom_group;
+ unsigned int oom_group : 1;
/* protected by memcg_oom_lock */
- bool oom_lock;
- int under_oom;
+ unsigned int oom_lock : 1;
- int swappiness;
/* OOM-Killer disable */
- int oom_kill_disable;
+ unsigned int oom_kill_disable : 1;
+
+ /* Legacy tcp memory accounting */
+ unsigned int tcpmem_active : 1;
+ unsigned int tcpmem_pressure : 1;
+
+ int under_oom;
+
+ int swappiness;
/* memory.events and memory.events.local */
struct cgroup_file events_file;
@@ -297,9 +303,6 @@ struct mem_cgroup {
unsigned long socket_pressure;
- /* Legacy tcp memory accounting */
- bool tcpmem_active;
- int tcpmem_pressure;
#ifdef CONFIG_MEMCG_KMEM
/* Index in the kmem_cache->memcg_params.memcg_caches array */
--
1.8.3.1