Skip to content

Commit b4536f0

Browse files
Michal Hocko authored and torvalds committed
mm, memcg: fix the active list aging for lowmem requests when memcg is enabled
Nils Holland and Klaus Ethgen have reported unexpected OOM killer invocations with 32b kernel starting with 4.8 kernels kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 kworker/u4:5 cpuset=/ mems_allowed=0 CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 [...] Mem-Info: active_anon:58685 inactive_anon:90 isolated_anon:0 active_file:274324 inactive_file:281962 isolated_file:0 unevictable:0 dirty:649 writeback:0 unstable:0 slab_reclaimable:40662 slab_unreclaimable:17754 mapped:7382 shmem:202 pagetables:351 bounce:0 free:206736 free_pcp:332 free_cma:0 Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 813 3474 3474 Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB lowmem_reserve[]: 0 0 21292 21292 HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB 
kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB the oom killer is clearly premature because there is still a lot of page cache in the zone Normal which should satisfy this lowmem request. Further debugging has shown that the reclaim cannot make any forward progress because the page cache is hidden in the active list which doesn't get rotated because inactive_list_is_low is not memcg aware. The code simply subtracts per-zone highmem counters from the respective memcg's lru sizes which doesn't make any sense. We can simply end up always seeing the resulting active and inactive counts 0 and return false. This issue is not limited to 32b kernels but in practice the effect on systems without CONFIG_HIGHMEM would be much harder to notice because we do not invoke the OOM killer for allocation requests targeting < ZONE_NORMAL. Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node and subtract per-memcg highmem counts when memcg is enabled. Introduce helper lruvec_zone_lru_size which redirects to either zone counters or mem_cgroup_get_zone_lru_size when appropriate. We are losing empty LRU but non-zero lru size detection introduced by ca70723 ("mm: update_lru_size warn and reset bad lru_size") because of the inherent zone vs. node discrepancy. 
Fixes: f8d1a31 ("mm: consider whether to decivate based on eligible zones inactive ratio") Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reported-by: Nils Holland <nholland@tisys.org> Tested-by: Nils Holland <nholland@tisys.org> Reported-by: Klaus Ethgen <Klaus@Ethgen.de> Acked-by: Minchan Kim <minchan@kernel.org> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: <stable@vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent f073bdc commit b4536f0

4 files changed

Lines changed: 49 additions & 24 deletions

File tree

include/linux/memcontrol.h

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter {
120120
*/
121121
struct mem_cgroup_per_node {
122122
struct lruvec lruvec;
123-
unsigned long lru_size[NR_LRU_LISTS];
123+
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
124124

125125
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
126126

@@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
432432
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
433433

434434
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
435-
int nr_pages);
435+
int zid, int nr_pages);
436436

437437
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
438438
int nid, unsigned int lru_mask);
@@ -441,9 +441,23 @@ static inline
441441
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
442442
{
443443
struct mem_cgroup_per_node *mz;
444+
unsigned long nr_pages = 0;
445+
int zid;
444446

445447
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
446-
return mz->lru_size[lru];
448+
for (zid = 0; zid < MAX_NR_ZONES; zid++)
449+
nr_pages += mz->lru_zone_size[zid][lru];
450+
return nr_pages;
451+
}
452+
453+
static inline
454+
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
455+
enum lru_list lru, int zone_idx)
456+
{
457+
struct mem_cgroup_per_node *mz;
458+
459+
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
460+
return mz->lru_zone_size[zone_idx][lru];
447461
}
448462

449463
void mem_cgroup_handle_over_high(void);
@@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
671685
{
672686
return 0;
673687
}
688+
static inline
689+
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
690+
enum lru_list lru, int zone_idx)
691+
{
692+
return 0;
693+
}
674694

675695
static inline unsigned long
676696
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,

include/linux/mm_inline.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
3939
{
4040
__update_lru_size(lruvec, lru, zid, nr_pages);
4141
#ifdef CONFIG_MEMCG
42-
mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
42+
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
4343
#endif
4444
}
4545

mm/memcontrol.c

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -625,17 +625,16 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
625625
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
626626
int nid, unsigned int lru_mask)
627627
{
628+
struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
628629
unsigned long nr = 0;
629-
struct mem_cgroup_per_node *mz;
630630
enum lru_list lru;
631631

632632
VM_BUG_ON((unsigned)nid >= nr_node_ids);
633633

634634
for_each_lru(lru) {
635635
if (!(BIT(lru) & lru_mask))
636636
continue;
637-
mz = mem_cgroup_nodeinfo(memcg, nid);
638-
nr += mz->lru_size[lru];
637+
nr += mem_cgroup_get_lru_size(lruvec, lru);
639638
}
640639
return nr;
641640
}
@@ -1002,34 +1001,33 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
10021001
* mem_cgroup_update_lru_size - account for adding or removing an lru page
10031002
* @lruvec: mem_cgroup per zone lru vector
10041003
* @lru: index of lru list the page is sitting on
1004+
* @zid: zone id of the accounted pages
10051005
* @nr_pages: positive when adding or negative when removing
10061006
*
10071007
* This function must be called under lru_lock, just before a page is added
10081008
* to or just after a page is removed from an lru list (that ordering being
10091009
* so as to allow it to check that lru_size 0 is consistent with list_empty).
10101010
*/
10111011
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1012-
int nr_pages)
1012+
int zid, int nr_pages)
10131013
{
10141014
struct mem_cgroup_per_node *mz;
10151015
unsigned long *lru_size;
10161016
long size;
1017-
bool empty;
10181017

10191018
if (mem_cgroup_disabled())
10201019
return;
10211020

10221021
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1023-
lru_size = mz->lru_size + lru;
1024-
empty = list_empty(lruvec->lists + lru);
1022+
lru_size = &mz->lru_zone_size[zid][lru];
10251023

10261024
if (nr_pages < 0)
10271025
*lru_size += nr_pages;
10281026

10291027
size = *lru_size;
1030-
if (WARN_ONCE(size < 0 || empty != !size,
1031-
"%s(%p, %d, %d): lru_size %ld but %sempty\n",
1032-
__func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
1028+
if (WARN_ONCE(size < 0,
1029+
"%s(%p, %d, %d): lru_size %ld\n",
1030+
__func__, lruvec, lru, nr_pages, size)) {
10331031
VM_BUG_ON(1);
10341032
*lru_size = 0;
10351033
}

mm/vmscan.c

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
242242
return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
243243
}
244244

245+
unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
246+
int zone_idx)
247+
{
248+
if (!mem_cgroup_disabled())
249+
return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx);
250+
251+
return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx],
252+
NR_ZONE_LRU_BASE + lru);
253+
}
254+
245255
/*
246256
* Add a shrinker callback to be called from the vm.
247257
*/
@@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
13821392
* be complete before mem_cgroup_update_lru_size due to a santity check.
13831393
*/
13841394
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1385-
enum lru_list lru, unsigned long *nr_zone_taken,
1386-
unsigned long nr_taken)
1395+
enum lru_list lru, unsigned long *nr_zone_taken)
13871396
{
13881397
int zid;
13891398

@@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
13921401
continue;
13931402

13941403
__update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1395-
}
1396-
13971404
#ifdef CONFIG_MEMCG
1398-
mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
1405+
mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
13991406
#endif
1407+
}
1408+
14001409
}
14011410

14021411
/*
@@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
15011510
*nr_scanned = scan;
15021511
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
15031512
nr_taken, mode, is_file_lru(lru));
1504-
update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
1513+
update_lru_sizes(lruvec, lru, nr_zone_taken);
15051514
return nr_taken;
15061515
}
15071516

@@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
20472056
if (!managed_zone(zone))
20482057
continue;
20492058

2050-
inactive_zone = zone_page_state(zone,
2051-
NR_ZONE_LRU_BASE + (file * LRU_FILE));
2052-
active_zone = zone_page_state(zone,
2053-
NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
2059+
inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid);
2060+
active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid);
20542061

20552062
inactive -= min(inactive, inactive_zone);
20562063
active -= min(active, active_zone);

0 commit comments

Comments
 (0)