Last active
April 17, 2023 06:45
-
-
Save jiewei-ke/4962ac0061703fdcd0837bab2dbec261 to your computer and use it in GitHub Desktop.
This gist repo is about an ext4 deadlock issue which may be happened when OOM killer is triggered on CentOS 7 with version after 3.10.0-862.el7.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ext4 deadlock issue | |
## 0. Background | |
This gist repo is about an ext4 deadlock issue which may be happened when OOM | |
killer is triggered on CentOS 7 with version after 3.10.0-862.el7. | |
See https://bugzilla.redhat.com/show_bug.cgi?id=1911392 for more details. | |
## 1. How to reproduce this issue? | |
Download the ext4-repro-with-multiple-cgroup.sh and ext4-repro-with-write, and run | |
$ bash ext4-repro-with-multiple-cgroup.sh | |
It will do the following things, | |
1. Prepare a test binary, named 'ext4-repro-with-write', which will open a file | |
(located on an ext4 fs) and keep writing to it; see ext4-repro-with-write.go; | |
2. Create 10 cgroups, each with memory limit size 100 MB; | |
3. For each cgroup, create 30 services to run the ext4-repro-with-write binary; | |
4. The deadlock issue can be reproduced in 10 mins, the ext4 will hang; other | |
tasks doing I/O on the same ext4 fs will hang also. | |
To stop the test, run | |
$ systemctl stop ext4-repro-cgroup*-* | |
## 2. How to fix it? | |
We proposed two patches to fix this issue. | |
The final solution will be updated here when we get confirmation from Redhat. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash | |
echo "clean up" | |
systemctl stop ext4-repro-cgroup*-* | |
mkdir /usr/share/ext4-repro/ | |
cp ext4-repro-with-write /tmp/ext4-repro-with-write | |
chmod +x /tmp/ext4-repro-with-write | |
for i in `seq 1 10`;do | |
echo "prepare cgroup" | |
cat << EOF > /etc/systemd/system/system-fio${i}.slice | |
[Unit] | |
Description=zbs Slice | |
Before=slices.target | |
Wants=system.slice | |
After=system.slice | |
EOF | |
mkdir /etc/systemd/system/system-fio${i}.slice.d | |
cat << EOF > /etc/systemd/system/system-fio${i}.slice.d/50-MemoryLimit.conf | |
[Slice] | |
MemoryLimit=104857600 | |
EOF | |
systemctl daemon-reload | |
systemctl restart system-fio${i}.slice | |
systemctl status system-fio${i}.slice | |
echo "prepare fio data path" | |
TEST_PATH="/usr/share/ext4-repro" | |
mkdir $TEST_PATH | |
for j in `seq 1 30`;do | |
mkdir /etc/systemd/system/ext4-repro-cgroup${i}-${j}.service.d/ | |
cat << EOF > /etc/systemd/system/ext4-repro-cgroup${i}-${j}.service.d/cgroup.conf | |
[Service] | |
Slice=system-fio${i}.slice | |
EOF | |
cat << EOF > /usr/lib/systemd/system/ext4-repro-cgroup${i}-${j}.service | |
[Unit] | |
After=network.target | |
[Service] | |
ExecStart=/tmp/ext4-repro-with-write | |
Restart=always | |
RestartSec=3s | |
[Install] | |
WantedBy=multi-user.target | |
EOF | |
done | |
pushd /usr/lib/systemd/system | |
systemctl restart ext4-repro-cgroup${i}*.service | |
popd | |
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"math/rand" | |
"os" | |
"strconv" | |
"time" | |
) | |
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
const path = "/usr/share/ext4-repo/" | |
func RandStringBytes(n int) string { | |
b := make([]byte, n) | |
for i := range b { | |
b[i] = letterBytes[rand.Intn(len(letterBytes))] | |
} | |
return string(b) | |
} | |
func createFileName(path string) string { | |
name, _ := os.Hostname() | |
fn := path + name + strconv.Itoa(int(time.Now().Unix())) + RandStringBytes(8) | |
return fn | |
} | |
func w(path string) { | |
fn := createFileName(path) | |
fmt.Println(fn) | |
f, _ := os.OpenFile(fn, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) | |
fmt.Println(f) | |
defer os.Remove(fn) | |
defer f.Close() | |
for { | |
s := RandStringBytes(10) | |
_, _ = f.WriteString(s) | |
f.Sync() | |
} | |
} | |
func main() { | |
path := flag.String("path", "/usr/share/ext4-repo/", "a string") | |
flag.Parse() | |
fmt.Println("path:", *path) | |
go w(*path) | |
for { | |
time.Sleep(time.Duration(1) * time.Second) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 4b64f5a2fdeee9589572de9ddd44b8f8134777e4 Mon Sep 17 00:00:00 2001 | |
From: Jiewei Ke <[email protected]> | |
Date: Mon, 21 Dec 2020 20:57:27 -0500 | |
Subject: [PATCH 1/2] buffer: crossport patches to ensure grow_dev_page() will | |
always use GFP_NOFAIL | |
Following patches are crossported to 3.10.0, | |
bc48f001de12 buffer: eliminate the need to call free_more_memory() in getblk_slow() | |
94dc24c0c59a buffer: grow_dev_page() should use GFP_NOFAIL for all cases | |
640ab98fb362 buffer: have alloc_page_buffers() use __GFP_NOFAIL | |
--- | |
drivers/md/md-bitmap.c | 2 +- | |
fs/buffer.c | 62 ++++++++------------------------------------- | |
fs/ntfs/aops.c | 2 +- | |
fs/ntfs/mft.c | 2 +- | |
include/linux/buffer_head.h | 2 +- | |
5 files changed, 15 insertions(+), 55 deletions(-) | |
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c | |
index f0b5deb..aee6b00 100644 | |
--- a/drivers/md/md-bitmap.c | |
+++ b/drivers/md/md-bitmap.c | |
@@ -351,7 +351,7 @@ static int read_page(struct file *file, unsigned long index, | |
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, | |
(unsigned long long)index << PAGE_SHIFT); | |
- bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); | |
+ bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); | |
if (!bh) { | |
ret = -ENOMEM; | |
goto out; | |
diff --git a/fs/buffer.c b/fs/buffer.c | |
index ccb6790..0cda816 100644 | |
--- a/fs/buffer.c | |
+++ b/fs/buffer.c | |
@@ -254,27 +254,6 @@ out: | |
} | |
/* | |
- * Kick the writeback threads then try to free up some ZONE_NORMAL memory. | |
- */ | |
-static void free_more_memory(void) | |
-{ | |
- struct zone *zone; | |
- int nid; | |
- | |
- wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); | |
- yield(); | |
- | |
- for_each_online_node(nid) { | |
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), | |
- gfp_zone(GFP_NOFS), NULL, | |
- &zone); | |
- if (zone) | |
- try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, | |
- GFP_NOFS, NULL); | |
- } | |
-} | |
- | |
-/* | |
* I/O completion handler for block_read_full_page() - pages | |
* which come unlocked at the end of I/O. | |
*/ | |
@@ -829,16 +808,19 @@ int remove_inode_buffers(struct inode *inode) | |
* which may not fail from ordinary buffer allocations. | |
*/ | |
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | |
- int retry) | |
+ bool retry) | |
{ | |
struct buffer_head *bh, *head; | |
+ gfp_t gfp = GFP_NOFS; | |
long offset; | |
-try_again: | |
+ if (retry) | |
+ gfp |= __GFP_NOFAIL; | |
+ | |
head = NULL; | |
offset = PAGE_SIZE; | |
while ((offset -= size) >= 0) { | |
- bh = alloc_buffer_head(GFP_NOFS); | |
+ bh = alloc_buffer_head(gfp); | |
if (!bh) | |
goto no_grow; | |
@@ -864,23 +846,7 @@ no_grow: | |
} while (head); | |
} | |
- /* | |
- * Return failure for non-async IO requests. Async IO requests | |
- * are not allowed to fail, so we have to wait until buffer heads | |
- * become available. But we don't want tasks sleeping with | |
- * partially complete buffers, so all were released above. | |
- */ | |
- if (!retry) | |
- return NULL; | |
- | |
- /* We're _really_ low on memory. Now we just | |
- * wait for old buffer heads to become free due to | |
- * finishing IO. Since this is an async request and | |
- * the reserve list is empty, we're sure there are | |
- * async buffer heads in use. | |
- */ | |
- free_more_memory(); | |
- goto try_again; | |
+ return NULL; | |
} | |
EXPORT_SYMBOL_GPL(alloc_page_buffers); | |
@@ -969,8 +935,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |
gfp_mask |= __GFP_NOFAIL; | |
page = find_or_create_page(inode->i_mapping, index, gfp_mask); | |
- if (!page) | |
- return ret; | |
BUG_ON(!PageLocked(page)); | |
@@ -988,9 +952,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |
/* | |
* Allocate some buffers for this page | |
*/ | |
- bh = alloc_page_buffers(page, size, 0); | |
- if (!bh) | |
- goto failed; | |
+ bh = alloc_page_buffers(page, size, true); | |
/* | |
* Link the page to the buffers and initialise them. Take the | |
@@ -1070,8 +1032,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) | |
ret = grow_buffers(bdev, block, size); | |
if (ret < 0) | |
return NULL; | |
- if (ret == 0) | |
- free_more_memory(); | |
} | |
} | |
@@ -1537,7 +1497,7 @@ void create_empty_buffers(struct page *page, | |
{ | |
struct buffer_head *bh, *head, *tail; | |
- head = alloc_page_buffers(page, blocksize, 1); | |
+ head = alloc_page_buffers(page, blocksize, true); | |
bh = head; | |
do { | |
bh->b_state |= b_state; | |
@@ -2586,7 +2546,7 @@ int nobh_write_begin(struct address_space *mapping, | |
* Be careful: the buffer linked list is a NULL terminated one, rather | |
* than the circular one we're used to. | |
*/ | |
- head = alloc_page_buffers(page, blocksize, 0); | |
+ head = alloc_page_buffers(page, blocksize, false); | |
if (!head) { | |
ret = -ENOMEM; | |
goto out_release; | |
@@ -2866,7 +2826,7 @@ int block_truncate_page(struct address_space *mapping, | |
length = blocksize - length; | |
iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | |
- | |
+ | |
page = grab_cache_page(mapping, index); | |
err = -ENOMEM; | |
if (!page) | |
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c | |
index fa9c05f..9cb0166 100644 | |
--- a/fs/ntfs/aops.c | |
+++ b/fs/ntfs/aops.c | |
@@ -1599,7 +1599,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { | |
spin_lock(&mapping->private_lock); | |
if (unlikely(!page_has_buffers(page))) { | |
spin_unlock(&mapping->private_lock); | |
- bh = head = alloc_page_buffers(page, bh_size, 1); | |
+ bh = head = alloc_page_buffers(page, bh_size, true); | |
spin_lock(&mapping->private_lock); | |
if (likely(!page_has_buffers(page))) { | |
struct buffer_head *tail; | |
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c | |
index 3014a36..9db0e31 100644 | |
--- a/fs/ntfs/mft.c | |
+++ b/fs/ntfs/mft.c | |
@@ -506,7 +506,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, | |
if (unlikely(!page_has_buffers(page))) { | |
struct buffer_head *tail; | |
- bh = head = alloc_page_buffers(page, blocksize, 1); | |
+ bh = head = alloc_page_buffers(page, blocksize, true); | |
do { | |
set_buffer_uptodate(bh); | |
tail = bh; | |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h | |
index ab09b3c..d470df0 100644 | |
--- a/include/linux/buffer_head.h | |
+++ b/include/linux/buffer_head.h | |
@@ -157,7 +157,7 @@ void set_bh_page(struct buffer_head *bh, | |
struct page *page, unsigned long offset); | |
int try_to_free_buffers(struct page *); | |
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | |
- int retry); | |
+ bool retry); | |
void create_empty_buffers(struct page *, unsigned long, | |
unsigned long b_state); | |
void end_buffer_read_sync(struct buffer_head *bh, int uptodate); | |
-- | |
2.11.0 | |
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 7dc5fb5fd28afbc09736025ca42f0e15aefe719f Mon Sep 17 00:00:00 2001 | |
From: Jiewei Ke <[email protected]> | |
Date: Mon, 28 Dec 2020 01:56:41 -0500 | |
Subject: [PATCH 2/2] fs, mm: avoid unexpected NOFAIL removal in | |
__add_to_page_cache_locked | |
NOFAIL flag may be unexpectedly removed after mapping_gfp_constraint() is | |
called in __add_to_page_cache_locked(), which may cause critical issues | |
since upper layer cannot handle it. | |
For example, a deadlock in ext4 may happen like this: when the memory | |
usage is above the the cgroup limit, ext4 fs may loop forever in | |
__getblk_slow() waiting for free memory during transaction commit and | |
block all other transactions; while in order to release some memory usage, | |
OOM-killer may want to kill the task which is waiting for transaction and | |
becomes uninterruptable. | |
This patch fixes it by adding back the NOFAIL flag after | |
mapping_gfp_constraint() if it has been specified by upper layer. For the | |
ext4 case, there are no failure cases in grow_dev_page() that occur | |
because of a failed memory allocation. | |
Also add return value check for find_or_create_page() and | |
alloc_page_buffers() for safety, even though NOFAIL flag has been | |
specified. | |
--- | |
fs/buffer.c | 4 ++++ | |
mm/filemap.c | 5 +++++ | |
2 files changed, 9 insertions(+) | |
diff --git a/fs/buffer.c b/fs/buffer.c | |
index 0cda816..af3bbcb 100644 | |
--- a/fs/buffer.c | |
+++ b/fs/buffer.c | |
@@ -935,6 +935,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |
gfp_mask |= __GFP_NOFAIL; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment