Skip to content

Instantly share code, notes, and snippets.

@karthick18
Created December 28, 2010 23:51
Show Gist options
  • Save karthick18/757928 to your computer and use it in GitHub Desktop.
Save karthick18/757928 to your computer and use it in GitHub Desktop.
Another variant of the last gist: 757086 but using mmap to write to backing storage over write to try and create a kernel panic in __mark_inode_dirty: fs-writeback.c :978 when USB device with ext4 is removed while the writes are targeted to its backing st
/*
* The below code is an effort to reproduce a kernel panic in __mark_inode_dirty triggered by a USB device pull
* while in the middle of writes to the disk. The issue is _consistently_ reproducible with our product runtime
* and log directories mount-binded to the USB drive and ejecting the USB drive while they are running.
* My 4 and a half yr old daughter has to be credited with reproducing the issue since I first observed the
* panic when she mercilessly pulled the USB cable while I was working :) After that, I have been trying hard
* to isolate the issue outside our product code without much success.
* The below code is an effort to reproduce an ext4 uninterruptibe task hang by trying
* to create many mmapped files from child processes logging to a USB backing storage and forcefully ejecting
* the USB drive in the middle of such writes. This code is a variant of another unsuccessful kernel-panic test code
* also in my github gist: https://gist.github.com/757086 that just uses write over mmap to write to the backing storage
* The panic is a result of the inodes backing device marker becoming NULL for superblock type:ext4 for DIRTY inodes.
* The NULL bdi pointer crashes the 2.6.36-rc8 kernel in __mark_inode_dirty: fs/fs-writeback.c: 978.
* But this test-code successfully reproduces the hung task warnings (> 120 seconds uninterruptible task hangs)
* after pulling the USB drive while in the middle of the writes. Invariably 3-4 child processes writing to the backing
* storage end up UNINTERRUPTIBLE state with the kernel dumping the backtrace of the tasks every 120 seconds.
* The uninterruptible hangs show a lockup in the ext4 sync page code-path associated with the journalled commits.
* To be run as: ./crash_kernel -c <num_log_files_to_be_created> -s <each_log_file_size>
* Example run: ./crash_kernel -c 10 -s 25m ,
* would create 10 files with 10 log flusher processes flushing from its
* shared memory to backing storage mount binded to USB. Then eject the USB drive, Ctrl-C the program. The files would
* be created in /root/crash_test where /root is expected to be mount-binded to a USB drive. You can change the
* USB_DIR define below to your USB mount point.
* Do a "ps" and check for crash_kernel child processes entering "D" or uninterruptible state FOREVER thus creating
* uninterruptible task hung backtraces every 120 seconds.
* Don't believe a git pull might resolve the issue even though I am running a slightly old kernel(2.6.36-rc8)
* than the current since I don't see fixes/changes to ext4/fs-writeback.c in the latest.
* Note that I could have just used a series of "dd" but that does reproduce anything. The effort was to try and simulate
* a similar timing condition that seems to be ALWAYS reproduced when running our products applications having their log
* and runtime directories mount-binded to the USB drive.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <getopt.h>
#include <sys/stat.h>
#include <pthread.h>
#include <ctype.h>
#include <string.h>
#include <semaphore.h>
#include <time.h>
#include <signal.h>
#include <sys/wait.h>
#include <errno.h>
#undef MIN
#define MIN(a, b) ( (a) < (b) ? (a) : (b) )
#define USB_DIR "/root"
#define FLUSH_DIR "crash_test"
#define SHM_SEGMENT_SIZE (1 << 20)
#define SHM_SEGMENT_FLUSH_RATE (20)
#define LOG_FILE_SIZE_DEFAULT (50LL << 20) /* 50 mb */
#define LOG_FILES_DEFAULT 0x5
#define MAX_ROTATIONS (0x4) /* power of 2 */
#define ROTATION_MASK (MAX_ROTATIONS - 1)
#define SHM_WRITE_SIZE (SHM_SEGMENT_SIZE - sizeof(struct log_shm_ctl))
struct log_shm_ctl
{
sem_t sem;
int read_index;
int write_index;
} __attribute__ ((aligned(8)));
static volatile int exitme;
static char crash_dir[0xff+1];
static unsigned long long filesize = LOG_FILE_SIZE_DEFAULT;
static int verbose;
#define LDEBUG(fmt, arg...) do { if(__builtin_expect(verbose, 0)) printf(fmt, ##arg); } while(0)
#ifdef SEM_RETRY /* required to test sem_init or futex initializes with non-process shared flag used with processes :P */
#define SEM_WAIT(sem) ({ \
int __err = 0; \
do { \
struct timespec __t; \
clock_gettime(CLOCK_REALTIME, &__t); \
__t.tv_sec += 1; \
__err = sem_timedwait((sem), &__t); \
} while (__err && ( errno == EINTR || errno == ETIMEDOUT)); \
__err; \
})
#else
#define SEM_WAIT(sem) sem_wait(sem)
#endif
#define SEM_POST(sem) sem_post(sem)
static void sigint_handler(int sig)
{
exitme = 1;
}
/*
* We halt when 1 byte of space is remaining as an indicator to throttle the writer or mark space_full
*/
static int space_free(struct log_shm_ctl *ctl)
{
if(ctl->read_index == ctl->write_index)
return SHM_WRITE_SIZE-1;
return (( ctl->read_index + SHM_WRITE_SIZE - ctl->write_index) % SHM_WRITE_SIZE) - 1;
}
/*
* Called with shm lock held.
*/
static void log_shm_write(struct log_shm_ctl *ctl, const char *fmt, ...)
{
char buf[0xff+1];
int len;
int space;
char *data;
va_list ptr;
int pid = getpid();
int err;
va_start(ptr, fmt);
len = vsnprintf(buf, sizeof(buf), fmt, ptr);
va_end(ptr);
data = (char*)(ctl+1);
/*
* Block till space is available.
*/
while(!exitme && !(space=space_free(ctl)))
{
SEM_POST(&ctl->sem);
usleep(1000);
LDEBUG("Log writer process [%d] going to grab the lock in shm_write as its full\n", pid);
err = SEM_WAIT(&ctl->sem);
if(!err)
LDEBUG("Log writer process [%d] grabbed the lock in shm_write as its full\n", pid);
else
LDEBUG("Log writer process [%d] failed to grab the lock in shm_write with error [%s]\n",
pid, strerror(errno));
}
if(exitme) return;
len = MIN(len, space);
if(ctl->write_index >= ctl->read_index)
len = MIN(len, SHM_WRITE_SIZE - ctl->write_index);
else
len = MIN(len, ctl->read_index - ctl->write_index - 1);
memcpy(data + ctl->write_index, buf, len);
data[ctl->write_index + len - 1] = '\n' ; /*truncate with a newline*/
ctl->write_index += len;
if(ctl->write_index == ctl->read_index)
ctl->write_index = 0;
}
/*
* Called with shm lock held.
*/
static void log_shm_read(struct log_shm_ctl *ctl, char *data, int *size)
{
int count;
int max_size = *size;
char *map = (char*)(ctl+1);
pid_t pid = getpid();
int err;
while(!exitme && ctl->read_index == ctl->write_index)
{
SEM_POST(&ctl->sem);
usleep(1000);
LDEBUG("Log flusher task [%d] going to grab the lock in shm read as shm is empty\n", pid);
err = SEM_WAIT(&ctl->sem);
if(!err)
LDEBUG("Log flusher task [%d] grabbed the lock in shm read as shm is empty\n", pid);
else
LDEBUG("Log flusher task [%d] failed to grab the lock in shm read with error [%s]\n",
pid, strerror(errno));
}
if(ctl->read_index <= ctl->write_index)
count = MIN(max_size, ctl->write_index - ctl->read_index);
else
count = MIN(max_size, SHM_WRITE_SIZE - ctl->read_index);
memcpy(data, map + ctl->read_index, count);
ctl->read_index += count;
if(ctl->read_index == SHM_WRITE_SIZE)
ctl->read_index = 0;
*size = count;
}
static void do_write(char *filemap, unsigned long long offset, const char *buf, int bytes)
{
memcpy(filemap + offset, buf, bytes);
}
static int flusher_task(int tid, int fd, char *filemap, unsigned long long filesize)
{
int shm_fd;
char filename[0xff+1];
int rotations = 0;
int cur_filesize = 0;
char *map;
struct log_shm_ctl *ctl;
static char *shared_data;
int count = 0;
pid_t pid = getpid();
int err;
fprintf(stderr, "Log flusher task [%d] for tid [%d]\n", pid, tid);
snprintf(filename, sizeof(filename), "/CRASH_%d", tid);
shm_fd = shm_open(filename, O_RDWR, 0777);
if(shm_fd < 0)
{
perror("shm_open:");
goto out;
}
map = mmap(0, SHM_SEGMENT_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
if(map == MAP_FAILED)
{
perror("mmap:");
goto out_close;
}
snprintf(filename, sizeof(filename), "%s/%d", crash_dir, tid);
ctl = (struct log_shm_ctl*)map;
shared_data = calloc(1, SHM_SEGMENT_SIZE);
assert(shared_data);
SEM_WAIT(&ctl->sem);
while(!exitme)
{
count = SHM_WRITE_SIZE;
log_shm_read(ctl, shared_data, &count);
if(!count) continue;
SEM_POST(&ctl->sem); /* drop the lock*/
if(cur_filesize + count < filesize)
{
do_write(filemap, cur_filesize, shared_data, count);
cur_filesize += count;
}
else
{
register int i;
int write_size;
char fromfile[0xff+1];
char tofile[0xff+1];
write_size = (int)(filesize - cur_filesize);
do_write(filemap, cur_filesize, shared_data, write_size);
msync(filemap, filesize, MS_SYNC);
count -= write_size;
for(i = rotations - 1; i >= 0; --i)
{
snprintf(fromfile, sizeof(fromfile), "%s/%d.%d", crash_dir, tid, i);
snprintf(tofile, sizeof(tofile), "%s/%d.%d", crash_dir, tid, i+1);
unlink(tofile);
rename(fromfile, tofile);
}
snprintf(fromfile, sizeof(fromfile), "%s/%d", crash_dir, tid);
snprintf(tofile, sizeof(tofile), "%s/%d.0", crash_dir, tid);
unlink(tofile);
rename(fromfile, tofile);
++rotations;
rotations &= ROTATION_MASK;
close(fd);
fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0777);
if(fd < 0)
{
perror("rotate open:");
goto out_free;
}
if(ftruncate(fd, filesize) < 0)
{
perror("rotate truncate:");
goto out_free;
}
filemap = mmap(0, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if(filemap == MAP_FAILED)
{
perror("rotate mmap:");
goto out_free;
}
if(count > 0)
{
do_write(filemap, 0, shared_data + write_size, count);
}
cur_filesize = count;
}
LDEBUG("Log flusher task [%d] going to grab the lock after shm write\n", pid);
err = SEM_WAIT(&ctl->sem);
if(!err)
LDEBUG("Log flusher task [%d] grabbed the lock after shm write\n", pid);
else
LDEBUG("Log flusher task [%d] failed to grab the lock after shm write with error [%s]\n",
pid, strerror(errno));
}
SEM_POST(&ctl->sem);
out_free:
free(shared_data);
munmap(map, SHM_SEGMENT_SIZE);
out_close:
close(shm_fd);
out:
return 0;
}
static void *log_thread(void *arg)
{
int tid = (int)(unsigned long)arg;
pid_t pid = getpid();
int fd, target_fd;
char filename[0xff+1];
char *map, *target_map;
register int i;
struct log_shm_ctl *ctl;
pid_t flusher_pid;
int err;
/*
* Mmap the shared segment and create a flusher thread.
*/
snprintf(filename, sizeof(filename), "/CRASH_%d", tid);
fd = shm_open(filename, O_RDWR, 0777);
if(fd < 0)
{
perror("shm_open:");
goto out;
}
map = mmap(0, SHM_SEGMENT_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if(map == MAP_FAILED)
{
perror("mmap:");
goto out_close;
}
snprintf(filename, sizeof(filename), "%s/%d", crash_dir, tid);
target_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0777);
if(target_fd < 0)
{
perror("target_fd open:");
goto out_unmap;
}
if(ftruncate(target_fd, filesize) < 0)
{
perror("target ftruncate:");
goto out_close2;
}
target_map = mmap(0, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, target_fd, 0);
if(target_map == MAP_FAILED)
{
perror("target mmap:");
goto out_close2;
}
ctl = (struct log_shm_ctl*)map;
err = sem_init(&ctl->sem, 1, 1);
if(err < 0)
{
perror("sem_init:");
assert(0);
}
ctl->read_index = ctl->write_index = 0;
SEM_WAIT(&ctl->sem);
/*
* test write and create flusher.
*/
log_shm_write(ctl, "This is test write start\n");
if( !(flusher_pid = fork() ))
{
flusher_task(tid, target_fd, target_map, filesize);
if(target_map)
munmap(target_map, filesize);
if(target_fd >= 0)
close(target_fd);
exit(0);
} else signal(SIGCHLD, SIG_IGN);
while(!exitme)
{
for(i = 0; i < SHM_SEGMENT_FLUSH_RATE; ++i)
log_shm_write(ctl, "This is test write %d to crash the kernel\n", i);
if(exitme) break;
SEM_POST(&ctl->sem);
usleep(1000); /* breather for the flusher to run */
LDEBUG("Log writer [%d], flusher [%d] grabbing the lock before batch shm write\n", pid, flusher_pid);
err = SEM_WAIT(&ctl->sem);
if(!err)
LDEBUG("Log writer [%d], flusher [%d] grabbed the lock after batch shm write\n", pid, flusher_pid);
else
LDEBUG("Log writer [%d], flusher [%d] failed to grab the lock after batch shm write with error [%s]\n",
pid, flusher_pid, strerror(errno));
}
SEM_POST(&ctl->sem);
waitpid(flusher_pid, NULL, 0);
sem_destroy(&ctl->sem);
munmap(target_map, filesize);
out_close2:
close(target_fd);
out_unmap:
munmap(map, SHM_SEGMENT_SIZE);
out_close:
close(fd);
out:
LDEBUG("Exiting log thread [%d]\n", tid);
return NULL;
}
static void crash_kernel(const char *usb_mnt, int files)
{
register int i;
pthread_t *tids;
snprintf(crash_dir, sizeof(crash_dir), "%s/%s", USB_DIR, FLUSH_DIR);
if(!access(crash_dir, F_OK))
{
char cmdbuf[0xff+1];
snprintf(cmdbuf, sizeof(cmdbuf), "rm -rf %s", crash_dir);
if(system(cmdbuf))
perror("system: ");
}
if(mkdir(crash_dir, 0755))
{
perror("mkdir:");
fprintf(stderr, "Unable to create crash test directory [%s]. Exiting ...\n", crash_dir);
exit(127);
}
for(i = 0; i < files; ++i)
{
char filename[0xff+1];
int fd;
snprintf(filename, sizeof(filename), "/CRASH_%d", i);
fd = shm_open(filename, O_CREAT | O_RDWR | O_TRUNC, 0777);
if(fd < 0)
{
perror("shm_open:");
assert(0);
}
if(ftruncate(fd, SHM_SEGMENT_SIZE) < 0)
{
perror("ftruncate:");
assert(0);
}
close(fd);
}
tids = calloc(files, sizeof(*tids));
assert(tids);
for(i = 0; i < files; ++i)
{
int err = pthread_create(&tids[i], NULL, log_thread, (void*)(unsigned long)i);
assert(err == 0);
}
for(i =0 ; i < files; ++i)
pthread_join(tids[i], NULL);
for(i = 0; i < files; ++i)
{
char filename[0xff+1];
snprintf(filename, sizeof(filename), "/CRASH_%d", i);
if(shm_unlink(filename))
perror("shm_unlink:");
}
}
static char *prog;
static volatile int usage(void)
{
fprintf(stderr, "[%s] -s log_file_size -c num_log_files -v | verbose\n", prog);
exit(127);
}
static unsigned long long get_size(const char *s)
{
unsigned long long size = 0;
int shift = 10;
char *last_char = NULL;
size = strtoull(s, &last_char, 10);
if(*last_char)
{
switch( tolower(*last_char) )
{
case 'k':
shift = 10;
break;
case 'm':
shift = 20 ;
break;
case 'g':
shift = 30;
break;
default:
fprintf(stderr, "Invalid suffix [%c] to size string [%s]. Using KB\n", *last_char, s);
shift = 10;
break;
}
}
size <<= shift;
return size;
}
int main(int argc, char **argv)
{
int count = LOG_FILES_DEFAULT;
int c;
if ( ( prog = strrchr(argv[0], '/') ) )
++prog;
else prog = argv[0];
opterr = 0;
while ( ( c = getopt(argc, argv, "vs:c:h") ) != EOF )
switch(c)
{
case 's':
filesize = get_size(optarg);
if(!filesize) filesize = LOG_FILE_SIZE_DEFAULT;
break;
case 'c':
{
char *last_char = NULL;
count = strtol(optarg, &last_char, 10);
if(*last_char)
{
fprintf(stderr, "Invalid count specified [%s]. Should be an integer\n", optarg);
exit(127);
}
if(!count) count = LOG_FILES_DEFAULT;
}
break;
case 'v':
verbose = 1;
break;
case 'h':
case '?':
default:
{
usage();
}
break;
}
if(optind != argc) usage();
signal(SIGINT, sigint_handler);
printf("Testing shared memory flushes with size [%lld], count [%d]\n", filesize, count);
printf("Press Ctrl-C to exit\n");
crash_kernel(USB_DIR, count);
return 0;
}
/*
* Local variables:
* c-file-style: "linux"
* compile-command: "gcc -Wall -g -o crash_kernel crash_kernel.c -lpthread -lrt"
* tab-width: 4
* c-basic-offset: 4
* End:
*/
@karthick18
Copy link
Author

Also raised a high-priority ext4 bug for the above issue: https://bugzilla.kernel.org/show_bug.cgi?id=25792

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment