saagarjha · November 11, 2023 10:01
diff --git a/file_drain.c b/file_drain.c
 // Sometimes you have a large file on a small disk and would like to "transform"
 // it in some way: for example, by decompressing it. However, you might not have
 // enough space on disk to keep both the compressed file and the
 // decompressed results. If the process can be done in a streaming fashion, it
 // would be nice if the file could be "drained"; that is, the file would be
 // sequentially deleted as it is consumed. At the start you'd have 100% of the
 // original file, somewhere in the middle you'd have about half of the original
 // file and half of your output, and by the end the original file will be gone
 // and you'll be left with just the results. If you do it this way, you might
 // be able to do the entire operation without extra space!
 //
 // file_drain does exactly that. It's quite simple: first it reverses the file
 // in place (technically, it reverses file *blocks*—but that's just for
 // performance). Then it goes backwards through the file and reads it back out.
 // Why all the reversing? Because POSIX offers an API to shrink a file from the
 // end: ftruncate(2). This lets it trim the file as it goes along, which is
 // exactly what it does.
 //
 // To use it, first compile it:
 // 
 // $ gcc file_drain.c -o file_drain
 //
 // Then, pass it the file you want to drain, and it will do its work and
 // redirect it out to standard output. For example:
 //
 // $ ./file_drain my_big_archive.tgz | tar zxf -
 //
 // WARNING: Due to the nature of how file_drain works, it is necessarily
 // somewhat unsafe, since it modifies your input file to reverse it and then
 // progressively prunes it. THIS IS A DESTRUCTIVE OPERATION. But it also means
 // that if something fails in the middle you'll end up with a mess that is
 // likely not recoverable, and I don't make any efforts to do so. I would
 // suggest only running this on files you can easily re-create (for example by
 // downloading them again) when things go wrong.


 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <unistd.h>

 #define BLOCK_SIZE (16 * (1 << 14)) /* sixteen 16 KiB units = 256 KiB per block */

 // Reads the BLOCK_SIZE bytes at `offset` into `block`, retrying short reads.
 // Reaching end-of-file early is tolerated (the last block of the file may be
 // partial); the unread remainder is zero-filled so the padding is
 // deterministic. A read error is fatal.
 static void reverse_read_block(int fd, char *block, off_t offset) {
 	size_t filled = 0;
 	while (filled < BLOCK_SIZE) {
 		ssize_t count = pread(fd, block + filled, BLOCK_SIZE - filled, offset + (off_t)filled);
 		if (count < 0) {
 			perror("pread");
 			exit(EXIT_FAILURE);
 		}
 		if (count == 0) {
 			break; // End of file: only expected inside the final block.
 		}
 		filled += (size_t)count;
 	}
 	while (filled < BLOCK_SIZE) {
 		block[filled++] = 0;
 	}
 }

 // Writes all BLOCK_SIZE bytes of `block` at `offset`, retrying short writes.
 // A failed write (or one that makes no progress) is fatal.
 static void reverse_write_block(int fd, const char *block, off_t offset) {
 	size_t done = 0;
 	while (done < BLOCK_SIZE) {
 		ssize_t count = pwrite(fd, block + done, BLOCK_SIZE - done, offset + (off_t)done);
 		if (count <= 0) {
 			perror("pwrite");
 			exit(EXIT_FAILURE);
 		}
 		done += (size_t)count;
 	}
 }

 // Reverses, in place, the order of the BLOCK_SIZE-sized blocks of `fd` that
 // lie in [head, tail). The bytes inside each block keep their order.
 //
 // `tail` is the end offset of the range; main() passes the file size rounded
 // up to a whole number of blocks. Swapping therefore writes a full block where
 // the (possibly partial) last block used to be, which grows the file to that
 // rounded size.
 //
 // Every transfer is checked: an I/O error terminates the process instead of
 // silently writing garbage back into the file. The file is then left
 // partially reversed.
 static void reverse(int fd, off_t head, off_t tail) {
 	static char buffer1[BLOCK_SIZE];
 	static char buffer2[BLOCK_SIZE];

 	tail -= BLOCK_SIZE;
 	// When head == tail (odd block count) the middle block is swapped with
 	// itself, which is harmless.
 	while (head <= tail) {
 		reverse_read_block(fd, buffer1, head);
 		reverse_read_block(fd, buffer2, tail);
 		reverse_write_block(fd, buffer2, head);
 		reverse_write_block(fd, buffer1, tail);

 		head += BLOCK_SIZE;
 		tail -= BLOCK_SIZE;
 	}
 }

 // Writes all `size` bytes of `buffer` to `fd`, looping over partial writes.
 //
 // A failed write, or one that makes no progress, is fatal: a message is
 // printed to stderr and the process exits with EXIT_FAILURE. The check is
 // explicit rather than an assert() so that it is still performed when the
 // program is built with -DNDEBUG.
 static void do_write(int fd, const char *buffer, size_t size) {
 	size_t written = 0;
 	while (written < size) {
 		ssize_t count = write(fd, buffer + written, size - written);
 		if (count < 0) {
 			// Not much we can do here. (EINTR is not retried.)
 			perror("write");
 			exit(EXIT_FAILURE);
 		}
 		if (count == 0) {
 			// Bail out rather than spin forever on a sink that accepts nothing.
 			fputs("write: no progress\n", stderr);
 			exit(EXIT_FAILURE);
 		}
 		written += (size_t)count;
 	}
 }

 // Streams `size` bytes to standard output by walking the blocks of `fd`
 // backwards from `tail`, truncating the file after each block is written so
 // the space it occupied is released as the data is consumed.
 //
 // `tail` is the end offset of the block-reversed file (a multiple of
 // BLOCK_SIZE as passed by main()); `size` is the number of payload bytes, so
 // the final block emitted may be shorter than BLOCK_SIZE.
 //
 // A read error or unexpected end-of-file is fatal: emitting an unfilled buffer
 // and then truncating the real data away would lose it for good. A failed
 // ftruncate() only means space is not reclaimed, so it is reported once and
 // draining continues.
 static void output(int fd, off_t tail, size_t size) {
 	static char buffer[BLOCK_SIZE];
 	int truncate_warned = 0;

 	tail -= BLOCK_SIZE;
 	while (size) {
 		size_t to_write = size > BLOCK_SIZE ? BLOCK_SIZE : size;

 		// Only the first to_write bytes of the block are payload; read
 		// exactly those, retrying short reads.
 		size_t filled = 0;
 		while (filled < to_write) {
 			ssize_t count = pread(fd, buffer + filled, to_write - filled, tail + (off_t)filled);
 			if (count < 0) {
 				perror("pread");
 				exit(EXIT_FAILURE);
 			}
 			if (count == 0) {
 				fputs("pread: unexpected end of file\n", stderr);
 				exit(EXIT_FAILURE);
 			}
 			filled += (size_t)count;
 		}
 		do_write(STDOUT_FILENO, buffer, to_write);

 		if (ftruncate(fd, tail) != 0 && !truncate_warned) {
 			perror("ftruncate");
 			truncate_warned = 1;
 		}
 		tail -= BLOCK_SIZE;
 		size -= to_write;
 	}
 }

 // Entry point: drains the file named by the single command-line argument to
 // standard output, then removes it.
 //
 // Steps: open the file read/write, measure it, reverse its blocks in place,
 // stream it out back-to-front while truncating, and finally unlink it.
 // Returns EXIT_SUCCESS on success; on a usage or setup error a message is
 // printed to stderr and EXIT_FAILURE is returned. Validation is explicit
 // rather than assert()-based so that it is not compiled out by -DNDEBUG.
 int main(int argc, char **argv) {
 	if (argc != 2) {
 		fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "file_drain");
 		return EXIT_FAILURE;
 	}
 	const char *path = argv[1];

 	int fd = open(path, O_RDWR);
 	if (fd < 0) {
 		perror(path);
 		return EXIT_FAILURE;
 	}

 	off_t size = lseek(fd, 0, SEEK_END);
 	if (size < 0) {
 		perror("lseek");
 		(void)close(fd);
 		return EXIT_FAILURE;
 	}
 	// output() takes the payload length as a size_t; refuse files whose size
 	// would be truncated by that conversion instead of draining only a part.
 	if ((off_t)(size_t)size != size) {
 		fprintf(stderr, "%s: file too large\n", path);
 		(void)close(fd);
 		return EXIT_FAILURE;
 	}

 	// Size rounded up to a whole number of blocks. All file access below uses
 	// explicit offsets, so the file position does not need to be moved.
 	off_t rounded_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE;

 	reverse(fd, 0, rounded_size);
 	output(fd, rounded_size, (size_t)size);

 	// The file has been truncated to nothing by now; a close error is moot.
 	(void)close(fd);
 	if (unlink(path) != 0) {
 		perror(path);
 		return EXIT_FAILURE;
 	}
 	return EXIT_SUCCESS;
 }
	// Sometimes you have a large file on a small disk and would like to "transform"
	// it in some way: for example, by decompressing it. However, you might not have
	// enough space on disk to keep both the compressed file and the
	// decompressed results. If the process can be done in a streaming fashion, it
	// would be nice if the file could be "drained"; that is, the file would be
	// sequentially deleted as it is consumed. At the start you'd have 100% of the
	// original file, somewhere in the middle you'd have about half of the original
	// file and half of your output, and by the end the original file will be gone
	// and you'll be left with just the results. If you do it this way, you might
	// be able to do the entire operation without extra space!
	//
	// file_drain does exactly that. It's quite simple: first it reverses the file
	// in place (technically, it reverses file *blocks*—but that's just for
	// performance). Then it goes backwards through the file and reads it back out.
	// Why all the reversing? Because POSIX offers an API to shrink a file from the
	// end: ftruncate(2). This lets it trim the file as it goes along, which is
	// exactly what it does.
	//
	// To use it, first compile it:
	//
	// $ gcc file_drain.c -o file_drain
	//
	// Then, pass it the file you want to drain, and it will do its work and
	// redirect it out to standard output. For example:
	//
	// $ ./file_drain my_big_archive.tgz | tar zxf -
	//
	// WARNING: Due to the nature of how file_drain works, it is necessarily
	// somewhat unsafe, since it modifies your input file to reverse it and then
	// progressively prunes it. THIS IS A DESTRUCTIVE OPERATION. But it also means
	// that if something fails in the middle you'll end up with a mess that is
	// likely not recoverable, and I don't make any efforts to do so. I would
	// suggest only running this on files you can easily re-create (for example by
	// downloading them again) when things go wrong.


	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <assert.h>
	#include <unistd.h>

	#define BLOCK_SIZE (16 * (1 << 14)) /* sixteen 16 KiB units = 256 KiB per block */

	// Reads the BLOCK_SIZE bytes at `offset` into `block`, retrying short reads.
	// Reaching end-of-file early is tolerated (the last block of the file may be
	// partial); the unread remainder is zero-filled so the padding is
	// deterministic. A read error is fatal.
	static void reverse_read_block(int fd, char *block, off_t offset) {
		size_t filled = 0;
		while (filled < BLOCK_SIZE) {
			ssize_t count = pread(fd, block + filled, BLOCK_SIZE - filled, offset + (off_t)filled);
			if (count < 0) {
				perror("pread");
				exit(EXIT_FAILURE);
			}
			if (count == 0) {
				break; // End of file: only expected inside the final block.
			}
			filled += (size_t)count;
		}
		while (filled < BLOCK_SIZE) {
			block[filled++] = 0;
		}
	}

	// Writes all BLOCK_SIZE bytes of `block` at `offset`, retrying short writes.
	// A failed write (or one that makes no progress) is fatal.
	static void reverse_write_block(int fd, const char *block, off_t offset) {
		size_t done = 0;
		while (done < BLOCK_SIZE) {
			ssize_t count = pwrite(fd, block + done, BLOCK_SIZE - done, offset + (off_t)done);
			if (count <= 0) {
				perror("pwrite");
				exit(EXIT_FAILURE);
			}
			done += (size_t)count;
		}
	}

	// Reverses, in place, the order of the BLOCK_SIZE-sized blocks of `fd` that
	// lie in [head, tail). The bytes inside each block keep their order.
	//
	// `tail` is the end offset of the range; main() passes the file size rounded
	// up to a whole number of blocks. Swapping therefore writes a full block where
	// the (possibly partial) last block used to be, which grows the file to that
	// rounded size.
	//
	// Every transfer is checked: an I/O error terminates the process instead of
	// silently writing garbage back into the file. The file is then left
	// partially reversed.
	static void reverse(int fd, off_t head, off_t tail) {
		static char buffer1[BLOCK_SIZE];
		static char buffer2[BLOCK_SIZE];

		tail -= BLOCK_SIZE;
		// When head == tail (odd block count) the middle block is swapped with
		// itself, which is harmless.
		while (head <= tail) {
			reverse_read_block(fd, buffer1, head);
			reverse_read_block(fd, buffer2, tail);
			reverse_write_block(fd, buffer2, head);
			reverse_write_block(fd, buffer1, tail);

			head += BLOCK_SIZE;
			tail -= BLOCK_SIZE;
		}
	}

	// Writes all `size` bytes of `buffer` to `fd`, looping over partial writes.
	//
	// A failed write, or one that makes no progress, is fatal: a message is
	// printed to stderr and the process exits with EXIT_FAILURE. The check is
	// explicit rather than an assert() so that it is still performed when the
	// program is built with -DNDEBUG.
	static void do_write(int fd, const char *buffer, size_t size) {
		size_t written = 0;
		while (written < size) {
			ssize_t count = write(fd, buffer + written, size - written);
			if (count < 0) {
				// Not much we can do here. (EINTR is not retried.)
				perror("write");
				exit(EXIT_FAILURE);
			}
			if (count == 0) {
				// Bail out rather than spin forever on a sink that accepts nothing.
				fputs("write: no progress\n", stderr);
				exit(EXIT_FAILURE);
			}
			written += (size_t)count;
		}
	}

	// Streams `size` bytes to standard output by walking the blocks of `fd`
	// backwards from `tail`, truncating the file after each block is written so
	// the space it occupied is released as the data is consumed.
	//
	// `tail` is the end offset of the block-reversed file (a multiple of
	// BLOCK_SIZE as passed by main()); `size` is the number of payload bytes, so
	// the final block emitted may be shorter than BLOCK_SIZE.
	//
	// A read error or unexpected end-of-file is fatal: emitting an unfilled buffer
	// and then truncating the real data away would lose it for good. A failed
	// ftruncate() only means space is not reclaimed, so it is reported once and
	// draining continues.
	static void output(int fd, off_t tail, size_t size) {
		static char buffer[BLOCK_SIZE];
		int truncate_warned = 0;

		tail -= BLOCK_SIZE;
		while (size) {
			size_t to_write = size > BLOCK_SIZE ? BLOCK_SIZE : size;

			// Only the first to_write bytes of the block are payload; read
			// exactly those, retrying short reads.
			size_t filled = 0;
			while (filled < to_write) {
				ssize_t count = pread(fd, buffer + filled, to_write - filled, tail + (off_t)filled);
				if (count < 0) {
					perror("pread");
					exit(EXIT_FAILURE);
				}
				if (count == 0) {
					fputs("pread: unexpected end of file\n", stderr);
					exit(EXIT_FAILURE);
				}
				filled += (size_t)count;
			}
			do_write(STDOUT_FILENO, buffer, to_write);

			if (ftruncate(fd, tail) != 0 && !truncate_warned) {
				perror("ftruncate");
				truncate_warned = 1;
			}
			tail -= BLOCK_SIZE;
			size -= to_write;
		}
	}

	// Entry point: drains the file named by the single command-line argument to
	// standard output, then removes it.
	//
	// Steps: open the file read/write, measure it, reverse its blocks in place,
	// stream it out back-to-front while truncating, and finally unlink it.
	// Returns EXIT_SUCCESS on success; on a usage or setup error a message is
	// printed to stderr and EXIT_FAILURE is returned. Validation is explicit
	// rather than assert()-based so that it is not compiled out by -DNDEBUG.
	int main(int argc, char **argv) {
		if (argc != 2) {
			fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "file_drain");
			return EXIT_FAILURE;
		}
		const char *path = argv[1];

		int fd = open(path, O_RDWR);
		if (fd < 0) {
			perror(path);
			return EXIT_FAILURE;
		}

		off_t size = lseek(fd, 0, SEEK_END);
		if (size < 0) {
			perror("lseek");
			(void)close(fd);
			return EXIT_FAILURE;
		}
		// output() takes the payload length as a size_t; refuse files whose size
		// would be truncated by that conversion instead of draining only a part.
		if ((off_t)(size_t)size != size) {
			fprintf(stderr, "%s: file too large\n", path);
			(void)close(fd);
			return EXIT_FAILURE;
		}

		// Size rounded up to a whole number of blocks. All file access below uses
		// explicit offsets, so the file position does not need to be moved.
		off_t rounded_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE;

		reverse(fd, 0, rounded_size);
		output(fd, rounded_size, (size_t)size);

		// The file has been truncated to nothing by now; a close error is moot.
		(void)close(fd);
		if (unlink(path) != 0) {
			perror(path);
			return EXIT_FAILURE;
		}
		return EXIT_SUCCESS;
	}
No results found