Created
November 11, 2023 10:01
-
-
Save saagarjha/f0ff1911a6630f7417e12bc37dc39d9c to your computer and use it in GitHub Desktop.
"Drain" files while they are processed to reduce free disk space requirements
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sometimes you have a large file on a small disk and would like to "transform" | |
// it in some way: for example, by decompressing it. However, you might not have | |
// enough space on disk to keep both the compressed file and the
// decompressed results. If the process can be done in a streaming fashion, it | |
// would be nice if the file could be "drained"; that is, the file would be | |
// sequentially deleted as it is consumed. At the start you'd have 100% of the | |
// original file, somewhere in the middle you'd have about half of the original | |
// file and half of your output, and by the end the original file will be gone | |
// and you'll be left with just the results. If you do it this way, you might | |
// be able to do the entire operation without extra space! | |
// | |
// file_drain does exactly that. It's quite simple: first it reverses the file | |
// in place (technically, it reverses file *blocks* -- but that's just for
// performance). Then it goes backwards through the file and reads it back out. | |
// Why all the reversing? Because POSIX offers an API to shrink a file from the | |
// end: ftruncate(2). This lets it trim the file as it goes along, which is | |
// exactly what it does. | |
// | |
// To use it, first compile it: | |
// | |
// $ gcc file_drain.c -o file_drain | |
// | |
// Then, pass it the file you want to drain, and it will do its work and | |
// redirect it out to standard output. For example: | |
// | |
// $ ./file_drain my_big_archive.tgz | tar zxf - | |
// | |
// WARNING: Due to the nature of how file_drain works, it is necessarily | |
// somewhat unsafe, since it modifies your input file to reverse it and then | |
// progressively prunes it. THIS IS A DESTRUCTIVE OPERATION. But it also means | |
// that if something fails in the middle you'll end up with a mess that is | |
// likely not recoverable, and I don't make any efforts to do so. I would | |
// suggest only running this on files you can easily re-create (for example by | |
// downloading them again) when things go wrong. | |
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Size in bytes of one I/O block: 16 * 16 KiB = 256 KiB. The shift is
 * parenthesized so the value does not rely on `*` binding tighter than `<<`. */
#define BLOCK_SIZE (16 * (1 << 14))
/*
 * Fill `buffer` with the BLOCK_SIZE bytes of `fd` starting at `offset`.
 *
 * Short reads and EINTR are retried. If end of file is reached before the
 * buffer is full (the final block of the file may be partial), the remainder
 * is zero-filled so that a full block can be written back. Any other failure
 * prints a diagnostic and exits, because continuing would write bytes that
 * were never read into the file.
 */
static void read_block_or_die(int fd, char *buffer, off_t offset) {
    size_t filled = 0;
    while (filled < (size_t)BLOCK_SIZE) {
        ssize_t count = pread(fd, buffer + filled, (size_t)BLOCK_SIZE - filled,
                              offset + (off_t)filled);
        if (count < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("file_drain: pread");
            exit(EXIT_FAILURE);
        }
        if (count == 0) {
            break; /* end of file: only legitimate for the last block */
        }
        filled += (size_t)count;
    }
    memset(buffer + filled, 0, (size_t)BLOCK_SIZE - filled);
}

/*
 * Write exactly BLOCK_SIZE bytes from `buffer` to `fd` at `offset`.
 *
 * Partial writes and EINTR are retried. A real failure (for example ENOSPC
 * when the write extends the file) prints a diagnostic and exits instead of
 * being silently ignored.
 */
static void write_block_or_die(int fd, const char *buffer, off_t offset) {
    size_t done = 0;
    while (done < (size_t)BLOCK_SIZE) {
        ssize_t count = pwrite(fd, buffer + done, (size_t)BLOCK_SIZE - done,
                               offset + (off_t)done);
        if (count < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("file_drain: pwrite");
            exit(EXIT_FAILURE);
        }
        if (count == 0) {
            fputs("file_drain: pwrite made no progress\n", stderr);
            exit(EXIT_FAILURE);
        }
        done += (size_t)count;
    }
}

/*
 * Reverse, in place, the order of the BLOCK_SIZE-sized blocks of `fd` that lie
 * in the byte range [head, tail). Bytes inside each block keep their order.
 *
 * fd:   descriptor open for both reading and writing.
 * head: offset of the first block in the range.
 * tail: offset one past the last block; expected to be `head` plus a multiple
 *       of BLOCK_SIZE. It may lie beyond end of file, in which case the
 *       missing bytes of the final block are treated as zeros and the file
 *       grows up to `tail` when that block's partner is written there.
 *
 * Any I/O error terminates the process: the file is being modified
 * destructively, so carrying on after a failed transfer would corrupt it
 * without anyone noticing.
 */
static void reverse(int fd, off_t head, off_t tail) {
    static char buffer1[BLOCK_SIZE];
    static char buffer2[BLOCK_SIZE];

    tail -= BLOCK_SIZE;
    /* Swap mirrored blocks, walking inward from both ends. When the number of
     * blocks is odd the middle block is its own mirror and is left alone. */
    while (head < tail) {
        read_block_or_die(fd, buffer1, head);
        read_block_or_die(fd, buffer2, tail);
        write_block_or_die(fd, buffer2, head);
        write_block_or_die(fd, buffer1, tail);
        head += BLOCK_SIZE;
        tail -= BLOCK_SIZE;
    }
}
/*
 * Write all `size` bytes of `buffer` to `fd`, looping over partial writes.
 *
 * A write interrupted by a signal (EINTR) is retried. Any other failure, or a
 * write that makes no progress, prints a diagnostic and exits with
 * EXIT_FAILURE. The check is an explicit `if` rather than an assert() so that
 * it is still performed when the program is built with NDEBUG.
 */
static void do_write(int fd, const char *buffer, size_t size) {
    size_t written = 0;
    while (written < size) {
        ssize_t count = write(fd, buffer + written, size - written);
        if (count < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("file_drain: write");
            exit(EXIT_FAILURE);
        }
        if (count == 0) {
            /* Not an error per se, but looping on it would never terminate. */
            fputs("file_drain: write made no progress\n", stderr);
            exit(EXIT_FAILURE);
        }
        written += (size_t)count;
    }
}
/*
 * Stream a block-reversed file to standard output, shrinking it as it goes.
 *
 * Blocks are consumed from the end of the file towards the start: each block
 * is read, written to stdout, and then cut off with ftruncate() so its disk
 * space is released.
 *
 * fd:   descriptor open for reading and writing.
 * tail: offset one past the last block (a multiple of BLOCK_SIZE).
 * size: number of payload bytes still to emit; the final block emitted
 *       contributes only as many bytes as remain.
 *
 * A failed or short read is fatal, since emitting bytes that were never read
 * and then truncating the block would lose data. A failed ftruncate() is
 * reported once and otherwise tolerated: the data has already been emitted
 * correctly and only the space saving is lost.
 */
static void output(int fd, off_t tail, size_t size) {
    static char buffer[BLOCK_SIZE];
    int truncate_warned = 0;

    tail -= BLOCK_SIZE;
    while (size) {
        size_t to_write = size > (size_t)BLOCK_SIZE ? (size_t)BLOCK_SIZE : size;

        /* Only the bytes that will actually be emitted need to be read. */
        size_t filled = 0;
        while (filled < to_write) {
            ssize_t count = pread(fd, buffer + filled, to_write - filled,
                                  tail + (off_t)filled);
            if (count < 0) {
                if (errno == EINTR) {
                    continue;
                }
                perror("file_drain: pread");
                exit(EXIT_FAILURE);
            }
            if (count == 0) {
                fputs("file_drain: unexpected end of file\n", stderr);
                exit(EXIT_FAILURE);
            }
            filled += (size_t)count;
        }

        do_write(STDOUT_FILENO, buffer, to_write);

        /* Drop the block that was just emitted. */
        if (ftruncate(fd, tail) != 0 && !truncate_warned) {
            perror("file_drain: ftruncate");
            truncate_warned = 1;
        }

        tail -= BLOCK_SIZE;
        size -= to_write;
    }
}
int main(int argc, char **argv) { | |
assert(argc == 2); | |
int fd = open(*++argv, O_RDWR); | |
assert(fd >= 0); | |
off_t size = lseek(fd, 0, SEEK_END); | |
assert(size >= 0); | |
off_t rounded_size = lseek(fd, (size + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, SEEK_SET); | |
assert(rounded_size >= 0); | |
reverse(fd, 0, rounded_size); | |
output(fd, rounded_size, size); | |
unlink(*argv); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment