Last active
December 23, 2015 02:39
-
-
Save sirupsen/6568256 to your computer and use it in GitHub Desktop.
Memory mapped minimum range segment tree.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* This is a simple example of using memory mapped I/O with a data structure. | |
* The data structure used is a minimum query segment tree. This data structure | |
* can answer queries of which value in some interval from i..j is the minimum | |
* in O(log(n)) time. Updates are also O(log(n)). | |
* | |
* The data structure is persisted to the file passed as the first argument to | |
* the compiled program. Memory mapped I/O is nice because: | |
* | |
* 1. You get the speed and convenience of accessing memory. | |
* 2. Changes to the memory pages accessed are written to disk at the kernel's convinience. | |
* This is specified with the SHARED flag. | |
* 3. You leverage page caching which means your file cache resides in kernel-handled memory. | |
* thus a process restart won't kill your cache. | |
* 4. There is not duplication of memory. Normally when you read from a file, you copy the contents | |
* into user-land. If you are not interested in modifying the file, but only avoid duplication, | |
* you can pass the ANONYMOUS flag instead of shared to the mmap(2) call. | |
* | |
* The file consists of an array of numbers. These numbers represent the value | |
* of the node at some index. They are initialized to INT_MAX, which makes the | |
* implementation of the segment tree simple. Each one of those entries takes up | |
* exactly sizeof(int) bytes. Thus it can be handled exactly like an ordinary | |
* array, but instead of having everything in memory, only the parts that are | |
* used are put into physical memory. This part is left to the kernel. | |
* | |
* This is further allowed to be optimized by the kernel by a call to | |
* madvise(3), which tells the kernel the access pattern will be random and | |
* therefore the kernel should fetch only a small amount of data on each read. | |
* | |
* This is nice, because you get to keep a large data structure on disk and | |
* frequentely accessed queries will be very fast because they'll be hot in | |
* physical memory. Furthermore, the implementation becomes very simple because | |
* virtual memory allows the program to make it act on it just like a normal | |
* array. | |
* | |
*/ | |
#include <stdio.h> | |
#include <errno.h> | |
#include <sys/mman.h> | |
#include <stdlib.h> | |
#include <limits.h> | |
#include <assert.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
#include <time.h> | |
// Struct for the segment tree | |
typedef struct _segment_tree { | |
size_t len; | |
size_t size; | |
int* tree; | |
int fd; | |
char* path; | |
char* error; | |
} segment_tree_t; | |
static void _segment_tree_mmap_tree(segment_tree_t*); | |
static int min(int, int); | |
// Initializes the segment tree. The most important part here is the call to | |
// _segment_tree_mmap_tree, which sets up the memory mapped I/O for the tree. | |
void | |
segment_tree_initialize(segment_tree_t* tree, char* path, size_t len) { | |
tree->len = len; // 1..len | |
tree->size = len * 3 * sizeof(int); // enough room for tree, len in bytes | |
tree->fd = -1; | |
tree->tree = NULL; | |
tree->error = NULL; | |
tree->path = path; | |
_segment_tree_mmap_tree(tree); | |
} | |
// mmap(2)s the file to the "tree" property on the struct. If the file doesn't | |
// exist, it is created with the length passed to initialize. | |
static void | |
_segment_tree_mmap_tree(segment_tree_t* tree) | |
{ | |
int new_file = 0; | |
tree->fd = open(tree->path, O_RDWR); | |
if(tree->fd < 0 && errno == ENOENT) { | |
new_file = 1; | |
tree->fd = open(tree->path, O_CREAT | O_RDWR, 0666); | |
if(tree->fd < 0) { | |
tree->error = "Error creating file"; | |
return; | |
} | |
// Truncate the file to a the passed size | |
int err = ftruncate(tree->fd, tree->size); | |
if (err < 0) { | |
tree->error = "Error truncating file"; | |
return; | |
} | |
} | |
if(tree->fd < 0) { | |
tree->error = "Error opening file"; | |
return; | |
} | |
// Do the actual mmap(2)'ing of the file. | |
tree->tree = mmap(NULL, tree->size, PROT_WRITE | PROT_READ, MAP_SHARED, tree->fd, 0); | |
if(tree->tree == NULL) { | |
tree->error = "Error mmapp'ing file"; | |
} | |
// Brief Kernel about how the memory will be used | |
int err = madvise(tree->tree, tree->size, MADV_RANDOM); | |
if (err < 0) { | |
tree->error = "Error calling madvise"; | |
} | |
if(new_file) { | |
// Set all nodes in the tree to INT_MAX | |
for(size_t i = 0; i < (tree->size / sizeof(int)); i++) { | |
tree->tree[i] = INT_MAX; | |
} | |
} | |
} | |
static void | |
_segment_tree_update(segment_tree_t* tree, size_t pos, int range_start, int range_end, int key, int value) { | |
if (key >= range_start && key <= range_end) { | |
if(tree->tree[pos] > value) { | |
tree->tree[pos] = value; | |
} | |
if (range_start != range_end) { | |
int middle = (range_start + range_end) / 2; | |
_segment_tree_update(tree, pos * 2, range_start, middle, key, value); | |
_segment_tree_update(tree, pos * 2 + 1, middle + 1, range_end, key, value); | |
} | |
} | |
} | |
// Update the value of a node in the tree. | |
void | |
segment_tree_update(segment_tree_t* tree, size_t key, size_t value) { | |
_segment_tree_update(tree, 1, 1, tree->len, key, value); | |
} | |
static int | |
_segment_tree_query(segment_tree_t* tree, size_t pos, size_t range_start, size_t range_end, size_t query_start, size_t query_end) { | |
if (range_start >= query_start && range_end <= query_end) { | |
return tree->tree[pos]; | |
} else if (range_end < query_start || range_start > query_end) { | |
return INT_MAX; | |
} else { | |
int middle = (range_start + range_end) / 2; | |
return min( | |
_segment_tree_query(tree, pos * 2, range_start, middle, query_start, query_end), | |
_segment_tree_query(tree, pos * 2 + 1, middle + 1, range_end, query_start, query_end) | |
); | |
} | |
} | |
// Query for the minimum element in some range from query_start..query_end. | |
int | |
segment_tree_query(segment_tree_t* tree, size_t query_start, size_t query_end) { | |
return _segment_tree_query(tree, 1, 1, tree->len, query_start, query_end); | |
} | |
void | |
segment_tree_free(segment_tree_t* tree) { | |
msync(tree->tree, tree->size, MS_SYNC); | |
close(tree->fd); | |
free(tree->error); | |
free(tree); | |
} | |
static int | |
min(int a, int b) | |
{ | |
return a < b ? a : b; | |
} | |
int main(int argc, char **argv) | |
{ | |
if(argc != 2) { | |
printf("Must pass path to the file to store the tree in."); | |
return 1; | |
} | |
int n = 100000; | |
segment_tree_t* st = malloc(sizeof(segment_tree_t)); | |
segment_tree_initialize(st, argv[1], n); | |
if (st->error != NULL) { | |
printf("Failed to create segment tree: %s\n", st->error); | |
return 1; | |
} | |
// Make sure it's sane | |
segment_tree_update(st, 4, 2); | |
segment_tree_update(st, 7, 1); | |
segment_tree_update(st, 20, 0); | |
segment_tree_update(st, 9, 3); | |
assert(segment_tree_query(st, 15, 15) == INT_MAX); | |
assert(segment_tree_query(st, 1, 4) == 2); | |
assert(segment_tree_query(st, 9, 15) == 3); | |
assert(segment_tree_query(st, 1, 7) == 1); | |
assert(segment_tree_query(st, 1, 9) == 1); | |
assert(segment_tree_query(st, 4, 4) == 2); | |
assert(segment_tree_query(st, 20, 23) == 0); | |
srand(time(NULL)); | |
for(int i = 0; i < 1000000; i++) { | |
int start = rand() % n; | |
int end = start + (rand() % (n - start)); | |
segment_tree_update(st, start, end); | |
} | |
for(int i = 0; i < 1000000; i++) { | |
int start = rand() % n; | |
int end = start + (rand() % (n - start)); | |
segment_tree_query(st, start, end); | |
} | |
// Commit to file if the OS hasn't already done so, and free the object and | |
// close the file descriptor. | |
segment_tree_free(st); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment