Created
March 14, 2012 22:14
-
-
Save bos/2039952 to your computer and use it in GitHub Desktop.
incremental parsing for mercurial index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HG changeset patch | |
# User Bryan O'Sullivan <[email protected]> | |
# Date 1331763158 25200 | |
# Branch stable | |
# Node ID 268ae4d69a012f47cb1ef2bf65a8a96f561ce672 | |
# Parent ca5cc2976574d820dad5774afd8c7b3c39ec11cd | |
[WIP] lazy index file parser | |
Only parse entries in a revlog index file when they are actually needed. | |
This makes a huge difference to performance on large revlogs, | |
especially when only a few entries are needed. | |
TODO: | |
* Handle writes | |
* Modifications (e.g. strip) | |
* Support inline data | |
diff -r ca5cc2976574 -r 268ae4d69a01 mercurial/index.c | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 | |
+++ b/mercurial/index.c Wed Mar 14 15:12:38 2012 -0700 | |
@@ -0,0 +1,225 @@ | |
+#include <Python.h> | |
+#include <stdio.h> | |
+ | |
+typedef struct { | |
+ PyObject_HEAD | |
+ /* Type-specific fields go here. */ | |
+ PyObject *data_obj; | |
+ const char *data; | |
+ Py_ssize_t nbytes; | |
+ Py_ssize_t length; /* number of elements */ | |
+ int inlined; | |
+} indexObject; | |
+ | |
+static Py_ssize_t index_length(indexObject *self) | |
+{ | |
+ return self->length; | |
+} | |
+ | |
+ | |
+static PyObject *index_get(indexObject *self, Py_ssize_t pos) | |
+{ | |
+ static const char nullstr[20]; | |
+ static PyObject *nullid; | |
+ uint32_t decode[8]; /* to enforce alignment with inline data */ | |
+ uint64_t offset_flags; | |
+ int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2; | |
+ const char *c_node_id; | |
+ const char *data; | |
+ | |
+ if (pos >= self->length) { | |
+ PyErr_SetString(PyExc_IndexError, "revlog index out of range"); | |
+ return NULL; | |
+ } | |
+ | |
+ if (pos == self->length - 1) { | |
+ if (nullid == NULL) | |
+ nullid = Py_BuildValue("Liiiiiis#", (uint64_t)0, 0, 0, | |
+ -1, -1, -1, -1, nullstr, 20); | |
+ Py_INCREF(nullid); | |
+ return nullid; | |
+ } | |
+ | |
+ if (self->inlined) { | |
+ PyErr_SetString(PyExc_NotImplementedError, | |
+ "inlined data not supported yet"); | |
+ return NULL; | |
+ } else | |
+ data = self->data + pos * 64; | |
+ | |
+ memcpy(decode, data, 8 * sizeof(uint32_t)); | |
+ | |
+ offset_flags = ntohl(decode[1]); | |
+ if (pos == 0) /* mask out version number for the first entry */ | |
+ offset_flags &= 0xFFFF; | |
+ else { | |
+ uint32_t offset_high = ntohl(decode[0]); | |
+ offset_flags |= ((uint64_t)offset_high) << 32; | |
+ } | |
+ | |
+ comp_len = ntohl(decode[2]); | |
+ uncomp_len = ntohl(decode[3]); | |
+ base_rev = ntohl(decode[4]); | |
+ link_rev = ntohl(decode[5]); | |
+ parent_1 = ntohl(decode[6]); | |
+ parent_2 = ntohl(decode[7]); | |
+ c_node_id = data + 32; | |
+ | |
+ return Py_BuildValue("Liiiiiis#", offset_flags, comp_len, | |
+ uncomp_len, base_rev, link_rev, | |
+ parent_1, parent_2, c_node_id, 20); | |
+} | |
+ | |
+static int inline_init(indexObject *self) | |
+{ | |
+ const char *data = self->data; | |
+ const char *end = data + self->nbytes; | |
+ const long hdrsize = 64; | |
+ long incr = hdrsize; | |
+ Py_ssize_t len = 0; | |
+ | |
+ while (data + hdrsize <= end) { | |
+ uint32_t comp_len; | |
+ const char *old_data; | |
+ /* 3rd element of header is length of compressed inline data */ | |
+ memcpy(&comp_len, data + 8, sizeof(uint32_t)); | |
+ incr = hdrsize + ntohl(comp_len); | |
+ if (incr < hdrsize) | |
+ break; | |
+ len++; | |
+ old_data = data; | |
+ data += incr; | |
+ if (data <= old_data) | |
+ break; | |
+ } | |
+ | |
+ if (data != end && data + hdrsize != end) { | |
+ if (!PyErr_Occurred()) | |
+ PyErr_SetString(PyExc_ValueError, "corrupt index file"); | |
+ return -1; | |
+ } | |
+ | |
+ self->length = len + 1; | |
+ return 0; | |
+} | |
+ | |
+static int | |
+index_init(indexObject *self, PyObject *args, PyObject *kwds) | |
+{ | |
+ const char *data; | |
+ int size; | |
+ PyObject *inlined_obj; | |
+ | |
+ if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj)) | |
+ return -1; | |
+ | |
+ self->inlined = inlined_obj && PyObject_IsTrue(inlined_obj); | |
+ | |
+ self->data = data; | |
+ self->data_obj = PyTuple_GET_ITEM(args, 0); | |
+ self->nbytes = size; | |
+ | |
+ if (self->inlined) { | |
+ if (inline_init(self)) | |
+ goto bail; | |
+ } else { | |
+ if (size % 64) { | |
+ PyErr_SetString(PyExc_ValueError, "corrupt index file"); | |
+ goto bail; | |
+ } | |
+ self->length = size / 64 + 1; | |
+ } | |
+ | |
+ Py_INCREF(self->data_obj); | |
+ | |
+ return 0; | |
+bail: | |
+ return -1; | |
+} | |
+ | |
+static void | |
+index_dealloc(indexObject *self) | |
+{ | |
+ Py_DECREF(self->data_obj); | |
+ PyObject_Del(self); | |
+} | |
+ | |
+static PySequenceMethods index_sequence_methods = { | |
+ (lenfunc) index_length, /* sq_length */ | |
+ 0, /* sq_concat */ | |
+ 0, /* sq_repeat */ | |
+ (ssizeargfunc)index_get, /* sq_item */ | |
+}; | |
+ | |
+static PyMethodDef index_methods[] = { | |
+/* | |
+ {"length", (PyCFunction)index_len, METH_NOARGS, | |
+ "Length of the index"}, | |
+ {"get", (PyCFunction)index_get, METH_VARARGS, | |
+ "get an index item"}, | |
+*/ | |
+ {NULL} /* Sentinel */ | |
+}; | |
+ | |
+static PyTypeObject indexType = { | |
+ PyObject_HEAD_INIT(NULL) | |
+ 0, /*ob_size*/ | |
+ "index.index", /*tp_name*/ | |
+ sizeof(indexObject), /*tp_basicsize*/ | |
+ 0, /*tp_itemsize*/ | |
+ (destructor)index_dealloc, /*tp_dealloc*/ | |
+ 0, /*tp_print*/ | |
+ 0, /*tp_getattr*/ | |
+ 0, /*tp_setattr*/ | |
+ 0, /*tp_compare*/ | |
+ 0, /*tp_repr*/ | |
+ 0, /*tp_as_number*/ | |
+ 0, /*tp_as_sequence*/ | |
+ 0, /*tp_as_mapping*/ | |
+ 0, /*tp_hash */ | |
+ 0, /*tp_call*/ | |
+ 0, /*tp_str*/ | |
+ 0, /*tp_getattro*/ | |
+ 0, /*tp_setattro*/ | |
+ 0, /*tp_as_buffer*/ | |
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/ | |
+ "index objects", /* tp_doc */ | |
+ 0, /* tp_traverse */ | |
+ 0, /* tp_clear */ | |
+ 0, /* tp_richcompare */ | |
+ 0, /* tp_weaklistoffset */ | |
+ 0, /* tp_iter */ | |
+ 0, /* tp_iternext */ | |
+ index_methods, /* tp_methods */ | |
+ 0, /* tp_members */ | |
+ 0, /* tp_getset */ | |
+ 0, /* tp_base */ | |
+ 0, /* tp_dict */ | |
+ 0, /* tp_descr_get */ | |
+ 0, /* tp_descr_set */ | |
+ 0, /* tp_dictoffset */ | |
+ (initproc)index_init, /* tp_init */ | |
+ 0, /* tp_alloc */ | |
+ 0, /* tp_new */ | |
+}; | |
+ | |
+static PyMethodDef module_methods[] = { | |
+ {NULL} /* Sentinel */ | |
+}; | |
+ | |
+PyMODINIT_FUNC | |
+initindex(void) | |
+{ | |
+ PyObject* m; | |
+ | |
+ indexType.tp_new = PyType_GenericNew; | |
+ indexType.tp_as_sequence = &index_sequence_methods; | |
+ if (PyType_Ready(&indexType) < 0) | |
+ return; | |
+ | |
+ m = Py_InitModule3("index", module_methods, | |
+ "Example module that creates an extension type."); | |
+ | |
+ Py_INCREF(&indexType); | |
+ PyModule_AddObject(m, "index", (PyObject *)&indexType); | |
+} | |
diff -r ca5cc2976574 -r 268ae4d69a01 mercurial/revlog.py | |
--- a/mercurial/revlog.py Tue Mar 13 16:28:08 2012 -0500 | |
+++ b/mercurial/revlog.py Wed Mar 14 15:12:38 2012 -0700 | |
@@ -14,7 +14,7 @@ | |
# import stuff from node for others to import from revlog | |
from node import bin, hex, nullid, nullrev | |
from i18n import _ | |
-import ancestor, mdiff, parsers, error, util, dagutil | |
+import ancestor, mdiff, index, parsers, error, util, dagutil | |
import struct, zlib, errno | |
_pack = struct.pack | |
@@ -173,8 +173,10 @@ | |
def parseindex(self, data, inline): | |
# call the C implementation to parse the index data | |
- index, cache = parsers.parse_index2(data, inline) | |
- return index, None, cache | |
+ if inline: | |
+ idx, cache = parsers.parse_index2(data, inline) | |
+ return idx, None, cache | |
+ return index.index(data, inline), None, None | |
def packentry(self, entry, node, version, rev): | |
p = _pack(indexformatng, *entry) | |
diff -r ca5cc2976574 -r 268ae4d69a01 setup.py | |
--- a/setup.py Tue Mar 13 16:28:08 2012 -0500 | |
+++ b/setup.py Wed Mar 14 15:12:38 2012 -0700 | |
@@ -389,6 +389,7 @@ | |
Extension('mercurial.base85', ['mercurial/base85.c']), | |
Extension('mercurial.bdiff', ['mercurial/bdiff.c']), | |
Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']), | |
+ Extension('mercurial.index', ['mercurial/index.c']), | |
Extension('mercurial.mpatch', ['mercurial/mpatch.c']), | |
Extension('mercurial.parsers', ['mercurial/parsers.c']), | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment