Skip to content

Instantly share code, notes, and snippets.

@bos
Created March 14, 2012 22:14
Show Gist options
  • Save bos/2039952 to your computer and use it in GitHub Desktop.
Save bos/2039952 to your computer and use it in GitHub Desktop.
incremental parsing for mercurial index
# HG changeset patch
# User Bryan O'Sullivan <[email protected]>
# Date 1331763158 25200
# Branch stable
# Node ID 268ae4d69a012f47cb1ef2bf65a8a96f561ce672
# Parent ca5cc2976574d820dad5774afd8c7b3c39ec11cd
[WIP] lazy index file parser
Only parse entries in a revlog index file when they are actually needed.
This makes a huge difference to performance on large revlogs,
especially when only a few entries are needed.
TODO:
* Handle writes
* Modifications (e.g. strip)
* Support inline data
diff -r ca5cc2976574 -r 268ae4d69a01 mercurial/index.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/index.c Wed Mar 14 15:12:38 2012 -0700
@@ -0,0 +1,225 @@
+#include <Python.h>
+#include <stdio.h>
+
+typedef struct {
+ PyObject_HEAD
+ /* Type-specific fields go here. */
+ PyObject *data_obj;
+ const char *data;
+ Py_ssize_t nbytes;
+ Py_ssize_t length; /* number of elements */
+ int inlined;
+} indexObject;
+
+static Py_ssize_t index_length(indexObject *self)
+{
+ return self->length;
+}
+
+
+static PyObject *index_get(indexObject *self, Py_ssize_t pos)
+{
+ static const char nullstr[20];
+ static PyObject *nullid;
+ uint32_t decode[8]; /* to enforce alignment with inline data */
+ uint64_t offset_flags;
+ int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
+ const char *c_node_id;
+ const char *data;
+
+ if (pos >= self->length) {
+ PyErr_SetString(PyExc_IndexError, "revlog index out of range");
+ return NULL;
+ }
+
+ if (pos == self->length - 1) {
+ if (nullid == NULL)
+ nullid = Py_BuildValue("Liiiiiis#", (uint64_t)0, 0, 0,
+ -1, -1, -1, -1, nullstr, 20);
+ Py_INCREF(nullid);
+ return nullid;
+ }
+
+ if (self->inlined) {
+ PyErr_SetString(PyExc_NotImplementedError,
+ "inlined data not supported yet");
+ return NULL;
+ } else
+ data = self->data + pos * 64;
+
+ memcpy(decode, data, 8 * sizeof(uint32_t));
+
+ offset_flags = ntohl(decode[1]);
+ if (pos == 0) /* mask out version number for the first entry */
+ offset_flags &= 0xFFFF;
+ else {
+ uint32_t offset_high = ntohl(decode[0]);
+ offset_flags |= ((uint64_t)offset_high) << 32;
+ }
+
+ comp_len = ntohl(decode[2]);
+ uncomp_len = ntohl(decode[3]);
+ base_rev = ntohl(decode[4]);
+ link_rev = ntohl(decode[5]);
+ parent_1 = ntohl(decode[6]);
+ parent_2 = ntohl(decode[7]);
+ c_node_id = data + 32;
+
+ return Py_BuildValue("Liiiiiis#", offset_flags, comp_len,
+ uncomp_len, base_rev, link_rev,
+ parent_1, parent_2, c_node_id, 20);
+}
+
+static int inline_init(indexObject *self)
+{
+ const char *data = self->data;
+ const char *end = data + self->nbytes;
+ const long hdrsize = 64;
+ long incr = hdrsize;
+ Py_ssize_t len = 0;
+
+ while (data + hdrsize <= end) {
+ uint32_t comp_len;
+ const char *old_data;
+ /* 3rd element of header is length of compressed inline data */
+ memcpy(&comp_len, data + 8, sizeof(uint32_t));
+ incr = hdrsize + ntohl(comp_len);
+ if (incr < hdrsize)
+ break;
+ len++;
+ old_data = data;
+ data += incr;
+ if (data <= old_data)
+ break;
+ }
+
+ if (data != end && data + hdrsize != end) {
+ if (!PyErr_Occurred())
+ PyErr_SetString(PyExc_ValueError, "corrupt index file");
+ return -1;
+ }
+
+ self->length = len + 1;
+ return 0;
+}
+
+static int
+index_init(indexObject *self, PyObject *args, PyObject *kwds)
+{
+ const char *data;
+ int size;
+ PyObject *inlined_obj;
+
+ if (!PyArg_ParseTuple(args, "s#O", &data, &size, &inlined_obj))
+ return -1;
+
+ self->inlined = inlined_obj && PyObject_IsTrue(inlined_obj);
+
+ self->data = data;
+ self->data_obj = PyTuple_GET_ITEM(args, 0);
+ self->nbytes = size;
+
+ if (self->inlined) {
+ if (inline_init(self))
+ goto bail;
+ } else {
+ if (size % 64) {
+ PyErr_SetString(PyExc_ValueError, "corrupt index file");
+ goto bail;
+ }
+ self->length = size / 64 + 1;
+ }
+
+ Py_INCREF(self->data_obj);
+
+ return 0;
+bail:
+ return -1;
+}
+
+static void
+index_dealloc(indexObject *self)
+{
+ Py_DECREF(self->data_obj);
+ PyObject_Del(self);
+}
+
+static PySequenceMethods index_sequence_methods = {
+ (lenfunc) index_length, /* sq_length */
+ 0, /* sq_concat */
+ 0, /* sq_repeat */
+ (ssizeargfunc)index_get, /* sq_item */
+};
+
+static PyMethodDef index_methods[] = {
+/*
+ {"length", (PyCFunction)index_len, METH_NOARGS,
+ "Length of the index"},
+ {"get", (PyCFunction)index_get, METH_VARARGS,
+ "get an index item"},
+*/
+ {NULL} /* Sentinel */
+};
+
+static PyTypeObject indexType = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "index.index", /*tp_name*/
+ sizeof(indexObject), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ (destructor)index_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash */
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ "index objects", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ index_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)index_init, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+};
+
+static PyMethodDef module_methods[] = {
+ {NULL} /* Sentinel */
+};
+
+PyMODINIT_FUNC
+initindex(void)
+{
+ PyObject* m;
+
+ indexType.tp_new = PyType_GenericNew;
+ indexType.tp_as_sequence = &index_sequence_methods;
+ if (PyType_Ready(&indexType) < 0)
+ return;
+
+ m = Py_InitModule3("index", module_methods,
+ "Example module that creates an extension type.");
+
+ Py_INCREF(&indexType);
+ PyModule_AddObject(m, "index", (PyObject *)&indexType);
+}
diff -r ca5cc2976574 -r 268ae4d69a01 mercurial/revlog.py
--- a/mercurial/revlog.py Tue Mar 13 16:28:08 2012 -0500
+++ b/mercurial/revlog.py Wed Mar 14 15:12:38 2012 -0700
@@ -14,7 +14,7 @@
# import stuff from node for others to import from revlog
from node import bin, hex, nullid, nullrev
from i18n import _
-import ancestor, mdiff, parsers, error, util, dagutil
+import ancestor, mdiff, index, parsers, error, util, dagutil
import struct, zlib, errno
_pack = struct.pack
@@ -173,8 +173,10 @@
def parseindex(self, data, inline):
# call the C implementation to parse the index data
- index, cache = parsers.parse_index2(data, inline)
- return index, None, cache
+ if inline:
+ idx, cache = parsers.parse_index2(data, inline)
+ return idx, None, cache
+ return index.index(data, inline), None, None
def packentry(self, entry, node, version, rev):
p = _pack(indexformatng, *entry)
diff -r ca5cc2976574 -r 268ae4d69a01 setup.py
--- a/setup.py Tue Mar 13 16:28:08 2012 -0500
+++ b/setup.py Wed Mar 14 15:12:38 2012 -0700
@@ -389,6 +389,7 @@
Extension('mercurial.base85', ['mercurial/base85.c']),
Extension('mercurial.bdiff', ['mercurial/bdiff.c']),
Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']),
+ Extension('mercurial.index', ['mercurial/index.c']),
Extension('mercurial.mpatch', ['mercurial/mpatch.c']),
Extension('mercurial.parsers', ['mercurial/parsers.c']),
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment