Skip to content

Instantly share code, notes, and snippets.

@kanru
Created December 10, 2009 06:25
Show Gist options
  • Save kanru/253164 to your computer and use it in GitHub Desktop.
Save kanru/253164 to your computer and use it in GitHub Desktop.
--- flint_database.cc 2009-12-08 13:34:24.790284881 +0800
+++ flint_database.cc 2009-12-10 14:22:14.493653956 +0800
@@ -1188,7 +1188,7 @@
termlist.next();
while (!termlist.at_end()) {
- string tname = termlist.get_termname();
+ string tname = termlist.get_termname();
position_table.delete_positionlist(did, tname);
termcount wdf = termlist.get_wdf();
@@ -1278,20 +1278,50 @@
}
if (!modifying || document.internal->terms_modified()) {
- // FIXME - in the case where there is overlap between the new
- // termlist and the old termlist, it would be better to compare the
- // two lists, and make the minimum set of modifications required.
- // This would lead to smaller changesets for replication, and
- // probably be faster overall.
-
- // First, add entries to remove the postings in the underlying record.
Xapian::Internal::RefCntPtr<const FlintWritableDatabase> ptrtothis(this);
FlintTermList termlist(ptrtothis, did);
+ Xapian::TermIterator term = document.termlist_begin();
+ Xapian::TermIterator term_end = document.termlist_end();
+ flint_doclen_t new_doclen = termlist.get_doclength();
+ string old_tname, new_tname;
+
+ total_length -= new_doclen;
+
+ termlist.next();
+ while (true) {
+ bool identical = false;
+ int cmp;
+ if (termlist.at_end() && term == term_end)
+ break;
+ if (!termlist.at_end() && term != term_end) {
+ old_tname = termlist.get_termname();
+ new_tname = *term;
+ cmp = old_tname.compare(new_tname);
+
+ // Check postlist to see whether they are identical
+ if (cmp == 0) {
+ int new_count = term.positionlist_count();
+ int old_count = termlist.positionlist_count();
+ if (old_count == new_count) {
+ PositionIterator it = term.positionlist_begin();
+ PositionIterator it_end = term.positionlist_end();
+ PositionIterator old = termlist.positionlist_begin();
+ if (equal(it, it_end, old))
+ identical = true;
+ }
+ }
+ } else if (termlist.at_end()) {
+ cmp = 2;
+ new_tname = *term;
+ } else {
+ cmp = -2;
+ old_tname = termlist.get_termname();
+ }
- termlist.next();
- while (!termlist.at_end()) {
- string tname = termlist.get_termname();
+ if (cmp < 0) {
+ const string& tname = old_tname;
termcount wdf = termlist.get_wdf();
+ new_doclen -= wdf;
map<string, pair<termcount_diff, termcount_diff> >::iterator i;
i = freq_deltas.find(tname);
@@ -1318,58 +1348,62 @@
// Modifying a document we added/modified since the last flush.
k->second = make_pair('D', 0u);
}
-
- termlist.next();
- }
-
- total_length -= termlist.get_doclength();
-
- flint_doclen_t new_doclen = 0;
- Xapian::TermIterator term = document.termlist_begin();
- Xapian::TermIterator term_end = document.termlist_end();
- for ( ; term != term_end; ++term) {
- // Calculate the new document length
+ } else if (!identical) {
+ const string& tname = new_tname;
termcount wdf = term.get_wdf();
- new_doclen += wdf;
-
- string tname = *term;
- if (tname.size() > MAX_SAFE_TERM_LENGTH)
- throw Xapian::InvalidArgumentError("Term too long (> "STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);
- map<string, pair<termcount_diff, termcount_diff> >::iterator i;
- i = freq_deltas.find(tname);
- if (i == freq_deltas.end()) {
- freq_deltas.insert(make_pair(tname, make_pair(1, termcount_diff(wdf))));
- } else {
- ++i->second.first;
- i->second.second += wdf;
- }
+ new_doclen += wdf;
+
+ if (cmp > 0) {
+ if (tname.size() > MAX_SAFE_TERM_LENGTH)
+ throw Xapian::InvalidArgumentError("Term too long (> "STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);
+ map<string, pair<termcount_diff, termcount_diff> >::iterator i;
+ i = freq_deltas.find(tname);
+ if (i == freq_deltas.end()) {
+ freq_deltas.insert(make_pair(tname, make_pair(1, termcount_diff(wdf))));
+ } else {
+ ++i->second.first;
+ i->second.second += wdf;
+ }
+
+ // Add did to tname's postlist
+ map<string, map<docid, pair<char, termcount> > >::iterator j;
+ j = mod_plists.find(tname);
+ if (j == mod_plists.end()) {
+ map<docid, pair<char, termcount> > m;
+ j = mod_plists.insert(make_pair(tname, m)).first;
+ }
+ map<docid, pair<char, termcount> >::iterator k;
+ k = j->second.find(did);
+ if (k != j->second.end()) {
+ Assert(k->second.first == 'D');
+ k->second.first = 'M';
+ k->second.second = wdf;
+ } else {
+ j->second.insert(make_pair(did, make_pair('A', wdf)));
+ }
+ }
+
+ PositionIterator it = term.positionlist_begin();
+ PositionIterator it_end = term.positionlist_end();
+ if (it != it_end) {
+ position_table.set_positionlist(
+ did, tname, it, it_end);
+ } else {
+ position_table.delete_positionlist(did, tname);
+ }
+ }
+ if (termlist.at_end())
+ ++term;
+ else if (term == term_end)
+ termlist.next();
+ else {
+ if (cmp >= 0)
+ ++term;
+ if (cmp <= 0)
+ termlist.next();
+ }
+ }
- // Add did to tname's postlist
- map<string, map<docid, pair<char, termcount> > >::iterator j;
- j = mod_plists.find(tname);
- if (j == mod_plists.end()) {
- map<docid, pair<char, termcount> > m;
- j = mod_plists.insert(make_pair(tname, m)).first;
- }
- map<docid, pair<char, termcount> >::iterator k;
- k = j->second.find(did);
- if (k != j->second.end()) {
- Assert(k->second.first == 'D');
- k->second.first = 'M';
- k->second.second = wdf;
- } else {
- j->second.insert(make_pair(did, make_pair('A', wdf)));
- }
-
- PositionIterator it = term.positionlist_begin();
- PositionIterator it_end = term.positionlist_end();
- if (it != it_end) {
- position_table.set_positionlist(
- did, tname, it, it_end);
- } else {
- position_table.delete_positionlist(did, tname);
- }
- }
LOGLINE(DB, "Calculated doclen for replacement document " << did << " as " << new_doclen);
// Set the termlist
*Before*
13:27:55 kanru@anar:~/abs/xapian-core/src/xapian-build
% time notmuch tag +test id:[email protected]
MOD_PLISTS: 368
notmuch tag +test id:[email protected] 0.05s user 0.03s system 11% cpu 0.673 total
13:28:11 kanru@anar:~/abs/xapian-core/src/xapian-build
% time notmuch tag -test id:[email protected]
MOD_PLISTS: 368
notmuch tag -test id:[email protected] 0.06s user 0.01s system 10% cpu 0.681 total
*After*
13:30:40 kanru@anar:~/abs/xapian-core/src/xapian-build
% time notmuch tag +test id:[email protected]
MOD_PLIST: 1
notmuch tag +test id:[email protected] 0.01s user 0.02s system 6% cpu 0.436 total
13:31:23 kanru@anar:~/abs/xapian-core/src/xapian-build
% time notmuch tag -test id:[email protected]
MOD_PLIST: 1
notmuch tag -test id:[email protected] 0.01s user 0.01s system 5% cpu 0.383 total
14:28:28 kanru@anar:~
% time notmuch tag +test tag:notmuch
notmuch tag +test tag:notmuch 1.71s user 0.03s system 65% cpu 2.632 total
14:30:34 kanru@anar:~
% time notmuch tag -test tag:notmuch
notmuch tag -test tag:notmuch 1.61s user 0.02s system 73% cpu 2.204 total
14:30:47 kanru@anar:~
% notmuch count tag:notmuch
682
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment