Experimenting with hashes for years in compound shard keys
/**
 *
 * This script is to experiment with a compound shard key.
 *
 * Goals:
 * 1. the hash (h) is always the same for the same date (down to the day level)
 * 2. documents with the exact same date (day/month/year) go on the same shard, but different days go to different shards
 * 3. works well with 100m+ documents
 * 4. small overhead relative to a simple (non-compound) date shard key
 * 5. still supports targeted queries (as long as the "h" field is included in the query by the application)
 *
 * Launch a cluster with 3 shards (no RS for simplicity)
 * mlaunch --single --sharded 3
 *
 * Set the smallest chunksize to help visualize our sharding
 * mongo config --eval 'db.settings.save( { _id:"chunksize", value: 1 } )'
 *
 * Connect via the mongos (on 27017)
 * mongo
 */
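// Sketch of a "targeted" read per goal 5 (illustrative, not part of the original
// experiment): the application recomputes h from the date it queries for, so the
// mongos can route to the chunk(s) owning that { h, d } range instead of
// broadcasting to every shard. makeHash() is defined further down in this script,
// so the example is left commented out here.
// var day = ISODate("2017-12-18T00:00:00Z");
// db.shardedColl.find({
//     h: makeHash(day.toString()),
//     d: { $gte: day, $lt: new Date(day.getTime() + 24 * 3600 * 1000) }
// });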
use test;
db.shardedColl.insertMany([
    { h: 47, d: ISODate("2017-12-18T14:12:11"), payload: "just a test" },
    { h: 47, d: ISODate("2017-12-18T14:12:19"), payload: "another test" },
    { h: 47, d: ISODate("2017-12-18T14:12:32"), payload: "third test" }
]);
var doc = db.shardedColl.findOne();
Object.bsonsize(doc)
// 69
db.shardedColl2.insert({ h: "47-2017-12-18-14:12:11", d: ISODate("2017-12-18T14:12:11"), payload: "just a test" });
var doc2 = db.shardedColl2.findOne();
Object.bsonsize(doc2);
// 88
db.shardedColl3.insertMany([
    { h: 47, d: "2017-12-18-14:12:11", payload: "just a test" }
]);
var doc3 = db.shardedColl3.findOne();
Object.bsonsize(doc3);
// 85
// save one more byte
db.shardedColl3.insertMany([
    { h: '5f', d: "2017-12-18-14:16:41", payload: "just a test" }
]);
var doc4 = db.shardedColl3.findOne({h: '5f'});
Object.bsonsize(doc4);
// 84
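// Rough BSON arithmetic behind the sizes above (my reading, not from the original
// gist): a double value is 8 bytes, while a 2-char string like "5f" is
// 4 (length) + 2 (chars) + 1 (NUL) = 7 bytes, hence 85 -> 84 when h becomes a
// short string. The jump from 69 to 85 comes from storing d as a 19-char string
// (4 + 19 + 1 = 24 bytes) instead of an 8-byte ISODate.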
use test;
var Base64 = {
    characters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
    encode: function( string )
    {
        var characters = Base64.characters;
        var result = '';
        var i = 0;
        do {
            var a = string.charCodeAt(i++);
            var b = string.charCodeAt(i++);
            var c = string.charCodeAt(i++);
            a = a ? a : 0;
            b = b ? b : 0;
            c = c ? c : 0;
            var b1 = ( a >> 2 ) & 0x3F;
            var b2 = ( ( a & 0x3 ) << 4 ) | ( ( b >> 4 ) & 0xF );
            var b3 = ( ( b & 0xF ) << 2 ) | ( ( c >> 6 ) & 0x3 );
            var b4 = c & 0x3F;
            if( ! b ) {
                b3 = b4 = 64;
            } else if( ! c ) {
                b4 = 64;
            }
            result += Base64.characters.charAt( b1 ) + Base64.characters.charAt( b2 ) + Base64.characters.charAt( b3 ) + Base64.characters.charAt( b4 );
        } while ( i < string.length );
        return result;
    }
};
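// Quick sanity check (added for illustration): this minimal encoder matches
// standard base64 for plain ASCII input.
Base64.encode("2017"); // "MjAxNw=="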
hashCode = function(str) {
    var hash = 0;
    if (str.length == 0) {
        return hash;
    }
    for (var i = 0; i < str.length; i++) {
        var chr = str.charCodeAt(i);
        hash = ((hash << 5) - hash) + chr;
        hash = hash & hash; // Convert to 32bit integer
    }
    return hash;
}
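// Note (added): hashCode() is a classic 32-bit string hash kept here for
// comparison; the experiment below actually uses makeHash(), which takes the
// first two base64 characters of the date string instead.
hashCode("2017-12-18"); // returns a signed 32-bit integer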
makeHash = function(dateStr) {
    // use the first two base64 characters of the date string as a short hash
    var base64Full = Base64.encode(dateStr);
    return base64Full.substring(0, 2);
}
// test our base64-based hash on a sample date string
makeHash(new Date().toString());
// cleanup previous trials
use test;
db.shardedColl.drop();
db.shardedColl.createIndex({h: 1, d: 1});
sh.enableSharding("test");
sh.shardCollection("test.shardedColl", {h: 1, d: 1});
// alternatively, remove the docs and merge the chunks back together
db.shardedColl.remove({});
// now merge empty chunks
// let's assume all chunks have been forced back onto the first shard
// load the chunks for this collection (chunk metadata lives in the config database)
ourChunks = db.getSiblingDB("config").chunks.find({ "ns" : "test.shardedColl", "shard": "shard01"}).sort({"min": 1}); // load in order
prev = null;
ourChunks.forEach(function(item) {
    relinkPrev = true;
    print("item: ");
    printjson(item);
    if (prev == null) {
        // first chunk we found
    } else {
        print("prev: ");
        printjson(prev);
        // is this chunk empty?
        // (note: this per-field range check only approximates the compound-key
        //  chunk range, so treat it as a rough emptiness test)
        docCount = db.shardedColl.count({
            h: {$gte: item.min.h, $lt: item.max.h},
            d: {$gte: item.min.d, $lt: item.max.d}});
        print("docCount: " + docCount);
        if (docCount <= 0) { // found an empty chunk
            bounds = [ { h: prev.min.h, d: prev.min.d },
                       { h: item.max.h, d: item.max.d } ];
            print("bounds: ");
            printjson(bounds);
            // let's see if we can merge this with the previous chunk
            res = db.getSiblingDB("admin").runCommand(
                { mergeChunks: "test.shardedColl", bounds: bounds });
            print("res: ");
            printjson(res);
            if (res && res.ok == 1) {
                // merged successfully
                print("Merged: ");
                printjson(bounds);
                // keep prev the same to preserve the min boundary for the next pass
                relinkPrev = false;
            }
        }
    }
    if (relinkPrev) prev = item;
});
// check how many chunks remain
db.getSiblingDB("config").chunks.count({ "ns" : "test.shardedColl"});
// inspect the chunks in order
db.getSiblingDB("config").chunks.find({ "ns" : "test.shardedColl"}).sort({min: 1}).pretty()
// disable the balancer (for insert performance, but also to verify that we are distributing evenly)
sh.disableBalancing("test.shardedColl");
// pre-create 4096 (64*64) chunks
for (d0 = 0; d0 < 64; d0++) {
    for (d1 = 0; d1 < 64; d1++) {
        hash = Base64.characters[d0] + Base64.characters[d1];
        sh.splitAt("test.shardedColl", { h: hash, d: MinKey });
    }
}
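// Quick check (added): after pre-splitting there should be roughly 64*64 + 1
// chunks for this namespace (the splits above plus the original full-range chunk).
db.getSiblingDB("config").chunks.count({ "ns" : "test.shardedColl" });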
// Config for inserts
padSize = 32; // make 32kb payloads
// Insert fresh data
for (year = 2000; year < 2018; year++) {
    for (mon = 1; mon <= 12; mon++) {
        for (day = 1; day < 30; day++) {
            var objs = [];
            for (ms = 0; ms < 100; ms++) {
                // create a date with the current time, but force the day/month/year
                date = new Date();
                date.setDate(day);
                date.setMonth(mon - 1); // setMonth() is zero-based (0 = January)
                date.setFullYear(year);
                date.setMilliseconds(ms);
                dateStr = date.toString();
                print(dateStr);
                // prepare a hash
                base64 = makeHash(dateStr);
                print(base64);
                // payload just over 32kb (pad() is a legacy mongo shell string helper)
                objs.push({h: base64, d: date, payload: ("some test " + dateStr).pad(padSize * 1024)});
            }
            // insert in bulk
            db.shardedColl.insertMany(objs);
        }
    }
}
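// Distribution check (added sketch): count chunks per shard and compare document
// counts, to see how evenly the pre-split ranges and the hashed key spread the load.
db.getSiblingDB("config").chunks.aggregate([
    { $match: { ns: "test.shardedColl" } },
    { $group: { _id: "$shard", chunks: { $sum: 1 } } }
]);
db.shardedColl.getShardDistribution();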
db.shardedColl.insertMany([
    { h: 47, d: ISODate("2017-12-18T14:12:11"), payload: "just a test" },
    { h: 47, d: ISODate("2017-12-18T14:12:19"), payload: "another test" },
    { h: 47, d: ISODate("2017-12-18T14:12:32"), payload: "third test" }
]);
db.shardedColl.createIndex({h: 1, d: 1}); // already created above; harmless no-op
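// Final check (added): the three docs above share the same h and day, so they
// should land in the same chunk; sh.status() shows which shard owns that range.
db.shardedColl.find({ h: 47 }).count(); // expect 3
sh.status();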