Skip to content

Instantly share code, notes, and snippets.

@johnlpage
Last active December 8, 2015 13:19
Show Gist options
  • Save johnlpage/e0bb9971f4f1c4ed3a09 to your computer and use it in GitHub Desktop.
Save johnlpage/e0bb9971f4f1c4ed3a09 to your computer and use it in GitHub Desktop.
Data Loading and Aggregation demo in MongoDB

#Start a machine. Use any method you like here; I'm doing it from the command line: 40 cores, 120GB RAM, SSD with 8,500 IOPS (SAN) - Amazon Linux (CentOS)

ec2-run-instances ami-a10897d6 -t m4.10xlarge -g jlp -k john_page_demos -b "/dev/xvdb=:725:true:io1:8500"

#Log on

# Forward local port 27017 to port 27017 on the instance so clients on your own machine can reach it.
# -i takes the private key file (.pem) of the key pair the instance was launched with.
ssh -L 27017:localhost:27017 -i <your private key>.pem ec2-user@<your ip address>

#Set up disk

# Become root for the disk, package and kernel setup that follows.
sudo -s
# Format the attached volume; lazy_itable_init=0 / lazy_journal_init=0 make mkfs
# initialise the inode tables and journal now instead of lazily after the first mount.
mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/xvdb
mkdir /data
mount /dev/xvdb /data
# Hand /data to ec2-user so the later non-root steps can create directories under it.
chown ec2-user /data

#Download MongoDB

vi /etc/yum.repos.d/mongodb-org-3.0.repo
[mongodb-org-3.0]
name=MongoDB Repository
baseurl=https://repo.mongodb.org/yum/amazon/2013.03/mongodb-org/3.0/x86_64/
gpgcheck=0
enabled=1
yum install -y mongodb-org
service mongod stop

#Setup Server Kernel etc.

# Turn off transparent huge pages (both allocation and defragmentation).
echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag
# Set read-ahead on the data volume to 32 (512-byte) sectors.
blockdev --setra 32 /dev/xvdb
# Set the soft and hard open-file limits to 20000 for all users.
echo "* soft nofile 20000" >> /etc/security/limits.conf
echo "* hard nofile 20000" >> /etc/security/limits.conf
# Leave the root shell opened by "sudo -s".
exit

#Start Mongo Instances

# Launch 40 mongod shard processes, one data directory each, on ports 27101-27140.
for shard in $(seq 1 40); do
  mkdir /data/shard$shard
  port=$((shard + 27100))
  numactl --interleave=all mongod --storageEngine=wiredTiger --wiredTigerCacheSizeGB=3 --nojournal --port=$port --dbpath=/data/shard$shard --logpath=/data/log$shard --fork
done

#Start Config Server and Router

# Single config server (port 27019) with its own data directory.
mkdir /data/config
numactl --interleave=all  mongod --configsvr --port 27019 --dbpath=/data/config --logpath=/data/config.log --fork
# Router pointed at that config server; no --port is given, so it uses the default
# port (27017, the port the ssh tunnel forwards).
mongos --configdb=localhost:27019 --logpath=/data/mongos.log --fork

#Start mongo shell and add shards

mongo
// Register each of the 40 local mongod instances (ports 27101-27140) as a shard.
for (var shard = 1; shard <= 40; shard++) {
  sh.addShard("localhost:" + (27100 + shard))
}
sh.status()
exit

#Download raw data

# Fetch the yearly result files (2005-2013) in the background; a later "wait" blocks until done.
# The URL is quoted so the shell passes [2005-2013] through untouched and curl expands the range itself.
mkdir /data/raw
cd /data/raw
curl -s -O "http://data.dft.gov.uk/anonymised-mot-test/12-03/test_result_[2005-2013].txt.gz" &

#Install Dev tools

cd
sudo yum groupinstall -y "development tools"

#Install MongoDB C Driver

cd
git clone https://github.com/mongodb/mongo-c-driver.git
cd mongo-c-driver
./autogen.sh
make
sudo make install

#Install Fast Loader

# Build the FastLoad loader in the home directory (cloned over https rather than plain http).
cd
git clone https://github.com/johnlpage/FastLoad
cd FastLoad
make

#Unzip Files once downloaded

cd /data/raw
# Block until the background download started earlier in this shell has finished.
wait

# Decompress every archive in parallel, then wait for all gunzip jobs to finish.
# "$f" is quoted so a file name containing spaces or glob characters stays one argument.
for f in *.gz
do
  gunzip "$f" &
done
wait

#Load them in

# Let the loader find shared libraries in /usr/local/lib
# (presumably where "make install" put the MongoDB C driver - confirm).
export LD_LIBRARY_PATH=/usr/local/lib
# Load every file in the current directory one after another, timing the whole run.
# "$f" is quoted so each file name is passed as a single argument.
time for f in *
do
  ~/FastLoad/fastload "$f"
done

#Test connection over ssh tunnel

mongo
sh.status()
use vosa
db.mot_results.findOne()
db.mot_results.count()
exit
mongostat

#Install matplotlib

pip install matplotlib

#Install pymongo

pip install pymongo

#Start python

python
from pymongo import *
from pprint import pprint
from matplotlib import pyplot as pyplot
import time
# Connect with the default MongoClient settings and select the "vosa" database.
client = MongoClient()
db = client.vosa
db.mot_results.count()
doc = db.mot_results.find_one()
pprint(doc)
# Age of the vehicle at test time. The divisor is the number of milliseconds in a
# 365-day year (assumes TestDate and FirstUseDate are stored as dates, so that
# $subtract yields milliseconds - confirm against the loaded documents).
elapsedms = {"$subtract": ["$TestDate", "$FirstUseDate"]}
fractionalyears = {"$divide": [elapsedms, (1000*3600*24*365)]}
# Drop the fractional part by subtracting the value modulo 1.
wholeyears = {"$subtract": [fractionalyears, {"$mod": [fractionalyears, 1]}]}
# 1 when the test result is "P", otherwise 0, so passes can be summed later.
passflag = {"$cond": [{"$eq": ["$Result", "P"]}, 1, 0]}
project = {
    "$project": {
        "_id": 0,
        "Make": 1,
        "Result": 1,
        "TestDate": 1,
        "Mileage": 1,
        "FirstUseDate": 1,
        "Age": wholeyears,
        "pass": passflag,
    }
}
# Try the projection on five documents before running it over the whole collection.
results = db.mot_results.aggregate([project, {"$limit": 5}])
pprint(list(results))

# Extra pipeline stages for the full summary run.
# Keep only TestClass 4 (cars, going by the stage name) ...
carsonly = {"$match": {"TestClass": {"$eq": 4}}}
# ... and only documents that have a FirstUseDate, since Age is computed from it.
knownage = {"$match": {"FirstUseDate": {"$exists": True}}}
# One bucket per (make, age): number of tests, average mileage and number of passes.
group = {"$group": {"_id": {"make": "$Make", "age": "$Age"}, "count": {"$sum": 1}, "miles": {"$avg": "$Mileage"}, "passes": {"$sum": "$pass"}}}
# Write the buckets to the "summary" collection.
out = {"$out": "summary"}

t0 = time.time()
results = db.mot_results.aggregate([carsonly, knownage, project, group, out])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(time.time() - t0)

# Build parallel lists with one scatter point per (make, age) bucket that has
# more than 2000 tests: age, pass rate, make label and a colour value.
age = []
reliability = []
labels = []
colours = []

for r in db.summary.find():
    count = r['count']
    if count <= 2000:
        continue
    key = r['_id']  # named "key" rather than "id" to avoid shadowing the builtin
    age.append(key['age'])
    reliability.append(r['passes'] / float(count))
    make = key['make']
    labels.append(make)
    # Colour value derived from the make name, so equal makes get equal values.
    colours.append(hash(make) % 65535)

# Scatter plot of pass rate against age; clicking a point prints its make.
figure = pyplot.figure()
axis = figure.add_subplot(111)
axis.scatter(age, reliability, c=colours, picker=5, s=80, alpha=0.3)

def onpick(event):
    """Print the label of the first picked point (works on Python 2 and 3)."""
    print(labels[event.ind[0]])

figure.canvas.mpl_connect('pick_event', onpick)
pyplot.show()

# Per make, gather the (age, average miles) pairs of every bucket with at least 2000 tests.
# The match stage is named "bigenough" rather than "filter" to avoid shadowing the builtin.
bigenough = {"$match": {"count": {"$gte": 2000}}}
sort = {"$sort": {"_id": 1}}
groupmake = {"$group": {"_id": "$_id.make", "years": {"$push": {"age": "$_id.age", "miles": "$miles"}}}}
results = db.summary.aggregate([bigenough, sort, groupmake])

# One line per make: average miles against age.
figure = pyplot.figure()
axis = figure.add_subplot(111)

# Maps each plotted line back to its make so a pick handler can look it up.
makes = {}
for r in results:
    years = r['years']
    ages = [y['age'] for y in years]
    mileages = [y['miles'] for y in years]
    line = axis.plot(ages, mileages, picker=5)[0]
    makes[line] = r['_id']

def onpick(event):
    """Print the make recorded for the picked line (works on Python 2 and 3)."""
    print(makes[event.artist])

figure.canvas.mpl_connect('pick_event', onpick)
pyplot.show()

# Build parallel lists with one scatter point per (make, age) bucket that has
# more than 2000 tests: average miles, pass rate, make label and a colour value.
miles = []
reliability = []
labels = []
colours = []

for r in db.summary.find():
    count = r['count']
    if count <= 2000:
        continue
    miles.append(r['miles'])
    reliability.append(r['passes'] / float(count))
    make = r['_id']['make']  # no local named "id", so the builtin is not shadowed
    labels.append(make)
    # Colour value derived from the make name, so equal makes get equal values.
    colours.append(hash(make) % 65535)

# Scatter plot of pass rate against average miles; clicking a point prints its make.
figure = pyplot.figure()
axis = figure.add_subplot(111)
axis.scatter(miles, reliability, c=colours, picker=5, s=80, alpha=0.3)

def onpick(event):
    """Print the label of the first picked point (works on Python 2 and 3)."""
    print(labels[event.ind[0]])

figure.canvas.mpl_connect('pick_event', onpick)
pyplot.show()
@neilstuartcraig
Copy link

Looks very cool, thanks for taking the time to put this together and make it public. I have seen some parts of this before but really nice to have it all in one place and so easily accessible.

Sorry to sound like "that guy" but just wanted to make the most minor of points that EBS is NAS rather than SAN. As I say, very minor and my main purpose for the message was to say thanks - people don't do that often enough and putting these sorts of info together takes a chunk of time and skills - so cheers!

@kimbad
Copy link

kimbad commented Nov 11, 2015

This was a great demo at MongoDB Day London. Thanks for sharing.

@shimirel
Copy link

Before running
pip install matplotlib
you need to run
sudo yum -y install freetype freetype-devel libpng-devel
otherwise you get the error "Command python setup.py egg_info failed with error code 1".

If you use an out-of-the-box install of the Amazon command-line tools, you need to add the region to
ec2-run-instances ami-a10897d6 -t m4.10xlarge -g jlp -k john_page_demos -b "/dev/xvdb=:725:true:io1:8500"
e.g.
ec2-run-instances ami-a10897d6 -t m4.10xlarge -g default -k devenv-key -b "/dev/xvdb=:725:true:io1:8500" -region eu-west-1
otherwise it will complain about being unable to find the template "ami-a10897d6".

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment