Igor Brigadir igorbrigadir

#Non-mathematical Introductions

#Videos

Useful Pandas Snippets

A personal diary of DataFrame munging over the years.

Data Types and Conversion

Convert Series datatype to numeric (will error if column has non-numeric values)
(h/t @makmanalp)

Internet Scale Services Checklist

A checklist for designing and developing internet scale services, inspired by James Hamilton's 2007 paper "On Designing and Deploying Internet-Scale Services."

http://mvdirona.com/jrh/talksandpapers/jamesrh_lisa.pdf

Basic tenets

Does the design expect failures to happen regularly and handle them gracefully?
Have we kept things as simple as possible?

YouGov API

Searching

Request

Headers: Accept: application/json, text/plain, `*/*` Referer: https://yougov.co.uk/profiler

	def consumer(func):
		'''
		Decorator taking care of initial next() call to "sending" generators

		From PEP-342
		http://www.python.org/dev/peps/pep-0342/

		Calling the decorated function creates the generator, advances it to
		its first ``yield`` and returns it, so the caller can use ``send()``
		straight away.
		'''
		import functools

		# wraps() keeps the generator function's name and docstring on the wrapper.
		@functools.wraps(func)
		def wrapper(*args, **kw):
			gen = func(*args, **kw)
			# Prime the generator: run it up to the first yield.
			next(gen)
			return gen
		return wrapper

	import numpy as np
	import marisa_trie
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.externals import six

	class MarisaCountVectorizer(CountVectorizer):

	# ``CountVectorizer.fit`` method calls ``fit_transform`` so
	# ``fit`` is not provided
	def fit_transform(self, raw_documents, y=None):

	"full_name";"party";"official_post";"constituency";"twitter_handle";"twitter_user_id";"uri";"last_updated";"notes"
	"Ms Diane Abbott MP";"Labour";;"Hackney North and Stoke Newington";"https://twitter.com/HackneyAbbott";153810216;"http://dbpedia.org/resource/Diane_Abbott";"2014-10-18T10:04:00+01:00";
	"Debbie Abrahams MP";"Labour";;"Oldham East and Saddleworth";"https://twitter.com/Debbie_abrahams";225857392;"http://dbpedia.org/resource/Debbie_Abrahams";"2014-10-18T10:04:00+01:00";
	"Nigel Adams MP";"Conservative";;"Selby and Ainsty";"TWITTER_UNKNOWN";-1;"http://dbpedia.org/resource/Nigel_Adams";"2014-10-18T10:04:00+01:00";
	"Adam Afriyie MP";"Conservative";;"Windsor";"https://twitter.com/AdamAfriyie";22031058;"http://dbpedia.org/resource/Adam_Afriyie";"2014-10-18T10:04:00+01:00";
	"Rt Hon Bob Ainsworth MP";"Labour";;"Coventry North East";"TWITTER_UNKNOWN";-1;"http://dbpedia.org/resource/Bob_Ainsworth";"2014-10-18T10:04:00+01:00";
	"Peter Aldous MP";"Conservative";;"Waveney";"https://twitter.com/peter_aldous";255998

	#Plot data using ggplot2
	# Loads ggplot2; the plotting calls themselves are not part of this excerpt.
	library(ggplot2)

	#Calculate points crossing UCL or LCL
	# NOTE(review): UCL/LCL presumably refer to the upperBound.pageviews /
	# lowerBound.pageviews columns (upper/lower control limits) - confirm.
	# The outliers column keeps the pageviews value for rows that are above
	# upperBound.pageviews or below lowerBound.pageviews, and is NA otherwise.
	pageviews_w_forecast$outliers <-
	ifelse(pageviews_w_forecast$pageviews > pageviews_w_forecast$upperBound.pageviews, pageviews_w_forecast$pageviews,
	ifelse(pageviews_w_forecast$pageviews < pageviews_w_forecast$lowerBound.pageviews, pageviews_w_forecast$pageviews, NA))

	#Add LCL and UCL labels
	# Start from a character vector with one empty-string element per row of
	# pageviews_w_forecast.
	LCL <- vector(mode = "character", nrow(pageviews_w_forecast))

	import numpy as np
	import pandas as pd
	from lxml import html
	from sklearn import metrics
	from sklearn.cross_validation import train_test_split
	from sklearn.linear_model import LogisticRegression as LR
	from sklearn.feature_extraction.text import TfidfVectorizer

	def clean(text):
		"""Return the text content of the HTML in *text*, lower-cased and stripped.

		The markup is parsed with ``lxml.html`` and only the text content of
		the resulting tree is kept.
		"""
		parsed = html.fromstring(text)
		plain = parsed.text_content()
		return plain.lower().strip()

	# Generate 100-dimensional random walk data so that each data point in the
	# sequence is similar to the previous one: every step adds Gaussian noise
	# (mean 0, standard deviation 0.1) to the last point.
	import numpy as np

	last = np.random.normal(0, .1, 100)
	for _ in range(1000):
		new = last + np.random.normal(0, .1, 100)
		last = new
		# Emit one space-separated line per point; print() with a single
		# argument behaves the same on Python 2 and Python 3.
		print(' '.join(str(x) for x in new))