Skip to content

Instantly share code, notes, and snippets.

@tingletech
Forked from tingletech/bayesian_survey_analysis.py
Created October 21, 2012 22:54
Show Gist options
  • Save tingletech/3928832 to your computer and use it in GitHub Desktop.
Save tingletech/3928832 to your computer and use it in GitHub Desktop.
plotting the credible interval for user survey

User Survey Analysis

http://code.google.com/p/json4lib/source/browse/#hg%2Fsurvey

http://stats.stackexchange.com/questions/39319/bayesian-user-survey-with-a-credible-interval

https://gist.github.com/3896501

http://en.wikipedia.org/wiki/Monte_Carlo_method

http://www.resample.com/

http://tingletech.tumblr.com/tagged/pop-up-survey

OS X install notes

use virtualenv

brew is telling me "10.7.5 Xcode (4.4.1) is outdated Please install Xcode 4.5." but I can't be bothered

pip install numpy
brew install freetype
export CPPFLAGS="-I/usr/X11/include/freetype2 -I/usr/X11/include -I/usr/local/opt/freetype/include"
export LDFLAGS="-L/usr/X11/lib -lfreetype -lz -lbz2"
pip install matplotlib

License

Copyright © 2012, Regents of the University of California

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  • Neither the name of the University of California nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#!/usr/bin/env python
"""
plot user survey results and estimate bayesian credible interval using monte carlo method
uses 2.5% and 97.5% percentiles
data is hard coded for one survey right now
"""
import sys
from numpy.random.mtrand import dirichlet
from numpy import array, sum, percentile
from pprint import pprint
import numpy as np
def main(argv=None):
# need to refactor the data out
categories = [
'k-12 teacher or librarian',
'k-12 student',
'college or graduate student',
'faculty or academic researcher',
'archivist or librarian',
'genealogist or family researcher',
'other'
]
observations_oac = [ 65, 71, 532, 307, 369, 234, 584]
observations_cs = [133, 280, 445, 82, 55, 68, 220]
# calculate the statistics
estimates_oac, error_bars_oac = monte_carlo(observations_oac)
estimates_cs, error_bars_cs = monte_carlo(observations_cs)
# http://matplotlib.org/examples/api/barchart_demo.html
# inner function we will need for labels
def autolabel(rects):
# attach some text labels
for rect in rects:
height = rect.get_height()
height_format = "{:.0%}".format(height)
plt.text(rect.get_x()+rect.get_width()/2., height + 0.025, height_format,
ha='center', va='bottom')
# okay, now we are ready to get our plot on
import matplotlib
# The backend must be set BEFORE importing matplotlib.pyplot
# matplotlib.use('Agg') # Generates image.
import matplotlib.pyplot as plt
# x axis index
x = np.arange(1, len(categories)+1)
plt.figure(1)
plt.subplot(211)
plt.title("Answer that best describes yourself: OAC N=" + str(sum(observations_oac)))
plt.xticks(x, [label.replace(" ","\n") for label in categories ])
plt.ylim((0,0.5))
rects1 = plt.bar(x, estimates_oac, 0.8, yerr=error_bars_oac, facecolor='none', align='center', ecolor="k" )
autolabel(rects1)
plt.subplot(212)
plt.title("Answer that best describes yourself: Calisphere N=" + str(sum(observations_cs)))
plt.xticks(x, [label.replace(" ","\n") for label in categories ])
plt.ylim((0,0.5))
rects2 = plt.bar(x, estimates_cs, 0.8, yerr=error_bars_oac, facecolor='none', align='center', ecolor="k" )
autolabel(rects2)
plt.tight_layout()
plt.savefig("figure.png", dpi = (100))
# http://workshops.arl.arizona.edu/python1/python_modules/matplotlib_example.py
return 0
def monte_carlo(observations):
""" takes a standard array of observations and returns a 2 dimensional numpy array of simulation results
"""
ITERATIONS = 1000
results = []
for n in xrange(ITERATIONS):
results.append( dirichlet([x+1 for x in observations]) )
# convert results to a numby array for better indexing/sums
results = array(results)
# summarize the results
estimates = sum(results, axis=0) / ITERATIONS
# "You need 2*N, not N*2 arrays here."
lows = []
highs = []
# calculate credible interval
for k in xrange(len(estimates)):
q025 = percentile(results[:,k], 2.5)
q975 = percentile(results[:,k], 97.5)
# http://www.mailinglistarchive.com/html/[email protected]/2009-09/msg00181.html
lows.append( estimates[k] - q025)
highs.append( q975 - estimates[k])
return estimates, [ lows, highs ]
if __name__ == '__main__':
status = main()
sys.exit(status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment