Last active
December 24, 2015 07:49
-
-
Save cjdd3b/6766232 to your computer and use it in GitHub Desktop.
Shows crude similarities of voting histories between members of Congress using roll call vote matrices from Poole, McCarty and Lewis: http://www.voteview.com/dwnl.htm. Uses vectorized operations to make similarity calculations happen super fast.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
compare.py
Quickly produces a pairwise similarity matrix of lawmakers' roll call votes, given
an input *.ord matrix file from Poole, McCarty and Lewis: http://www.voteview.com/dwnl.htm
'''
import numpy, string | |
from scipy.spatial.distance import cdist | |
########## HELPERS ########## | |
class LookerUpper(object):
    '''
    Name-based accessor for a pairwise similarity matrix.

    names  -- sequence of member names that provides .index() (e.g. a list);
              the position of a name selects the matching row/column of matrix.
    matrix -- 2-D structure that can be indexed as matrix[i] and matrix[i, j]
              (e.g. a numpy array).
    '''

    def __init__(self, names, matrix):
        self.matrix = matrix
        self.names = names

    def lookup(self, name):
        '''
        Generate one '<other name>: <score>' string per entry in the matrix
        row that belongs to name.

        This is a generator: the name is only resolved once iteration starts,
        so an unknown name surfaces as an error from names.index() at that point.
        '''
        scores = self.matrix[self.names.index(name)]
        for position, score in enumerate(scores):
            yield '%s: %s' % (self.names[position], score)

    def lookup_pair(self, name1, name2):
        '''
        Return '<name1> -> <name2>: <score>' for the matrix cell at
        (row of name1, column of name2).
        '''
        row = self.names.index(name1)
        col = self.names.index(name2)
        score = self.matrix[row, col]
        return '%s -> %s: %s' % (name1, name2, score)
def check(status):
    '''
    Map the non-vote codes 7, 8 and 9 to 0 (for Jaccard); hand every other
    status back exactly as it was passed in.

    status may be anything int() accepts (e.g. a digit character or an int).
    Note that only the zeroed case is guaranteed to be an int; other values
    keep their original type.
    '''
    return 0 if int(status) in (7, 8, 9) else status
def cleansplit(str):
    '''
    Split one line of the fixed-width input file into five stripped fields.

    The fields are the character ranges [0:12], [12:20], [20:25], [25:36]
    and [36:]. The main script reads field 3 as the member's name code and
    field 4 as the string of vote codes.

    Returns a list of five strings with surrounding whitespace (including
    any trailing newline) removed. Lines shorter than a column simply give
    empty strings for the missing fields.

    The parameter is named `str` for backward compatibility; it shadows the
    builtin inside this function, so the builtin is deliberately not used here.
    '''
    # Third column is [20:25]. The previous `[:20:25]` was a stepped slice
    # that yielded only the first character of the line.
    columns = (
        str[:12],
        str[12:20],
        str[20:25],
        str[25:36],
        str[36:],
    )
    # Use the str.strip method rather than string.strip (which only exists
    # on Python 2), and always return a real list rather than a lazy map.
    return [column.strip() for column in columns]
########## MAIN ########## | |
if __name__ == '__main__':
    # Parse the fixed-width .ord input file into one list of fields per row.
    # Plain 'r' replaces the deprecated 'rU' mode, which Python 3.11+ rejects;
    # cleansplit() strips every field, so stray line endings do no harm.
    # Blank lines are skipped so they cannot produce an empty vote row.
    with open('hou112kh.ord.txt', 'r') as infile:
        rows = [list(cleansplit(line)) for line in infile if line.strip()]

    names = [r[3] for r in rows]  # r[3] is the name code

    # Build a numpy matrix from all the vote data in the input file.
    # r[4] is the string of per-roll-call vote codes, one character each.
    # check() returns 0 for non-votes but passes other codes through as the
    # original characters, so int() is applied to keep the matrix numeric.
    data = numpy.matrix([[int(check(code)) for code in r[4]] for r in rows])

    # Calculate vectorized pairwise Jaccard distance between all lawmakers.
    # Subtracting from 1.0 turns the distance into a similarity score.
    # NOTE(review): SciPy documents 'jaccard' for boolean vectors -- confirm
    # how the installed version treats these multi-valued vote codes.
    similarities = 1.0 - cdist(data, data, 'jaccard')

    # Print similarities, given a lookup name.
    looker_upper = LookerUpper(names, similarities)
    for entry in looker_upper.lookup('BOEHNER'):
        print(entry)

    # And here's a lookup of two names.
    print(looker_upper.lookup_pair('MCCAUL', 'CANTOR'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment