{
  "took": 14,
  "timed_out": false,
  "_shards": {
    "total": 6,
    "successful": 6,
    "failed": 0
  },
  "hits": {
    "total": 1419,
cjdd3b / csvjoin.py
Created April 2, 2015 21:54
CSV-flattening code for Harsh's research
import csv, os

# This chunk iterates through all of the csv files in a directory, turns them
# into 2-dimensional arrays (lists of lists), and puts all those arrays into
# a list called "tables"
tables = []

# Loop over all files in the current directory (which is what "." means)
for f in os.listdir('.'):
    if not f.endswith('.csv'):
        continue
    with open(f) as infile:
        tables.append(list(csv.reader(infile)))
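The preview stops at the read loop; flattening the collected tables back out into a single file might look like this (the output filename is my own choice, not from the gist):

# Append every row from every table to one combined CSV
with open('combined.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    for table in tables:
        writer.writerows(table)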
cjdd3b / fingerprint.py
Created February 22, 2015 14:17
Python implementation of Google Refine fingerprinting algorithms here: https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth
# -*- coding: utf-8 -*-
import re, string
from unidecode import unidecode

PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

class Fingerprinter(object):
    '''
    Python implementation of Google Refine fingerprinting algorithm described here:
    https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth
    '''
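The class body is cut off above, but the algorithm itself is compact enough to sketch as a standalone function (this helper is mine, not the gist's):

def fingerprint(s):
    # Lowercase, strip punctuation, transliterate to ASCII, then sort and
    # dedupe the whitespace-separated tokens -- the core of Refine's
    # "fingerprint" keying method
    s = PUNCTUATION.sub('', s.strip().lower())
    tokens = sorted(set(unidecode(s).split()))
    return ' '.join(tokens)

# e.g. fingerprint('Cruise, Tom') == fingerprint('Tom CRUISE') == 'cruise tom'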
import random

class MinHasher(object):
    def __init__(self, n, universe_size, seed=None):
        if seed is not None: random.seed(seed)
        self.hash_functions = [self._create_random_hash_function(universe_size) for i in range(n)]

    def _create_random_hash_function(self, universe_size):
        # Random linear hash; the modulus is an assumed completion of the
        # truncated snippet
        a = random.randint(0, universe_size)
        b = random.randint(0, universe_size)
        return lambda x: (a * x + b) % universe_size
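A quick sketch of how such a hasher is typically used (the signature helper and the Jaccard estimate follow the standard MinHash recipe; the names are mine):

def signature(hasher, items):
    # One minimum per hash function; the fraction of positions where two
    # signatures agree approximates the sets' Jaccard similarity
    return [min(h(x) for x in items) for h in hasher.hash_functions]

hasher = MinHasher(n=100, universe_size=2**20, seed=42)
a = signature(hasher, {1, 5, 9, 13})
b = signature(hasher, {1, 5, 9, 21})
print(sum(x == y for x, y in zip(a, b)) / 100.0)  # ~0.6; true Jaccard is 3/5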
graph-cluster.py
Some notes for doing graph clustering in a couple different ways: simple spectral
partitioning based on the Fiedler vector, and a density-based clustering using DBSCAN.
Why might this be useful? I'm using it to identify weakly connected (and therefore
probably false) graph components in my campaign finance standardization workflow, the
basic idea of which is here: https://github.com/cjdd3b/fec-standardizer/wiki
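The spectral half of that idea is compact enough to sketch. A minimal version using networkx and numpy (the function name and toy graph are mine; the DBSCAN variant would go through scikit-learn instead):

import networkx as nx
import numpy as np

def fiedler_split(G):
    # Eigenvector of the second-smallest Laplacian eigenvalue (the Fiedler
    # vector); the sign of each entry assigns its node to one side
    nodes = sorted(G.nodes())
    L = nx.laplacian_matrix(G, nodelist=nodes).todense().astype(float)
    vals, vecs = np.linalg.eigh(L)
    fiedler = np.asarray(vecs[:, 1]).ravel()
    return ([n for n, v in zip(nodes, fiedler) if v < 0],
            [n for n, v in zip(nodes, fiedler) if v >= 0])

# Two triangles joined by a single weak edge split cleanly apart
G = nx.Graph([(0, 1), (1, 2), (2, 0), (3, 4), (4, 5), (5, 3), (2, 3)])
print(fiedler_split(G))  # e.g. ([0, 1, 2], [3, 4, 5]); sign order is arbitrary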
<!DOCTYPE html>
<html>
  <head>
    <title>Leaflet Example</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!-- Import Leaflet assets -->
    <link rel="stylesheet" href="http://leafletjs.com/dist/leaflet.css" />
    <script src="http://leafletjs.com/dist/leaflet.js"></script>
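The capture ends inside the head; a minimal body that would complete the page might look like this (the map center and tile URL are my assumptions, not from the gist):

  </head>
  <body>
    <div id="map" style="height: 400px"></div>
    <script>
      // Center on London and add an OpenStreetMap base layer
      var map = L.map('map').setView([51.505, -0.09], 13);
      L.tileLayer('http://{s}.tile.osm.org/{z}/{x}/{y}.png', {
        attribution: '&copy; OpenStreetMap contributors'
      }).addTo(map);
    </script>
  </body>
</html>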
def levenshtein(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n
    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)
    return current[n]
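A quick sanity check:

print(levenshtein('kitten', 'sitting'))  # 3: two substitutions plus one insertion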
cjdd3b / compare.py
Last active December 24, 2015 07:49
Shows crude similarities of voting histories between members of Congress using roll call vote matrices from Poole, McCarty and Lewis: http://www.voteview.com/dwnl.htm. Uses vectorized operations to make similarity calculations happen super fast.
'''
compare.py
Quickly produces a pairwise similarity matrix of lawmakers' roll call votes, given
an input *.ord matrix file from Poole, McCarty and Lewis: http://www.voteview.com/dwnl.htm
'''
import numpy, string
from scipy.spatial.distance import cdist
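The preview stops at the imports, but the heart of the approach is a single vectorized distance call. A rough sketch with toy data (the vote coding and variable names are mine; the gist's parsing of the *.ord fixed-width format is omitted):

import numpy
from scipy.spatial.distance import cdist

# One row per lawmaker, one column per roll call vote
votes = numpy.array([
    [1, 1, 6, 1],
    [1, 6, 6, 1],
    [6, 6, 1, 6],
])

# 'hamming' gives the fraction of roll calls on which two lawmakers
# disagree; subtracting from 1 turns it into a crude similarity matrix
similarity = 1 - cdist(votes, votes, 'hamming')
print(similarity)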
cjdd3b / nnsearch.py
Last active December 20, 2015 12:49
import numpy

def get_similar(vec, matrix, K=10):
    # Set up the query vector and the whole dataset for K-nearest neighbors query
    qvector = numpy.array([vec]).transpose()
    alldata = numpy.array(matrix).transpose()

    # You can't get more neighbors than there are entities
    ndata = alldata.shape[1]
    K = K if K < ndata else ndata

    # Rank every column by Euclidean distance to the query and return the
    # K closest (this ranking step is an assumed completion)
    sqd = numpy.sqrt(((alldata - qvector) ** 2).sum(axis=0))
    idx = numpy.argsort(sqd)
    return idx[:K], sqd[idx[:K]]
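Hypothetical usage, with one row per entity in the input matrix:

# Three entities with two features each; query with the first entity's vector
matrix = [[0.0, 0.0], [0.1, 0.2], [0.9, 0.8]]
idx, dists = get_similar([0.0, 0.0], matrix, K=2)
print(idx, dists)  # nearest entities first; entity 0 is an exact match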
{
  "discipline_id": "AS",
  "discipline_name": "Alpine Skiing",
  "results": [
    {
      "id": "ASM010",
      "name": "Men's Downhill",
      "competitor_type": "ATH",
      "results":