Created
October 11, 2012 12:42
-
-
Save michellesun/3872046 to your computer and use it in GitHub Desktop.
colintch8.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
yahookey = 'cxtFNDLV34GrCw8Ns25KZt30SxLxZ85dZLUVlPCl.Gi0l.s1wrTTGuGclQK6bP9u6yeN' | |
from xml.dom.minidom import parseString | |
from urllib import urlopen, quote_plus | |
class matchrow: | |
def __init__(self,row,allnum=False): | |
if allnum: | |
self.data = [float(row[i]) for i in range(len(row)-1)] | |
# if allnum, then get the float of all items in row | |
else: | |
self.data = row[0:len(row)-1] | |
self.match = int(row[len(row)-1]) | |
# the last item in the row is 0/1, whether there | |
# is a match | |
def loadmatch(f,allnum=False): | |
row = [] | |
for line in file(f): | |
rows.append(matchrow(line.split(','),allnum)) | |
# creates a list of matchrow classes | |
# with raw data and match or not | |
return rows | |
def lineartrain(rows): | |
averages = {} | |
counts = {} | |
for row in rows: | |
#Get the class of this point | |
cl = row.match | |
averages.setdefault(cl,[0.0]*(len(row.data))) | |
# is this a default function? what does it do? | |
# if key is in the dict, returns value | |
# if not, insert key with value of default | |
# returns default (None) | |
counts.setdefault(cl,0) | |
# Add this point to the averages | |
for i in range(len(row.data)): | |
average[cl][i] += float(row.data[i]) | |
# Keep track of counts of points | |
counts[cl] += 1 | |
# Divide sums by counts to get averages | |
for cl, avg in averages.items(): | |
# iterate through the key/value pairs of dict | |
for i in range(len(avg)): | |
avg[i] /= counts[cl] | |
return averages | |
def dotproduct(v1,v2): | |
return sum([v1[i]*v2[i] for i in range(len(v1))]) | |
def dpclassify(point,avgs): | |
b = (dotproduct(avgs[1],avgs[1]) - dotproduct(avgs[0],avgs[0]))/2 | |
y = dotproduct(point,avgs[0]) - dotproduct(point,avgs[1]) + b | |
if y > 0: | |
return 0 | |
else: | |
return 1 | |
def yesno(v): | |
if v == 'yes': | |
return 1 | |
elif v == 'no': | |
return -1 | |
else: #for ambiguous or missing data ('i dont know') | |
return 0 | |
def matchcount(interest1, interest2): | |
# good way to count common items in a list | |
l1 = interest1.split(':') | |
l2 = interest2.split(':') | |
x = 0 | |
for v in l1: | |
if v in l2: | |
x += 1 | |
return x | |
def milesdistance(a1,a2): | |
return 0 | |
loc_cache = {} | |
def getlocation(address): | |
if address in loc_cache: | |
return loc_cache[address] | |
data = urlopen('http://api.local.yahoo.com/MapService/V1/'+\ | |
'geocode?appid=%s&location=%s' % | |
(yahookey,quote_plus(address))).read() | |
# quote_plus replaces spaces by plus signs | |
doc = parseString(data) | |
lat = doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue | |
lng = doc.getElementsByTagName('Longtitude')[0].firstChild.nodeValue | |
loc_cache[address] = (float(lat),float(lng)) | |
return loc_cache[address] | |
# create an estimation because strict conversion | |
# from lat/lng to miles can be tricky | |
def milesdistance(a1,a2): | |
# a degree of longitude is approximately 53 miles | |
# a degree of latitude is approximately 69.1 miles | |
lat1, lng1 = getlocation(a1) | |
lat2, lng2 = getlocation(a2) | |
latdif = 69.1 * (lat2-lat1) | |
lngdif = 53.0 * (lng2-lng1) | |
return (latdif**2 + lngdif**2)**.5 | |
# create the new dataset to train the classifier | |
## Great way to transform data by applying functions | |
## on each | |
def loadnumerical(): | |
oldrows = loadmatch('matchmaker.csv') | |
newrows = [] | |
for row in oldrows: | |
d = row.data | |
data = [float(d[0]),yesno(d[1]),yesno(d[2]), | |
float(d[5]),yesno(d[6]),yesno(d[7]), | |
matchcount(d[3],d[8]), | |
milesdistance(d[4],d[9]), | |
row.match] | |
newrows.append(matchrow(data)) | |
return newrows | |
# scaling the data (age differences vs. opinions on | |
# children) | |
# scale by determining the highest/lowest value of each variable | |
def scaledata(rows): | |
low = [999999999.0]*len(row[0].data) | |
high = [-999999999.0]*len(row[0].data) | |
# find highest and lowest | |
## interesting way to find max/min | |
for row in rows: | |
d = row.data | |
for i in range(len(d)): | |
if d[i]<low[i]: | |
low[i] = d[i] | |
if d[i]>high[i]: | |
high[i] = d[i] | |
# create function that scales data | |
def scaleinput(d): | |
return [(d.data[i]-low[i])/(high[i]-low[i]) | |
for i in range(len(low))] | |
# scale all the data | |
newrows = [matchrow(scaleinput(row.data)+[row.match]) | |
for row in rows] | |
# return new data and function | |
return newrows, scaleinput | |
def rbf(v1,v2,gamma=20): | |
# similar to the dot product but it can map more complex spaces | |
dv = [v1[i]-v2[i] for i in range(len(v1))] | |
l = veclength(dv) | |
return math.e**(-gamma*l) | |
def nlclassify(point,rows,offset,gamma=10): | |
sum0=0.0 | |
sum1=0.0 | |
count0=0 | |
count1=0 | |
for row in rows: | |
if row.match == 0: | |
sum0 += rbf(point,row.data,gamma) | |
count0 += 1 | |
else: | |
sum1 += rbf(point,row.data,gamma) | |
count1 +=1 | |
y = (1.0/count0)*sum0 -(1.0/count1)*sum1+offset | |
if y <0: | |
return 0 | |
else: | |
return 1 | |
def getoffset(rows,gamma=10): | |
l0 = [] | |
l1 = [] | |
for row in rows: | |
if row.match == 0: | |
l0.append(row.data) | |
else: | |
l1.append(row.data) | |
sum0 = sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0) | |
sum1 = sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1) | |
return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment