Skip to content

Instantly share code, notes, and snippets.

@gavinwhyte
Created July 26, 2016 10:19
Show Gist options
  • Save gavinwhyte/fbe961cc39ec829ffd3b00a049ac7e84 to your computer and use it in GitHub Desktop.
Save gavinwhyte/fbe961cc39ec829ffd3b00a049ac7e84 to your computer and use it in GitHub Desktop.
python
# After determining which attributes are categorical and which
# are numeric, you'll want descriptive statistics for the numeric
# variables and a count of the unique categories in each
# categorical attribute.
import urllib2
import sys
import numpy as np
# Read the data set from the repository.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/"
              "sonar.all-data")
data = urllib2.urlopen(target_url)

# Arrange the data into a list of rows; each row is the list of
# comma-separated fields (still strings) taken from one line.
xList = []
labels = []  # never filled in by the code below; kept so the name still exists
try:
    for line in data:
        line = line.strip()
        # A blank line (e.g. a trailing newline) carries no record; keeping
        # it would add a one-field row and break the row[col] lookups that
        # follow.
        if not line:
            continue
        # split on comma
        xList.append(line.split(","))
finally:
    # Release the HTTP connection even if reading fails part-way through.
    data.close()

nrow = len(xList)
# Take the column count from the first row so that a single-row file works
# and an empty download gives 0 instead of raising IndexError.
ncol = len(xList[0]) if xList else 0
# Summary statistics (mean and standard deviation) for column 3.
col = 3
# Convert the chosen field of every row from text to float.
colData = [float(row[col]) for row in xList]
colArray = np.array(colData)
colMean = colArray.mean()
colsd = colArray.std()

# Assemble the report line from its pieces and write it in one call.
summaryParts = ["Mean = ", '\t', str(colMean), '\t\t',
                "standard Deviation = ", '\t', str(colsd),
                "\n"]
sys.stdout.write("".join(summaryParts))
# Calculate quantile boundaries: the values in colArray at ntiles + 1
# evenly spaced percentile ranks running from 0 to 100.
ntiles = 4
# 100.0 forces true division, so the ranks stay exact even when ntiles does
# not divide 100 (integer division would truncate them on Python 2).
percentBdry = [np.percentile(colArray, 100.0 * i / ntiles)
               for i in range(ntiles + 1)]
# The header is derived from ntiles so it cannot drift from the computation.
sys.stdout.write("\nBoundaries for %d Equal Percentiles \n" % ntiles)
print(percentBdry)
sys.stdout.write(" \n")
# Run again with 10 equal intervals: the values in colArray at ntiles + 1
# evenly spaced percentile ranks running from 0 to 100.
ntiles = 10
# 100.0 forces true division, so the ranks stay exact even when ntiles does
# not divide 100 (integer division would truncate them on Python 2).
percentBdry = [np.percentile(colArray, 100.0 * i / ntiles)
               for i in range(ntiles + 1)]
# The header is derived from ntiles so it cannot drift from the computation.
sys.stdout.write("\nBoundaries for %d Equal Percentiles \n" % ntiles)
print(percentBdry)
# The last column contains categorical values
col = 60
colData = [row[col] for row in xList]
unique = set(colData)
sys.stdout.write("Unique Label Values \n")
print(unique)

# Count up the number of elements having each value.
# Written with sys.stdout.write so the line emits the same text whether
# `print` is a statement or a function.
sys.stdout.write("len unique " + str(len(unique)) + "\n")

# Fix one ordering of the labels; catCount[k] is the tally for
# labelOrder[k], and catDict maps each label to its position k.
labelOrder = list(unique)
catDict = dict(zip(labelOrder, range(len(labelOrder))))
# Size the tally from the number of distinct labels; a fixed length of 2
# raises IndexError as soon as a third label value appears.
catCount = [0] * len(labelOrder)
for elt in colData:
    catCount[catDict[elt]] += 1

print(labelOrder)
print(catCount)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment