Created
July 26, 2016 10:19
-
-
Save gavinwhyte/fbe961cc39ec829ffd3b00a049ac7e84 to your computer and use it in GitHub Desktop.
python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# After determining which attributes are categorical and which
# are numeric, you'll want descriptive statistics for the numeric
# variables and a count of the unique categories in each
# categorical attribute.
import sys

try:
    import urllib2 as url_lib  # Python 2
except ImportError:
    import urllib.request as url_lib  # Python 3

import numpy as np

# Location of the data set in the repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/"
              "sonar.all-data")

# Column summarised numerically and column holding the category labels.
NUMERIC_COL = 3
LABEL_COL = 60


def read_rows(lines):
    """Split comma-separated lines into a list of rows (lists of strings).

    ``lines`` is any iterable of text or byte strings, such as the handle
    returned by ``urlopen``.  Blank lines are skipped so that a trailing
    newline in the data does not yield a bogus one-field row.
    """
    rows = []
    for line in lines:
        # urlopen yields bytes on Python 3; decode so the fields are text.
        # (On Python 2 ``str`` is ``bytes``, so nothing is decoded there.)
        if not isinstance(line, str) and isinstance(line, bytes):
            line = line.decode("utf-8")
        line = line.strip()
        if line:
            rows.append(line.split(","))
    return rows


def percentile_boundaries(values, ntiles):
    """Return the ``ntiles + 1`` boundaries of ``ntiles`` equal percentile bins.

    ``values`` is a numeric array-like; ``ntiles`` is a positive integer.
    """
    # 100.0 forces true division, so splits that do not divide 100 evenly
    # (e.g. ntiles=3) are not truncated by Python 2 integer division.
    return [np.percentile(values, 100.0 * i / ntiles)
            for i in range(ntiles + 1)]


def count_categories(values):
    """Count how often each distinct entry of ``values`` occurs.

    Returns ``(categories, counts)``: two parallel lists where
    ``counts[i]`` is the number of occurrences of ``categories[i]``.
    The counter list is sized from the data, so any number of distinct
    categories is supported.
    """
    categories = list(set(values))
    position = dict((cat, idx) for idx, cat in enumerate(categories))
    counts = [0] * len(categories)
    for value in values:
        counts[position[value]] += 1
    return categories, counts


def main():
    """Download the data set and print summary statistics to stdout."""
    # Read data from the repository; always release the connection.
    data = url_lib.urlopen(target_url)
    try:
        rows = read_rows(data)
    finally:
        data.close()
    if not rows:
        raise ValueError("no data rows read from " + target_url)

    # Generate summary stats for the numeric column
    col_array = np.array([float(row[NUMERIC_COL]) for row in rows])
    col_mean = np.mean(col_array)
    col_sd = np.std(col_array)
    sys.stdout.write("Mean = " + '\t' + str(col_mean) + '\t\t' +
                     "standard Deviation = " + '\t' + str(col_sd) +
                     "\n")

    # Calculate quantile boundaries
    sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n")
    print(percentile_boundaries(col_array, 4))
    sys.stdout.write(" \n")

    # Run with 10 equal intervals
    sys.stdout.write("\nBoundaries for 10 Equal Percentiles \n")
    print(percentile_boundaries(col_array, 10))

    # The last column contains categorical values
    col_data = [row[LABEL_COL] for row in rows]
    unique = set(col_data)
    sys.stdout.write("Unique Label Values \n")
    print(unique)

    # Count up the number of elements having each value
    print("len unique %d" % len(unique))
    categories, cat_count = count_categories(col_data)
    print(categories)
    print(cat_count)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment