Last active
October 31, 2015 21:14
-
-
Save ivangeorgiev/189c08ad7e20b591ba1d to your computer and use it in GitHub Desktop.
Data Management and Visualization Assignments
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 31 20:45:05 2015
@author: baobab
"""
import pandas; | |
import numpy as np | |
import matplotlib.pyplot as plt | |
# Load the crater table from the CSV file (path is relative to the working
# directory the script is started from).
data = pandas.read_csv(
    '../codebooks/marscrater_pds.csv',
    low_memory=False,
)

print("========= 1. Data Exploration =========")
print("---------- 1.1. General Dataset Characteristics ----------")

# DataFrame.shape is (number of rows, number of columns).
observationCount, variableCount = data.shape
print("Number of observations: {}".format(observationCount))
print("Number of variables : {}".format(variableCount))

# One column name per line.
variableListing = "\n ".join(data.columns)
print("Variables : \n ", variableListing)
print()
print("---------- 1.2. LONGITUDE_CIRCLE_IMAGE ----------")

# Work on the column once instead of re-indexing the frame for every statistic.
longitudeColumn = data['LONGITUDE_CIRCLE_IMAGE']
print("min: ", longitudeColumn.min())
print("max: ", longitudeColumn.max())

# Occurrences of each distinct longitude value, most frequent first;
# missing values get their own entry because dropna=False.
longitudeCounts = longitudeColumn.value_counts(dropna=False, sort=True)
print("Unique values : {}".format(len(longitudeCounts)))
print("Top 5 values and counts : \n", longitudeCounts.head(5))
print()
print("---------- 1.3. LATITUDE_CIRCLE_IMAGE ----------")

# Work on the column once instead of re-indexing the frame for every statistic.
latitudeColumn = data['LATITUDE_CIRCLE_IMAGE']
print("min: ", latitudeColumn.min())
print("max: ", latitudeColumn.max())

# Occurrences of each distinct latitude value, most frequent first;
# missing values get their own entry because dropna=False.
latitudeCounts = latitudeColumn.value_counts(dropna=False, sort=True)
print("Unique values : {}".format(len(latitudeCounts)))
print("Top 5 values and counts : \n", latitudeCounts.head(5))
# DIAM_CIRCLE_IMAGE
print()
print("---------- 1.4. DIAM_CIRCLE_IMAGE ----------")
print("min: ", data['DIAM_CIRCLE_IMAGE'].min())
print("max: ", data['DIAM_CIRCLE_IMAGE'].max())

# Occurrences of each distinct diameter value, most frequent first;
# missing values get their own entry because dropna=False.
diamCounts = data['DIAM_CIRCLE_IMAGE'].value_counts(sort=True, dropna=False)

# Report the number of distinct *diameter* values. The original printed
# len(latitudeCounts) here, a copy-paste slip from the latitude section.
print("Unique values : {}".format(len(diamCounts)))
print("Top 10 values and counts : \n", diamCounts.head(10))
print("========= 2. Data Management =========")

# Make sure the three analysed columns are numeric before binning them.
# Series.convert_objects(convert_numeric=True) was deprecated and later removed
# from pandas; pandas.to_numeric(..., errors='coerce') is its replacement and
# likewise turns values that cannot be parsed as numbers into NaN.
for numericColumn in ('LONGITUDE_CIRCLE_IMAGE',
                      'LATITUDE_CIRCLE_IMAGE',
                      'DIAM_CIRCLE_IMAGE'):
    data[numericColumn] = pandas.to_numeric(data[numericColumn], errors='coerce')
print()
print("---------- 2.1. LONGITUDE_CIRCLE_IMAGE ----------")

# Bin edges every 15 degrees: -190, -175, ..., 185.
longitudeBins = list(range(-190, 191, 15))
longitudeValues = data['LONGITUDE_CIRCLE_IMAGE']

# Absolute number of craters falling into each longitude bin.
longitudeCountBins = longitudeValues.value_counts(bins=longitudeBins, dropna=False)
print("Frequences in bins with length 15: ")
print(longitudeCountBins)
print()

# Label every row with its longitude interval, then compute each interval's
# share of all rows (values outside the bins become NaN and are counted too).
data['LONGITUDE_GROUP'] = pandas.cut(longitudeValues, bins=longitudeBins)
longitudeGroupPercents = data['LONGITUDE_GROUP'].value_counts(normalize=True, dropna=False)
print("Group frequences, %: ")
print(100 * longitudeGroupPercents)
print()

print("After grouping the longitude values, we can observe that relatively uniform")
print("distribution of the craters, although there are areas with smaller 'population'.")
print()

# Histogram over the same bin edges as the tables above.
plt.hist(longitudeValues, bins=longitudeBins)
plt.title('Longitude Distribution')
plt.xlabel('Longitude')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
print()
print("---------- 2.2. LATITUDE_CIRCLE_IMAGE ----------")

# Bin edges every 10 degrees: -90, -80, ..., 90.
latitudeBins = list(range(-90, 91, 10))
latitudeValues = data['LATITUDE_CIRCLE_IMAGE']

# Absolute number of craters falling into each latitude bin.
latitudeCountBins = latitudeValues.value_counts(bins=latitudeBins, dropna=False)
print("Frequences in bins: ")
print(latitudeCountBins)
print()

# Label every row with its latitude interval, then compute each interval's
# share of all rows (values outside the bins become NaN and are counted too).
data['LATITUDE_GROUP'] = pandas.cut(latitudeValues, bins=latitudeBins)
latitudeGroupPercents = data['LATITUDE_GROUP'].value_counts(normalize=True, dropna=False)
print("Group frequences, %: ")
print(100 * latitudeGroupPercents)
print()

print("From the binned latitude values we can see that about 35% of the craters are")
print("located between 0 and -30 degrees.")

# Histogram over the same bin edges as the tables above.
plt.hist(latitudeValues, bins=latitudeBins)
plt.title('Latitude Distribution')
plt.xlabel('Latitude')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
print()
print("---------- 2.3. DIAM_CIRCLE_IMAGE ----------")

# Coarse pass: 100-wide bins to see where the diameters concentrate.
# Note: range(0, 2000, 100) ends at 1900, so values above 1900 are not counted.
print("Frequences in bins range(0,2000, 100): ")
diamCountBins = data['DIAM_CIRCLE_IMAGE'].value_counts(dropna=False, bins=range(0,2000, 100))
print(diamCountBins)
print()
print("Most of the diameter values are in the range (0,100].")
print("Let's refine further the (0,100] range.")
print()

# Fine pass over (0, 100] in 5-wide bins. range() excludes its stop value, so
# the stop has to be 101 for the last edge to be 100; the original
# range(0, 100, 5) stopped at 95 and silently dropped the (95, 100] bin.
# The label echoes the expression, so it is updated to match.
print("Frequences in bins range(0,101, 5): ")
diamCountBins = data['DIAM_CIRCLE_IMAGE'].value_counts(dropna=False, bins=range(0, 101, 5))
print(diamCountBins)
print()

# Hand-picked edges: unit-wide bins up to 12, progressively wider bins above.
diamBins = [0,1,2,3,4,5,6,7,8,9,10,11,12,15,20,40,60,100,500,1000,2000]
print("We can manually split into bins: ")
print(diamBins)
print("With following frequences (in %): ")

# Label every row with its diameter interval and report each interval's share
# of all rows (values outside the bins become NaN and are counted too).
data['DIAM_GROUP'] = pandas.cut(data['DIAM_CIRCLE_IMAGE'], diamBins)
diamCountBins = data['DIAM_GROUP'].value_counts(dropna=False, normalize=True)
print(100*diamCountBins)

# diamBins[1:12] is the edges 1..11, so the histogram only shows the
# unit-wide bins between 1 and 11.
plt.hist(data['DIAM_CIRCLE_IMAGE'], bins=diamBins[1:12])
plt.xlabel('Diameter')
plt.ylabel('Frequency')
plt.title('Diameter Distribution')
plt.grid(True)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment