Skip to content

Instantly share code, notes, and snippets.

@ivangeorgiev
Last active October 31, 2015 21:14
Show Gist options
  • Save ivangeorgiev/189c08ad7e20b591ba1d to your computer and use it in GitHub Desktop.
Save ivangeorgiev/189c08ad7e20b591ba1d to your computer and use it in GitHub Desktop.
Data Management and Visualization Assignments
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 31 20:45:05 2015
@author: baobab
"""
import pandas;
import numpy as np
import matplotlib.pyplot as plt
# Load the Mars crater dataset. low_memory=False asks pandas not to parse the
# file in chunks, so each column's dtype is inferred from the whole file.
data = pandas.read_csv('../codebooks/marscrater_pds.csv', low_memory=False)

# Report the overall size of the dataset and list its column names.
observationCount, variableCount = data.shape
print("========= 1. Data Exploration =========")
print("---------- 1.1. General Dataset Characteristics ----------")
print("Number of observations: {}".format(observationCount))
print("Number of variables : {}".format(variableCount))
print("Variables : \n ", "\n ".join(data.columns))
print()
# Summarize the raw longitude column: its extremes and most frequent values.
longitudeColumn = data['LONGITUDE_CIRCLE_IMAGE']
print("---------- 1.2. LONGITUDE_CIRCLE_IMAGE ----------")
print("min: ", longitudeColumn.min())
print("max: ", longitudeColumn.max())
# Occurrences of each distinct longitude, most common first; missing values
# get their own entry because dropna=False.
longitudeCounts = longitudeColumn.value_counts(sort=True, dropna=False)
print("Unique values : {}".format(longitudeCounts.size))
print("Top 5 values and counts : \n", longitudeCounts.iloc[:5])
print()
# Summarize the raw latitude column: its extremes and most frequent values.
latitudeColumn = data['LATITUDE_CIRCLE_IMAGE']
print("---------- 1.3. LATITUDE_CIRCLE_IMAGE ----------")
print("min: ", latitudeColumn.min())
print("max: ", latitudeColumn.max())
# Occurrences of each distinct latitude, most common first; missing values
# get their own entry because dropna=False.
latitudeCounts = latitudeColumn.value_counts(sort=True, dropna=False)
print("Unique values : {}".format(latitudeCounts.size))
print("Top 5 values and counts : \n", latitudeCounts.iloc[:5])
# DIAM_CIRCLE_IMAGE: summarize the raw diameter column (extremes and the most
# frequent values).
print()
print("---------- 1.4. DIAM_CIRCLE_IMAGE ----------")
print("min: ", data['DIAM_CIRCLE_IMAGE'].min())
print("max: ", data['DIAM_CIRCLE_IMAGE'].max())
# Occurrences of each distinct diameter, most common first; missing values
# get their own entry because dropna=False.
diamCounts = data['DIAM_CIRCLE_IMAGE'].value_counts(sort=True, dropna=False)
# The number of distinct diameters is the length of diamCounts (the diameter
# table), not of the latitude table computed in the previous section.
print("Unique values : {}".format(len(diamCounts)))
print("Top 10 values and counts : \n", diamCounts.head(10))
print("========= 2. Data Management =========")
# Coerce the three analysed columns to a numeric dtype; entries that cannot be
# parsed as numbers become NaN (errors='coerce'). pandas.to_numeric is the
# supported replacement for Series.convert_objects(convert_numeric=True),
# which was deprecated and is no longer available in current pandas releases.
for numericColumn in ('LONGITUDE_CIRCLE_IMAGE',
                      'LATITUDE_CIRCLE_IMAGE',
                      'DIAM_CIRCLE_IMAGE'):
    data[numericColumn] = pandas.to_numeric(data[numericColumn], errors='coerce')
print()
print("---------- 2.1. LONGITUDE_CIRCLE_IMAGE ----------")
# Bin edges every 15 degrees: -190, -175, ..., 185.
longitudeBins = list(range(-190, 191, 15))
longitudeValues = data['LONGITUDE_CIRCLE_IMAGE']
# Store each crater's longitude interval on the frame as a new column.
data['LONGITUDE_GROUP'] = pandas.cut(longitudeValues, longitudeBins)
# Absolute counts per bin, and the relative share of each interval group.
longitudeCountBins = longitudeValues.value_counts(bins=longitudeBins, dropna=False)
longitudeGroupPercents = data['LONGITUDE_GROUP'].value_counts(normalize=True, dropna=False)
print("Frequences in bins with length 15: ")
print(longitudeCountBins)
print()
print("Group frequences, %: ")
print(100 * longitudeGroupPercents)
print()
print("After grouping the longitude values, we can observe that relatively uniform")
print("distribution of the craters, although there are areas with smaller 'population'.")
print()
# Histogram of longitudes using the same bin edges as the tables above.
plt.hist(longitudeValues, bins=longitudeBins)
plt.title('Longitude Distribution')
plt.xlabel('Longitude')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
print()
print("---------- 2.2. LATITUDE_CIRCLE_IMAGE ----------")
# Bin edges every 10 degrees: -90, -80, ..., 90.
latitudeBins = list(range(-90, 91, 10))
latitudeValues = data['LATITUDE_CIRCLE_IMAGE']
# Store each crater's latitude interval on the frame as a new column.
data['LATITUDE_GROUP'] = pandas.cut(latitudeValues, latitudeBins)
# Absolute counts per bin, and the relative share of each interval group.
latitudeCountBins = latitudeValues.value_counts(bins=latitudeBins, dropna=False)
latitudeGroupPercents = data['LATITUDE_GROUP'].value_counts(normalize=True, dropna=False)
print("Frequences in bins: ")
print(latitudeCountBins)
print()
print("Group frequences, %: ")
print(100 * latitudeGroupPercents)
print()
print("From the binned latitude values we can see that about 35% of the craters are")
print("located between 0 and -30 degrees.")
# Histogram of latitudes using the same bin edges as the tables above.
plt.hist(latitudeValues, bins=latitudeBins)
plt.title('Latitude Distribution')
plt.xlabel('Latitude')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
print()
print("---------- 2.3. DIAM_CIRCLE_IMAGE ----------")
# Coarse overview: 100-wide bins with edges 0, 100, ..., 1900. Values outside
# the edges are reported under NaN because dropna=False.
print("Frequences in bins range(0,2000, 100): ")
diamCountBins = data['DIAM_CIRCLE_IMAGE'].value_counts(dropna=False, bins=range(0,2000, 100))
print(diamCountBins)
print()
print("Most of the diameter values are in the range (0,100].")
print("Let's refine further the (0,100] range.")
print()
# Refine (0,100] in 5-wide bins. The stop value is 101 so that 100 is the last
# edge; with a stop of 100 the edges end at 95 and the (95,100] interval would
# be counted under NaN instead of getting its own bin.
print("Frequences in bins range(0,101, 5): ")
diamCountBins = data['DIAM_CIRCLE_IMAGE'].value_counts(dropna=False, bins=range(0,101, 5))
print(diamCountBins)
print()
# Hand-picked edges: unit-wide bins up to 12, then progressively wider bins.
diamBins = [0,1,2,3,4,5,6,7,8,9,10,11,12,15,20,40,60,100,500,1000,2000]
print("We can manually split into bins: ")
print(diamBins)
print("With following frequences (in %): ")
# Store each crater's diameter interval on the frame, then report the share
# of craters per interval.
data['DIAM_GROUP'] = pandas.cut(data['DIAM_CIRCLE_IMAGE'], diamBins)
diamCountBins = data['DIAM_GROUP'].value_counts(dropna=False, normalize=True)
print(100*diamCountBins)
# diamBins[1:12] is the edge list 1, 2, ..., 11, so the histogram only shows
# diameters between 1 and 11.
plt.hist(data['DIAM_CIRCLE_IMAGE'], bins=diamBins[1:12])
plt.xlabel('Diameter')
plt.ylabel('Frequency')
plt.title('Diameter Distribution')
plt.grid(True)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment