ejcer · October 24, 2015 20:44
diff --git a/gistfile1.txt b/gistfile1.txt
 # Gather some data that is of interest to you.
 # Analyze the data using a scatterplot matrix. List at least 3 non-trivial
 #findings about the data, along with plot pictures that show the findings.


 ### Imports ###
 import numpy as np
 import pandas as pd
 from pandas import Series, DataFrame
 import os
 # Switch the working directory so the relative './datafiles/...' path used
 # when reading the CSV further down resolves.
 # NOTE(review): this absolute path is machine-specific -- confirm or adjust
 # before running on another machine.
 os.chdir('/home/edward/workspace/school/datavis')
 # The returned path is discarded, so this call does nothing visible when the
 # file is run as a script (presumably left over from interactive use).
 os.getcwd()
 from scipy import stats
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import seaborn as sns
 import statsmodels.api as sm
 from numpy.random import randn


 #
 #I chose the titanic.csv dataset, because the cmda club is using this dataset
 #to learn python and data science. We have not begun to use the dataset in the
 #club, but when we begin, I will be extremely familiar with it. I talked with
 #Dr. Davanloo, and he approved my usage of this dataset for this problem.
 #
 #The titanic.csv file contains 891 records and 12 columns.
 #In summary, the columns list attributes of the passengers who sailed on the
 #Titanic, as well as whether or not each passenger survived the wreckage.
 #
 #The columns of data that I will be focusing on are the following:
 #'Survived': 0 if the passenger did not survive, 1 if the passenger survived
 #'Pclass': The class of the passenger from third to first class.
 #'Age': The age of the passenger
 #'Fare': The amount of money the passenger paid for their passage
 #
 ### Data Reading & Cleaning ###

 #df = sns.load_dataset("iris")

 # Load the passenger records; the relative path resolves against the working
 # directory set by os.chdir() near the top of the script.
 titanic_df = pd.read_csv('./datafiles/titanic.csv')

 # Remove the columns that are excluded from the scatterplot matrix. A single
 # drop(columns=...) call replaces the chained drop(name, 1) calls, whose
 # positional axis argument is not accepted by current pandas releases.
 cleaned_titanic_df = titanic_df.drop(
     columns=['Name', 'Sex', 'Parch', 'Ticket', 'Cabin', 'Embarked'])

 # scatter_matrix is never imported as a bare name in this script, so reach it
 # through the pandas plotting namespace to avoid a NameError.
 pd.plotting.scatter_matrix(cleaned_titanic_df, alpha=0.5, figsize=(5, 5), diagonal='hist')
 ### Scatterplot Analysis ###

 # NOTE: x, y and data are passed by keyword in every lmplot call below.
 # Recent seaborn releases no longer accept them positionally, while the
 # keyword form works on old and new releases alike.

 #This first plot displays the age of the passenger on the y-axis and whether or
 #not the passenger survived on the x-axis. From this plot, one can observe a
 #slight trend where the younger a passenger is, the more likely they are to
 #survive the wreckage. This makes sense, since generally passengers let children
 #on to the lifeboats first.
 sns.lmplot(x='Survived', y='Age', data=titanic_df)

 #The second plot displays the passenger class of the passenger on the y-axis and
 #whether or not the passenger survived on the x-axis. From this plot, one can
 #observe a trend where passengers were more likely to survive if they were
 #in first class. This makes sense, because third class passengers had their
 #sleeping compartments located deeper within the ship, and those compartments
 #flooded sooner than the upper compartments.
 sns.lmplot(x='Survived', y='Pclass', data=titanic_df)

 #The third plot displays the ticket fare paid for the passenger on the y-axis and
 #whether or not the passenger survived on the x-axis. From this plot, one can
 #observe a trend where passengers were more likely to survive if they paid
 #more money for their ticket. This makes sense, because the amount they paid
 #for their ticket likely correlates with what passenger class the passenger was
 #in, and so the lower class passengers were more likely to reside in a
 #compartment that flooded sooner.
 sns.lmplot(x='Survived', y='Fare', data=titanic_df)

 # Display every figure created above.
 plt.show()
	# Gather some data that is of interest to you.
	# Analyze the data using a scatterplot matrix. List at least 3 non-trivial
	#findings about the data, along with plot pictures that show the findings.


	### Imports ###
	import numpy as np
	import pandas as pd
	from pandas import Series, DataFrame
	import os
	# Switch the working directory so the relative './datafiles/...' path used
	# when reading the CSV further down resolves.
	# NOTE(review): this absolute path is machine-specific -- confirm or adjust
	# before running on another machine.
	os.chdir('/home/edward/workspace/school/datavis')
	# The returned path is discarded, so this call does nothing visible when the
	# file is run as a script (presumably left over from interactive use).
	os.getcwd()
	from scipy import stats
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import seaborn as sns
	import statsmodels.api as sm
	from numpy.random import randn


	#
	#I chose the titanic.csv dataset, because the cmda club is using this dataset
	#to learn python and data science. We have not begun to use the dataset in the
	#club, but when we begin, I will be extremely familiar with it. I talked with
	#Dr. Davanloo, and he approved my usage of this dataset for this problem.
	#
	#The titanic.csv file contains 891 records and 12 columns.
	#In summary, the columns list attributes of the passengers who sailed on the
	#Titanic, as well as whether or not each passenger survived the wreckage.
	#
	#The columns of data that I will be focusing on are the following:
	#'Survived': 0 if the passenger did not survive, 1 if the passenger survived
	#'Pclass': The class of the passenger from third to first class.
	#'Age': The age of the passenger
	#'Fare': The amount of money the passenger paid for their passage
	#
	### Data Reading & Cleaning ###

	#df = sns.load_dataset("iris")

	# Load the passenger records; the relative path resolves against the working
	# directory set by os.chdir() near the top of the script.
	titanic_df = pd.read_csv('./datafiles/titanic.csv')

	# Remove the columns that are excluded from the scatterplot matrix. A single
	# drop(columns=...) call replaces the chained drop(name, 1) calls, whose
	# positional axis argument is not accepted by current pandas releases.
	cleaned_titanic_df = titanic_df.drop(
	    columns=['Name', 'Sex', 'Parch', 'Ticket', 'Cabin', 'Embarked'])

	# scatter_matrix is never imported as a bare name in this script, so reach it
	# through the pandas plotting namespace to avoid a NameError.
	pd.plotting.scatter_matrix(cleaned_titanic_df, alpha=0.5, figsize=(5, 5), diagonal='hist')
	### Scatterplot Analysis ###

	# NOTE: x, y and data are passed by keyword in every lmplot call below.
	# Recent seaborn releases no longer accept them positionally, while the
	# keyword form works on old and new releases alike.

	#This first plot displays the age of the passenger on the y-axis and whether or
	#not the passenger survived on the x-axis. From this plot, one can observe a
	#slight trend where the younger a passenger is, the more likely they are to
	#survive the wreckage. This makes sense, since generally passengers let children
	#on to the lifeboats first.
	sns.lmplot(x='Survived', y='Age', data=titanic_df)

	#The second plot displays the passenger class of the passenger on the y-axis and
	#whether or not the passenger survived on the x-axis. From this plot, one can
	#observe a trend where passengers were more likely to survive if they were
	#in first class. This makes sense, because third class passengers had their
	#sleeping compartments located deeper within the ship, and those compartments
	#flooded sooner than the upper compartments.
	sns.lmplot(x='Survived', y='Pclass', data=titanic_df)

	#The third plot displays the ticket fare paid for the passenger on the y-axis and
	#whether or not the passenger survived on the x-axis. From this plot, one can
	#observe a trend where passengers were more likely to survive if they paid
	#more money for their ticket. This makes sense, because the amount they paid
	#for their ticket likely correlates with what passenger class the passenger was
	#in, and so the lower class passengers were more likely to reside in a
	#compartment that flooded sooner.
	sns.lmplot(x='Survived', y='Fare', data=titanic_df)

	# Display every figure created above.
	plt.show()