@naranjja
Created April 20, 2018 16:56
Some Python/Pandas/Jupyter tricks
# effectively cast a boolean column to integer
df3['species'] = df3['species'] * 1 # the multiplication broadcasts over the whole series: True * 1 = 1, False * 1 = 0
df3.head()
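a minimal sketch of the cast, using a made-up df3 (the real one in this gist holds iris data):

```python
import pandas as pd

# hypothetical frame with a boolean column
df3 = pd.DataFrame({"species": [True, False, True]})

# multiplying by 1 broadcasts over the series and casts bool to int
df3["species"] = df3["species"] * 1
```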
# if you are confused about the difference between concat, merge and join
# you can use these examples to save some time
# append columns side by side; rows are aligned on the index in both cases,
# ignore_index=True just resets the resulting column labels to 0, 1, 2, ...
df3 = pd.concat([df1, df2], axis=1, ignore_index=True)
# ignore_index=False keeps the original column names
df3 = pd.concat([df1, df2], axis=1, ignore_index=False)
# join columns on the index (not gluing: rows are matched by index label)
df3 = df1.join(df2, how='inner', lsuffix='_df1', rsuffix='_df2') # suffixes must be specified for clashing columns
# join dataframes, specify strategy, specify index (this is the easiest)
df3 = df1.merge(df2, how='left', on='some_id')
# append rows (stack rows together)
df3 = pd.concat([df1, df2], axis=0)
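a small sketch contrasting concat and merge, on two hypothetical frames sharing a key column (names are made up):

```python
import pandas as pd

df1 = pd.DataFrame({"some_id": [1, 2], "a": ["x", "y"]})
df2 = pd.DataFrame({"some_id": [2, 3], "b": ["p", "q"]})

# concat with axis=0 just stacks the rows; no key matching happens
stacked = pd.concat([df1, df2], axis=0, ignore_index=True)

# merge actually matches rows on the key
matched = df1.merge(df2, how="left", on="some_id")
```

rows of df1 with no partner in df2 get NaN in the merged columns, which is how a left join behaves.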
from gc import collect
some_variable = "some value"
# use these lines to delete a Python object and then garbage collect the mess
del some_variable
collect()
# seaborn is a visualization library that comes preinstalled with some toy data
# use these lines to load the iris dataset into a pandas dataframe for example
import seaborn as sns
df = sns.load_dataset('iris')
# sometimes you want to iterate some df
# you can use an iterator
for index, row in df.iterrows():
    print(index, row['sepal_length'], row['sepal_width'])
# however, iterrows yields a copy of each row, so writing to it does not modify the df!
# so if you want to do something like creating a new column,
for index, row in df.iterrows():
    row['meters'] = row['cm'] / 100
# you are modifying this scoped row, but not the actual row
# also, if you wanted to modify an existing row,
for index, row in df.iterrows():
    row['x'] += 1
# you would still be modifying a copy
# for these things you would use apply
df['meters'] = df['cm'].apply(lambda x: x / 100)
df.loc[:, 'x'] = df['x'].apply(lambda x: x + 1)
# iterrows iterates the df row by row, each row being a Series
# a Series wraps a numpy array plus an index, so building one per row is costly
# there is a faster way to iterate: namedtuples
# if you only need to read values, itertuples is often an order of magnitude faster
for row in df.itertuples():
    print(row.sepal_length, row.sepal_width)
# tuples are a little bit tricky to work with though, be warned
# they are also copies and not the original df
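a short sketch of attribute access on the namedtuples, with a toy two-row frame:

```python
import pandas as pd

df = pd.DataFrame({"sepal_length": [5.1, 4.9], "sepal_width": [3.5, 3.0]})

# itertuples yields namedtuples; columns become attributes
lengths = [row.sepal_length for row in df.itertuples()]

# the index is included by default; pass index=False to drop it
widths = [row.sepal_width for row in df.itertuples(index=False)]
```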
# everyone gets confused about when to use loc
# especially because of SettingWithCopyWarning
# loc allows you to localize a subset of a dataframe using a label
# whether by row,
df.loc["some-row"] # get a series of all columns from row "some-row"
# by column,
df.loc[:, "some-column"] # get a series of all rows in column "some-column"
# or both,
df.loc["some-row", "some-column"] # get a scalar (the value in that x-y coordinate)
# iloc is another tool you can use
# it is the same as loc but uses integer positions instead of labels
# whether by row,
df.iloc[3] # get a series of all columns from the fourth row
# by column,
df.iloc[:, 4] # get a series of all rows in the fifth column
# or both,
df.iloc[3, 4] # get a scalar (the value in that x-y coordinate)
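a runnable sketch of positional indexing, on a made-up two-column frame:

```python
import pandas as pd

df = pd.DataFrame({"a": [10, 20, 30, 40], "b": [1, 2, 3, 4]})

fourth_row = df.iloc[3]     # fourth row as a Series (positions start at 0)
second_col = df.iloc[:, 1]  # second column as a Series
scalar = df.iloc[3, 1]      # single value at row position 3, column position 1
```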
# sometimes you want to get more than one column,
df.loc[:, ["column3", "column5"]] # get a dataframe
# and sometimes more than one row
df.loc[["row4", "row6"]] # get a dataframe
# sometimes you want to specify a condition,
# for this, loc can take a boolean selector: one True/False per row, aligned with the index
# for example, if you run
df.loc[mask, "column6"] # you would get a series of the rows in column6 where mask is True
# mask here is a boolean Series; a bare scalar True will not work
# this is a very powerful concept because that means that you can use a condition there
# conditions return booleans
# for example, if you run
df.loc[df["age"] > 18, "column7"]
# you would get a series of all rows in column7 where age is greater than 18
# this works because df["age"] > 18 returns a boolean array
# something like this:
df.loc[[True, True, False, True, False], "column17"]
# and pandas understands that as a dataframe location
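a minimal sketch of boolean-mask selection; the frame and column names below are made up to mirror the snippet:

```python
import pandas as pd

df = pd.DataFrame({"age": [15, 30, 42], "column7": ["a", "b", "c"]})

mask = df["age"] > 18             # a boolean Series aligned with df's index
adults = df.loc[mask, "column7"]  # only the rows where the mask is True
```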
# additionally, you can use loc to set values on the original df
df.loc[:, "column9"] = "some_value"
# for a whole column this is equivalent to the plain form below
df["column9"] = "some_value"
# the plain form becomes risky once selections are chained, because it gets ambiguous
# this makes all rows with age > 18 have "adult" as the value of "client_type"
df.loc[df["age"] > 18, "client_type"] = "adult"
# which is not equivalent to this
df[df["age"] > 18]["client_type"] = "adult"
# although it looks like they are the same,
df[df["age"] > 18] # returns a copy, we can call this temp_df
temp_df["client_type"] = "adult" # modifies temp_df but not the original df
df3['species'] = df3['species'].map({'virginica': 1}) # map some value to a new value
# the mapping must cover every value; values not in the dict become NaN
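a tiny sketch of the NaN fallout, using two iris species names:

```python
import pandas as pd

s = pd.Series(["virginica", "setosa"])
mapped = s.map({"virginica": 1})  # "setosa" is absent from the dict, so it becomes NaN
```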
# use these lines at the top of a Jupyter Notebook to show all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import numpy as np
import pandas as pd
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
# bin continuous data, whether into a number of equal-width bins
s = pd.cut(np.array(data), bins=3, labels=["good", "medium", "bad"])
# or into a number of quantiles (equal-count bins)
s = pd.qcut(np.array(data), q=3, labels=["good", "medium", "bad"])
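a sketch of how the two binning strategies differ on the same data (neutral labels are made up here):

```python
import numpy as np
import pandas as pd

data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]

# cut splits the value *range* into equal-width bins,
# so skewed data piles up in one bin
by_width = pd.cut(np.array(data), bins=3, labels=["low", "mid", "high"])

# qcut splits by quantiles, so each bin gets roughly the same number of items
by_quantile = pd.qcut(np.array(data), q=3, labels=["low", "mid", "high"])
```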