oserttas-math · December 16, 2017 03:20
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
 # List unique values in a DataFrame column
 # h/t @makmanalp for the updated syntax!
 df['Column Name'].unique()

 # Convert Series datatype to numeric (will error if column has non-numeric values)
 # h/t @makmanalp
 pd.to_numeric(df['Column Name'])

 # Convert Series datatype to numeric, changing non-numeric values to NaN
 # h/t @makmanalp for the updated syntax!
 pd.to_numeric(df['Column Name'], errors='coerce')

 # Grab DataFrame rows where column has certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[df.column.isin(valuelist)]

 # Grab DataFrame rows where column doesn't have certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[~df.column.isin(value_list)]

 # Delete column from DataFrame
 del df['column']

 # Select from DataFrame using criteria from multiple columns
 # (use `|` instead of `&` to do an OR)
 newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

 # Rename several DataFrame columns
 df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
 })

 # Lower-case all DataFrame column names
 df.columns = map(str.lower, df.columns)

 # Even more fancy DataFrame column re-naming
 # lower-case all DataFrame column names (for example)
 df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

 # Loop through rows in a DataFrame
 # (if you must)
 for index, row in df.iterrows():
    print index, row['some column']

 # Much faster way to loop through DataFrame rows
 # if you can work with tuples
 # (h/t hughamacmullaniv)
 for row in df.itertuples():
    print(row)

 # Next few examples show how to work with text data in Pandas.
 # Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html

 # Slice values in a DataFrame column (aka Series)
 df.column.str[0:2]

 # Lower-case everything in a DataFrame column
 df.column_name = df.column_name.str.lower()

 # Get length of data in a DataFrame column
 df.column_name.str.len()

 # Sort dataframe by multiple columns
 df = df.sort(['col1','col2','col3'],ascending=[1,1,0])

 # Get top n for each group of columns in a sorted dataframe
 # (make sure dataframe is sorted first)
 top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)

 # Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]

 # Select from DataFrame using multiple keys of a hierarchical index
 df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))

 # Change all NaNs to None (useful before
 # loading to a db)
 df = df.where((pd.notnull(df)), None)

 # More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
 # from strings and changing any empty values to None
 # (not especially recommended but including here b/c I had to do this in real life one time)
 df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

 # Get quick count of rows in a DataFrame
 len(df.index)

 # Pivot data (with flexibility about what what
 # becomes a column and what stays a row).
 # Syntax works on Pandas >= .14
 pd.pivot_table(
  df,values='cell_value',
  index=['col1', 'col2', 'col3'], #these stay as columns; will fail silently if any of these cols have null values
  columns=['col4']) #data values in this column become their own column

 # Change data type of DataFrame column
 df.column_name = df.column_name.astype(np.int64)

 # Get rid of non-numeric values throughout a DataFrame:
 for col in refunds.columns.values:
  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

 # Set DataFrame column values based on other column values (h/t: @mlevkov)
 df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value

 # Clean up missing values in multiple DataFrame columns
 df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
 })

 # Concatenate two DataFrame columns into a new, single column
 # (useful when dealing with composite keys, for example)
 # (h/t @makmanalp for improving this one!)
 df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)

 # Doing calculations with DataFrame columns that have missing values
 # In example below, swap in 0 for df['col1'] cells that contain null
 df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

 # Split delimited values in a DataFrame column into two new columns
 df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

 # Collapse hierarchical column indexes
 df.columns = df.columns.get_level_values(0)

 # Convert Django queryset to DataFrame
 qs = DjangoModelName.objects.all()
 q = qs.values()
 df = pd.DataFrame.from_records(q)

 # Create a DataFrame from a Python dictionary
 df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])

 # Get a report of all duplicate records in a dataframe, based on specific columns
 dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)]

 # Set up formatting so larger numbers aren't displayed in scientific notation (h/t @thecapacity)
 pd.set_option('display.float_format', lambda x: '%.3f' % x)
	# List unique values in a DataFrame column
	# h/t @makmanalp for the updated syntax!
	df['Column Name'].unique()

	# Convert Series datatype to numeric (will error if column has non-numeric values)
	# h/t @makmanalp
	pd.to_numeric(df['Column Name'])

	# Convert Series datatype to numeric, changing non-numeric values to NaN
	# h/t @makmanalp for the updated syntax!
	pd.to_numeric(df['Column Name'], errors='coerce')

	# Grab DataFrame rows where column has certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[df.column.isin(valuelist)]

	# Grab DataFrame rows where column doesn't have certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[~df.column.isin(value_list)]

	# Delete column from DataFrame
	del df['column']

	# Select from DataFrame using criteria from multiple columns
	# (use `\|` instead of `&` to do an OR)
	newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

	# Rename several DataFrame columns
	df = df.rename(columns = {
	'col1 old name':'col1 new name',
	'col2 old name':'col2 new name',
	'col3 old name':'col3 new name',
	})

	# Lower-case all DataFrame column names
	df.columns = map(str.lower, df.columns)

	# Even more fancy DataFrame column re-naming
	# lower-case all DataFrame column names (for example)
	df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

	# Loop through rows in a DataFrame
	# (if you must)
	for index, row in df.iterrows():
	print index, row['some column']

	# Much faster way to loop through DataFrame rows
	# if you can work with tuples
	# (h/t hughamacmullaniv)
	for row in df.itertuples():
	print(row)

	# Next few examples show how to work with text data in Pandas.
	# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html

	# Slice values in a DataFrame column (aka Series)
	df.column.str[0:2]

	# Lower-case everything in a DataFrame column
	df.column_name = df.column_name.str.lower()

	# Get length of data in a DataFrame column
	df.column_name.str.len()

	# Sort dataframe by multiple columns
	df = df.sort(['col1','col2','col3'],ascending=[1,1,0])

	# Get top n for each group of columns in a sorted dataframe
	# (make sure dataframe is sorted first)
	top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)

	# Grab DataFrame rows where specific column is null/notnull
	newdf = df[df['column'].isnull()]

	# Select from DataFrame using multiple keys of a hierarchical index
	df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))

	# Change all NaNs to None (useful before
	# loading to a db)
	df = df.where((pd.notnull(df)), None)

	# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
	# from strings and changing any empty values to None
	# (not especially recommended but including here b/c I had to do this in real life one time)
	df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

	# Get quick count of rows in a DataFrame
	len(df.index)

	# Pivot data (with flexibility about what what
	# becomes a column and what stays a row).
	# Syntax works on Pandas >= .14
	pd.pivot_table(
	df,values='cell_value',
	index=['col1', 'col2', 'col3'], #these stay as columns; will fail silently if any of these cols have null values
	columns=['col4']) #data values in this column become their own column

	# Change data type of DataFrame column
	df.column_name = df.column_name.astype(np.int64)

	# Get rid of non-numeric values throughout a DataFrame:
	for col in refunds.columns.values:
	refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

	# Set DataFrame column values based on other column values (h/t: @mlevkov)
	df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value

	# Clean up missing values in multiple DataFrame columns
	df = df.fillna({
	'col1': 'missing',
	'col2': '99.999',
	'col3': '999',
	'col4': 'missing',
	'col5': 'missing',
	'col6': '99'
	})

	# Concatenate two DataFrame columns into a new, single column
	# (useful when dealing with composite keys, for example)
	# (h/t @makmanalp for improving this one!)
	df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)

	# Doing calculations with DataFrame columns that have missing values
	# In example below, swap in 0 for df['col1'] cells that contain null
	df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

	# Split delimited values in a DataFrame column into two new columns
	df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

	# Collapse hierarchical column indexes
	df.columns = df.columns.get_level_values(0)

	# Convert Django queryset to DataFrame
	qs = DjangoModelName.objects.all()
	q = qs.values()
	df = pd.DataFrame.from_records(q)

	# Create a DataFrame from a Python dictionary
	df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])

	# Get a report of all duplicate records in a dataframe, based on specific columns
	dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)]

	# Set up formatting so larger numbers aren't displayed in scientific notation (h/t @thecapacity)
	pd.set_option('display.float_format', lambda x: '%.3f' % x)
No results found