Useful Pandas Snippets
Forked from fomightez/useful_pandas_snippets.py
# -*- coding: utf-8 -*-
# import the pandas and numpy packages
import numpy as np
import pandas as pd
# List unique values in a DataFrame column
df['Column Name'].unique()
# To extract a specific column (subset the dataframe), you can use [ ] (brackets) or attribute notation.
df.height
df['height']
# are the same thing!!! (from http://www.stephaniehicks.com/learnPython/pages/pandas.html
# -or-
# http://www.datacarpentry.org/python-ecology-lesson/02-index-slice-subset/)
# Quick overview of DataFrame
df.describe()
# Display data types in DataFrame
df.dtypes
# Change order of columns in DataFrame
df = df[['C', 'B', 'A']] # It will have defaulted to 'A B C' order (I think), see https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
# When adding a new column -- "By default, columns get inserted at the end. The
# insert function is available to insert at a particular location in the columns:"
df.insert(2, 'mean', df.mean(1)) # insert a 'mean' column as the third column
# Convert Series datatype to numeric (will error if column has non-numeric values)
pd.to_numeric(df['Column Name'])
# Convert Series datatype to numeric, changing non-numeric values to NaN
pd.to_numeric(df['Column Name'], errors='coerce')
# Use that conversion in a dataframe
df['Column Name'] = df['Column Name'].apply(pd.to_numeric, errors='coerce')
# Grab DataFrame rows where column has certain values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Grab DataFrame rows where column doesn't have certain values
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]
# (`~` inverts the boolean values; it is similar to using a `not` in a conditional expression.)
# Grab DataFrame rows where column matches at least part of a string in a list
df = df[df.column.str.contains(pattern)]
# Example of use
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
pattern = '|'.join(['one', 'two'])
df = df[df.B.str.contains(pattern)]
# `str.startswith` is related to looking for text in a string in a column (see below)
# Select rows containing certain values from pandas dataframe IN ANY COLUMN
df[df.values == 'X'].dropna(how='all') # this one makes multiple copies of a row show up if the value occurs multiple times in that row
df[df.isin(['X'])].dropna(how='all') # BEST; this one works better if multiple occurrences can be in the same row
# based on https://stackoverflow.com/questions/38185688/select-rows-containing-certain-values-from-pandas-dataframe
# Remove / delete rows where a condition or conditions are met
df = df.drop(df[df.score < 50].index)
# can be done in place
df.drop(df[df.score < 50].index, inplace=True)
# use booleans to enforce multiple conditions
df = df.drop(df[(df.score < 50) & (df.score > 20)].index)
# remove all but one column, dropping the rest
sub_df = df[['column_a']]
# similarly, to limit to just a few columns (subset), add multiple columns in the bracketed list
sub_df = df[['column_a','column_b']]
# see more about dropping a column below under 'Delete column from DataFrame'
# Select from DataFrame using criteria from multiple columns (multiple condition expression), i.e., filter / subset on multiple conditions
# (use `|` instead of `&` to do an OR)
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
# other examples
df[(df['nationality'] == "USA") & (df['age'] > 50)] # note each condition needs its own parentheses because `&` binds tighter than `>`; https://chrisalbon.com/python/data_wrangling/pandas_selecting_rows_on_conditions/
df[df['first_name'].notnull() & (df['nationality'] == "USA")] # df['first_name'].notnull() is already Boolean, so it needs no comparison; https://chrisalbon.com/python/data_wrangling/pandas_selecting_rows_on_conditions/
hits_df = blast_df[(blast_df['sstart'] <= midpt) & (midpt <= blast_df['send'])] # detecting if in a range / interval, for example
# SEMI-RELATED: if a single column is involved and the text entries share a pattern,
# like you want all rows that contain `text`, e.g., rows with `texta`, `textb`, etc.,
# you can use `column.str.contains(pattern)`
df = df[df.column.str.contains(pattern)] # See above
# Also SEMI-RELATED: if you need multiple string matches in a single column you can use
# the `Grab DataFrame rows where column has certain values` approach (SEE ABOVE)
# or combine it to expand on the `Select from DataFrame using criteria from multiple columns`
# approach with `newdf = df[(df['column_one']>2004) & (df['column_two']==9)]`, like:
valuelist1 = ['value1', 'value2', 'value3']
valuelist2 = ['value4', 'value5', 'value6']
newdf = df[(df.column.isin(valuelist1)) & (df.column.isin(valuelist2))]
# using startswith in selection
df = df[df['gene'].str.startswith("snR17")]
# to combine that with making case not matter, lower-case (or upper-case) the column first;
# the needed chaining wasn't obvious to me
df = df[df['gene'].str.lower().str.startswith("snr17")] # solution from https://stackoverflow.com/a/22909357/8508004; a regex solution offered there failed
# The original answer was a fix for using this with `.contains`
# Also SEMI-RELATED: if you are using a conditional to extract rows to a new dataframe
# and you want the first row (a.k.a. the top row) (or you know there should only be one) and you want a value in that row:
new_df = df[df.gene == "test4"] # conditional narrows to just those with "test4"
new_df.iloc[0].FoldChange # iloc[0] specifies the first row and then `.FoldChange` or ["FoldChange"] selects the column
# see more on `.iloc` at https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
# I tried using `.iloc` to update a copy of a dataframe but it didn't work; this approach did, based on
# http://pandas.pydata.org/pandas-docs/stable/indexing.html#evaluation-order-matters:
# copy the dataframe to avoid `SettingWithCopyWarning`, see
# https://www.dataquest.io/blog/settingwithcopywarning/
updated_sites_df = sites_df.copy()
for indx,sites_row in sites_df.iterrows():
    if sites_row.olap_state == 'closest':
        # process row where 'overlapping' gene/feature not identified
        start = sites_row.start
        end = sites_row.end
        id = sites_row.sys_gene_id
        closest,pos_val = identify_closest_gene_or_feature(
            id,int(start),int(end),genes_df,avg_gene_and_feature_size)
        #updated_sites_df.iloc[indx].overlapping_or_closest_gene = closest # even with copying, these cause `SettingWithCopyWarning` and, even more problematic, don't make the needed updates.
        #updated_sites_df.iloc[indx].position = pos_val # even with copying, these cause `SettingWithCopyWarning` and, even more problematic, don't make the needed updates.
        # Approach based on http://pandas.pydata.org/pandas-docs/stable/indexing.html#evaluation-order-matters
        # worked to update values in a dataframe, but still showed the warning:
        #updated_sites_df['overlapping_or_closest_gene'][indx] = closest # still gives warning, but works
        #updated_sites_df['position'][indx] = pos_val # still gives warning, but works
        # Works with no warning, as prescribed at https://www.dataquest.io/blog/settingwithcopywarning/
        # at the end of the 'Chained assignment' section
        updated_sites_df.loc[indx,'overlapping_or_closest_gene'] = closest
        updated_sites_df.loc[indx,'position'] = pos_val
# Reorder rows based on values in a column when you know the order you want
df = pd.DataFrame(list(categorization.items()),columns = ['category','residue_positions'])
# That works, but I want the categories with the most conserved as the top line and
# `not_conserved` on the bottom.
# Because the dictionary will have these in arbitrary order, I
# cannot simply base the order on what I saw in development. More robust is
# to extract what the `new_index` order should be:
#print(df) # FOR DEBUGGING ONLY
default_indx = {}
for i, row in df.iterrows():
    default_indx[row.category] = i
new_index = ([default_indx['identical'],
              default_indx['strongly_similar'],
              default_indx['weakly_similar'],
              default_indx['not_conserved'],])
df = df.reindex(new_index) # based on
# https://stackoverflow.com/a/30010004/8508004
df = df.reset_index(drop=True)
#print(df) # FOR DEBUGGING ONLY
# Delete column from DataFrame
del df['column']
#-or-
df = df.drop('column', axis=1)
# see https://stackoverflow.com/a/18145399/8508004
#-or-
df = df.drop('reports', axis=1)
# see https://chrisalbon.com/python/data_wrangling/pandas_dropping_column_and_rows/, but note
# that, unlike shown there, it now seems you need to assign the result (as in the examples above) to see the change
# Add a column to a DataFrame with the same contents in each row
df["new_column_label"] = "String_entry"
df["new_column_label2"] = 0
# also works for setting the value of all rows in an existing column to the same thing, see https://stackoverflow.com/a/44723277/8508004
# Note if you do this after you made a new dataframe from a subset of another, you may see a `SettingWithCopyWarning:` warning,
# because Pandas is just being cautious, see https://stackoverflow.com/questions/42105859/pandas-map-to-a-new-column-settingwithcopywarning
# ; could fix with the approach [here](https://stackoverflow.com/a/45885310/8508004) if important, like:
# df = df.assign(signal= 'yes')
# See `.append` below for ADDING ROWS.
# Related: you can add a column with different contents in each
# row WITHOUT USING APPLY if you use itertuples or iterrows to
# build a list with the same number of items as the length of
# the dataframe and then add it as a new column (see the sketch below)
df["ids"] = list_of_ids
# see https://www.dataquest.io/blog/settingwithcopywarning/ for a better understanding of `SettingWithCopyWarning:` warnings.
# Rename a DataFrame column / rename column
df.rename(columns={'old_name':'new_name'}, inplace=True)
# see https://stackoverflow.com/questions/33727667/pandas-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy-of-a-slice
# because with newer Pandas and Python 3 I am seeing a warning when using `inplace`
# Better(?):
df2 = df.rename(columns={'old':'new'})
#-or-, it even seems to work as
df = df.rename(columns={'old':'new'})
# Rename several DataFrame columns
df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
})
# or use `,inplace=True` without `df =`
# Lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)
# Even more fancy DataFrame column re-naming,
# e.g., keep only the part of each column name after the last '.'
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
# Loop through rows in a DataFrame
# (if you must)
for index, row in df.iterrows():
    print(index, row['some column'])
# Much faster way to loop through DataFrame rows
# if you can work with tuples (iterate rows more efficiently)
# (h/t hughamacmullaniv)
for row in df.itertuples():
    print(row)
# see more about itertuples below
# Next few examples show how to work with text data in Pandas.
# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
# Slice values in a DataFrame column (aka Series)
df.column.str[0:2]
# Lower-case everything in a DataFrame column
df.column_name = df.column_name.str.lower()
# Get length of data in a DataFrame column
df.column_name.str.len()
# Make a column of a dataframe a Python list (df column --> to list)
lizt = df["col1"].tolist()
# Sort dataframe by multiple columns
df = df.sort_values(['col1','col2','col3'],ascending=[1,1,0])
# see `df = df.reset_index(drop=True)` in relation to this
# Sort on one column
df.sort_values('Col_name1', ascending=False, inplace=True)
# If the column you are trying to sort on is a multi-level / hierarchical column, use
# the full tuple with all levels to reference it, like
# `df.sort_values(('tmn1-5001','p-value'), ascending=False, inplace=True)`
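# A minimal sketch of that multi-level column sort (the data and the
# ('tmn1-5001', ...) column tuples are hypothetical, just for illustration):
import pandas as pd
df = pd.DataFrame([[0.03, 1.2], [0.001, 0.8]],
                  columns=pd.MultiIndex.from_tuples(
                      [('tmn1-5001', 'p-value'), ('tmn1-5001', 'fold change')]))
df.sort_values(('tmn1-5001', 'p-value'), ascending=False, inplace=True)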
# Get top n for each group of columns in a sorted dataframe
# (make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# more on sorting at the useful Jupyter notebook below, EXCEPT note `sort` there is now `sort_values`:
# http://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb#Sorting-and-Reindexing-DataFrames
# Grab DataFrame rows where a specific column is null/notnull
newdf = df[df['column'].isnull()]
# Select from DataFrame using multiple keys of a hierarchical index
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
# Change all NaNs to None (useful before
# loading to a db)
df = df.where((pd.notnull(df)), None)
# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
# from strings and changing any empty values to None
# (not especially recommended but including here b/c I had to do this in real life one time)
df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)
# Get quick count of rows in a DataFrame
len(df.index)
len(df) # <---I find this even quicker.
# change a column into the index of the dataframe:
df = df.set_index('column_name') # see http://stackoverflow.com/questions/10457584/redefining-the-index-in-a-pandas-dataframe-object
# renumber the index, useful after removing items from one dataframe to make another
df = df.reset_index(drop=True)
# use `drop=True` to not keep the old index, see https://stackoverflow.com/a/20491748/8508004
# Pivot data (with flexibility about what
# becomes a column and what stays a row).
# Syntax works on Pandas >= 0.14
pd.pivot_table(
    df,values='cell_value',
    index=['col1', 'col2', 'col3'], # these stay as columns; will fail silently if any of these cols have null values
    columns=['col4']) # data values in this column become their own column
# example of re-orienting a dataframe, based on https://stackoverflow.com/questions/28337117/how-to-pivot-a-dataframe-in-pandas
reoriented_df = pd.pivot_table(count_of_types_df, values = 'count', columns = 'qseqid').reset_index()
reoriented_df = reoriented_df[["G1","V1","M1","M7'","M8","M9'","M11''","M15","M14"]]
reoriented_df["TOTAL"] = reoriented_df.sum(1)
# that was AFTER the below had generated counts for BLAST results
count_of_types_df = blast_df['qseqid'].value_counts().reset_index()
count_of_types_df.columns = ['qseqid', 'count']
# Change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric values throughout a DataFrame:
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
# Do find/replace on a string throughout a DataFrame
df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
# to restrict changes to a specific column, you can do
df.the_col = df.the_col.replace({'OLD_TEXT': 'NEW_TEXT'})
# more replace examples (note each bracket replacement needs its own `.str`)
df.col_x.str.replace(r'\[', '(').str.replace(r'\]', ')')
df.col_x.str.replace(r'[\[\]]', '')
# Do find/replace on a string restricted to a column, using regex
# 'ZEB1/ZEB1_cerevisiae_extracted.clustal' ---> 'ZEB1'
df['col_name_here'].replace({r"(ZEB\d)/.*": "\\1"}, regex=True, inplace=True) # see https://stackoverflow.com/a/41473130/8508004
#-or
#df['col_name_here'].replace({r"(ZEB\d)/.*": r"\1"}, regex=True, inplace=True) # see https://stackoverflow.com/a/41473130/8508004
# Set DataFrame column values based on other column values (h/t: @mlevkov), i.e., change values
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
df.loc[(df['column1'] == some_value), ['column_to_change']] = new_value
df1.loc[df1['stream'] == 2, 'feat'] = 10
df1.loc[df1['stream'] == 2, ['feat','another_feat']] = 'aaaa'
# conditional value setting of a new column
df.loc[df.col > 0, 'new_col'] = 'positif'
df.loc[~(df.col > 0), 'new_col'] = 'negatif' # note the parentheses; a bare `~` binds tighter than `>` (this version also matches 0)
df.loc[df.col < 0, 'new_col'] = 'negatif' # alternative that leaves rows equal to 0 untouched
# Clean up missing values in multiple DataFrame columns
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example)
# (h/t @makmanalp for improving this one!)
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
# Concatenate / combine multiple dataframes, without regard to index, for each grouping
df = pd.concat([df1,df2], ignore_index=True)
# I use this 'concat/ignore_index=True' approach often when 'stacking' two dataframes that have the same columns
# Similarly, `.concat()` is also great for combining dataframes into one when they are in a list and all have the same columns.
df = pd.concat(list_of_dataframes) # an example if you don't care how the indexes are left in the resulting dataframe
# Merge / combine / join / concatenate multiple dataframes
new_df = pd.merge(df1,df2,on='gene')
# For two or more (especially FOR MORE)...
# For cleaner looks you can chain them, https://stackoverflow.com/questions/23668427/pandas-joining-multiple-dataframes-on-columns#comment36377024_23671390
new_df = df1.merge(df2,on='gene').merge(df3,on='gene')
# limit the merging / combining / joining to certain columns of the contributing dataframes
new_df = pd.merge(df1[['col1','col4']],df2[['col1','col4']],on='gene')
#-OR-
new_df = df1[['col1','col4']].merge(df2[['col1','col4']],on='gene')
# Combining merge (with extracting) and renaming columns for better tracking of the source in the new dataframe
df_m = pd.merge(df1[['gene','column_name']].rename(columns = {'column_name' : 'New_name'}), mitoWTRep3_df[['gene','column_name']].rename(columns = {'column_name' : 'New_name2'}), on = 'gene')
# Note this is an example of a way to extract just two of the columns from a dataframe that had more columns than those two, to make the new dataframe.
# Or do the renaming and combining this way:
df = pd.concat([s3, s4, s5], axis=1, keys=['red','blue','yellow'])
# "A fairly common use of the keys argument is to override the column names when creating a new DataFrame based on existing
# Series. Notice how the default behaviour consists on letting the resulting DataFrame inherit the parent Series' name, when these existed."
# -or-
pd.concat(dict(df1 = df1, df2 = df2),axis=1) # from https://stackoverflow.com/a/15990537
# from http://pandas-docs.github.io/pandas-docs-travis/merging.html#more-concatenating-with-group-keys
# example there places the two dataframes side-by-side (example there adds multi-level columns to distinguish),
# instead of stacking (see above for stacking)
# -or-
pd.concat((df1, df2),axis=1) # to not make multi-level column names, but place side-by-side, otherwise similar to
# http://pandas-docs.github.io/pandas-docs-travis/merging.html#more-concatenating-with-group-keys
# Set up / Start / initialize a dataframe with certain columns for subsequently adding rows
df = pd.DataFrame(columns=['col1','col2','col3','col4'])
# and add rows to it (one way to do it; on issues with `.append` when iterating over a list of dataframes, see `z_issues_and_workarounds.py`)
df = df.append(
    {'col1':'string1','col2':value,'col3':value2,'col4':value3},
    ignore_index=True) # based on http://pandas.pydata.org/pandas-docs/stable/merging.html#appending-rows-to-a-dataframe
# Building a dataframe a row at a time with append like that is not the recommended way, see https://stackoverflow.com/a/25376997/8508004 (VERY SLOW!),
# but I found it worked when iterating over a list of dataframes, see `z_issues_and_workarounds.py`
df.loc[len(df)]=['Text_for_Col1','Text_for_Col2','Text_for_Col3', a_value]
# The recommended way is at https://stackoverflow.com/a/17496530/8508004 (build a list of rows, then
# make the DataFrame once at the end), but I don't know how amenable that
# is to cases where you iterate over several DataFrames; a sketch follows:
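# A minimal sketch of that recommended pattern (collect plain Python rows in
# a list, then construct the DataFrame once); the row values here are hypothetical:
import pandas as pd
rows = []
for i in range(3):
    rows.append({'col1': 'string{}'.format(i), 'col2': i, 'col3': i * 2, 'col4': i ** 2})
df = pd.DataFrame(rows, columns=['col1', 'col2', 'col3', 'col4'])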
# Doing calculations with DataFrame columns that have missing values
# In the example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
# set a column value based on another column value
df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
# apply a function that uses the value in a single column to each row of a dataframe, placing the result in a new column
df['new col'] = df['col1'].apply(<user_defined_function>)
# I think if you used the same column name it would replace that column. # based on
# http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/
# "This is useful when cleaning up data - converting formats, altering values etc."
# Similar to the last example, but calculating with more than one column
import statistics
df['std dev'] = df[['col1_name','col2_name']].apply(statistics.pstdev, axis=1)
def midpoint(items):
    '''
    takes an iterable of items and returns the midpoint (integer) of the first
    and second values
    '''
    return int((int(items[0])+int(items[1]))/2)
df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)
# apply a function to each row of a dataframe
df = df.apply(<user_defined_function>, axis=1)
# `axis=0` for across columns
# some functions, like sum, mean, max, min, etc., are built in to Pandas and allow a shorthand call with the axis
df["mean"] = df.mean(1)
# adds a column of the mean across the row to each row in a dataframe
# `axis=0` for down columns
# another example
df_c['mean'] = df[['col1_name','col2_name']].mean(1)
# an example for down the columns
avg_length = df.mean(0).length # see next too
avg_length = df[['length']].mean(0).length # same thing as above but subset first
# Pandas has mode but it is presently awkward about what it requires and returns. In this
# example 'length' is a column name. (based on https://stackoverflow.com/a/52532322/8508004 );
# it wouldn't take axis as input used this way, and without the `[0]` it returned a Series.
the_mode_of_length_column = df.length.mode()[0]
# Use `.apply()` to return multiple columns. Example also illustrates passing additional info
# during use of `.apply()` using `args`. Returning multiple columns based on https://stackoverflow.com/a/43770075/8508004 .
# `args` passes additional positional arguments to the applied function.
def example(row_items, info_dict):
    '''
    Toy example.
    Takes a dataframe row that contains a midpoint value and a 'num' value within
    it and a dict where keys are midpoints and
    values are a list (for this toy example) of extra info about each midpoint
    (actually correspond to info from rows of a different dataframe).
    Returns the row with multiple new columns added.
    based on https://stackoverflow.com/a/43770075/8508004
    '''
    smallest_difference = float('Inf')
    closest_midpt = None
    for midpt in info_dict:
        if abs(row_items.midpoint - midpt) < smallest_difference:
            smallest_difference = abs(row_items.midpoint - midpt)
            closest_midpt = midpt # track which midpoint was closest, for use below
    row_items['other_item_added_to_sq'] = row_items['num']**2 + info_dict[closest_midpt][0]
    row_items['other_item_added_to_cubed'] = row_items['num']**3 + info_dict[closest_midpt][1]
    row_items['other_item_added_to_4power'] = row_items['num']**4 + info_dict[closest_midpt][2]
    return row_items
df = df.apply(example, args=(other_info_dict,), axis=1)
# Split delimited values in a DataFrame column into two new columns
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
# Split a Pandas DataFrame into two random subsets: (from https://twitter.com/python_tip/status/951829597523456000)
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)
# Collapse hierarchical (multilevel / multi-level) column indexes
df.columns = df.columns.get_level_values(0)
# Make a dataframe that is a count of the frequency of items (moving towards a distribution accounting)
df = df['amount rows shifted'].value_counts().reset_index() # the values in the
# column ('amount rows shifted' in this case) were integers, but they could be strings or even floats, see https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html
df.columns = ['amount rows shifted', 'count'] # FIX OF COLUMN NAMES AFTER, based on https://stackoverflow.com/a/35893235/8508004
# END ABOVE IF YOU JUST NEED A DATAFRAME.
# Note though, if you just want a plot of the amount of rows shifted, you can use the below, assuming there is only one column and it corresponds to the amount rows shifted:
ax = df.hist();
ax[0][0].set_xlabel("shift") # from https://stackoverflow.com/a/43239901/8508004
ax[0][0].set_ylabel("count") # from https://stackoverflow.com/a/43239901/8508004
ax[0][0].set_title("Distribution of shifts") # just tried it based on matplotlib and how the axis labels were set above
# or as shown in current documentation, and combining with matplotlib settings
ax = df.plot.hist(ec=(0.3,0.3,0.3,0.65),legend=False)
ax.set_xlabel("shift")
ax.set_ylabel("frequency")
ax.set_title("Distribution of shifts");
# Convert Django queryset to DataFrame
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
# Create a DataFrame from a Python dictionary
df = pd.DataFrame({ 'Id' : ["Charger","Ram","Pacer","Elantra","Camaro","Porsche 911"],
                    'Speed':[30,35,31,20,25,80]
})
# can change the order of columns by providing a columns list in the order wanted, such as `, columns = ['Speed', 'Id']` between the dictionary's closing curly bracket and the DataFrame method's closing parenthesis, like so:
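# For example (same toy data as above, with an explicit column order):
df = pd.DataFrame({'Id': ["Charger","Ram","Pacer","Elantra","Camaro","Porsche 911"],
                   'Speed': [30,35,31,20,25,80]},
                  columns=['Speed', 'Id']) # 'Speed' now comes before 'Id'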
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
# - or -
# (THIS NEXT ONE IS FROM THE ORIGINAL SNIPPETS REPO, BUT IT IS LIMITED TO A TWO-COLUMN
# DataFrame, SINCE `dict.items()` YIELDS KEY-VALUE PAIRS)
a_dictionary = {"April":"Grass", "May":"Flowers","June":"Corn"}
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
# Other dictionary-to-dataframe examples, from https://stackoverflow.com/questions/41192401/python-dictionary-to-pandas-dataframe-with-multiple-columns
# i.e., multiple columns
from collections import Counter
d= {'data' : Counter({ 'important' : 2,
                       'very' : 3}),
    'analytics' : Counter({ 'boring' : 5,
                            'sleep' : 3})
}
df = pd.DataFrame(d).stack().reset_index()
df.columns = ['word','category','count']
# -or-
df = pd.DataFrame.from_dict(d) # (the original line was truncated; presumably `from_dict`, which makes the dict keys the columns)
# read csv as dataframe
df = pd.read_csv('my_file.csv', usecols=['a', 'c'])
df = pd.read_csv('my_file.csv', usecols=[0, 2])
df = pd.read_csv('my_file.csv', nrows=3)
z_issues_and_workarounds.py
# These may simply be a result of my misunderstanding, stumbling through non-optimal / non-pythonic solutions, bad coding, or lack of research, but here are some issues I encountered.
# Workarounds are provided when / if I solved them.
# COULD NOT ITERATE OVER A LIST OF DATAFRAMES AND ADD A ROW TO EACH WITH `.append`
# For each dataframe I wanted to make a TOTAL combined entry for an element using components a and b.
# It seemed like this should be doable in a loop.
# To do one, I can do this:
'''
elem_meanlength = 1
elem_meaneff = 1
elem_sumTPM = 1
elem_sumNumReads = 1
total_df = total_df.append(
    {'Name':"Elem_total",'Length':elem_meanlength,'EffectiveLength':elem_meaneff,'TPM':elem_sumTPM,'NumReads':elem_sumNumReads},
    ignore_index=True) # based on http://pandas.pydata.org/pandas-docs/stable/merging.html#appending-rows-to-a-dataframe
#print(total_df) # ONLY FOR DEBUGGING
'''
# But `.append` seems to only update a copy when I tried to set it up for iterating; it doesn't alter the
# original. Find/replace worked in a loop (see BELOW) but used "inplace".
# Find/replace that worked in a loop:
#-----------------
#list_of_dataframes = [total_df, another_df, yet_another_df]
#for each_df in list_of_dataframes:
#    each_df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
#    #print(each_df) # FOR DEBUGGING ONLY
#-----------------
# COULDN'T COME UP WITH A SOLUTION IN A TIMELY MANNER AT FIRST, BUT LOOKED MORE.
# By searching `pandas append a row to dataframe not a copy` I finally found Jun's answer at
# https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360#19368360
# & it looked amenable to looping through several dataframes. Tested:
'''
list_of_dataframes = [total_df, another_df, yet_another_df]
print(total_df) # ONLY FOR DEBUGGING
elem_meanlength = 1
elem_meaneff = 1
elem_sumTPM = 1
elem_sumNumReads = 1
list_of_dataframes[0].loc[len(list_of_dataframes[0])]= ["Elem_total",elem_meanlength,elem_meaneff,elem_sumTPM,elem_sumNumReads] # based on Jun's answer at https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360#19368360
print(list_of_dataframes[0])
print(total_df) # ONLY FOR DEBUGGING
'''
# THE WORKAROUND FOR THAT
# That solution (plus the find/replace) implemented:
for indx, each_df in enumerate(list_of_dataframes):
    each_df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
    #print(each_df) # FOR DEBUGGING
    #print(each_df[each_df.Name.str.contains("ID")]) # FOR DEBUGGING, shows matches if "IDa", "IDab", etc.
    elem_meanlength = each_df[each_df.Name.str.contains("ID")].mean(0).Length
    elem_meaneff = each_df[each_df.Name.str.contains("ID")].mean(0).EffectiveLength
    elem_sumTPM = each_df[each_df.Name.str.contains("ID")].sum(0).TPM
    elem_sumNumReads = each_df[each_df.Name.str.contains("ID")].sum(0).NumReads
    list_of_dataframes[indx].loc[len(list_of_dataframes[indx])]= ["Elem_total",elem_meanlength,elem_meaneff,elem_sumTPM,elem_sumNumReads] # based on Jun's answer at https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360
# BUT DON'T USE THIS FOR A REAL, REAL LOT OF DATAFRAMES OR A LOT OF LARGE ONES. SUPER SLOW. See https://stackoverflow.com/a/17496530/8508004 for the recommended way (build a list of rows, then make the DataFrame once), though I don't know if it is amenable to iterating over a list of DataFrames.
# CANNOT USE `sample` as a column name if you want to be able to call that column using attribute notation,
# because `pandas.DataFrame.sample` is a method on the DataFrame.
# Either change the column name using `df.rename(columns={'old_name':'new_name'}, inplace=True)`
# -or use standard bracket notation like this (compare with the example of `.str.contains(pattern)` in the snippets file):
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'sample': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
print (df)
pattern = '|'.join(['one', 'two'])
df = df[df['sample'].str.contains(pattern)]
df['sample'].str.contains(pattern)
# if working with column names that contain spaces, use bracket notation to select and
# not attribute notation, unless you want to change the column names first (see `df.rename(columns={'old':'new'})`)
val = df[df.col3.str.contains(r'text\dmoretext')].mean(0)['source values']