# Useful Pandas Snippets
# -*- coding: utf-8 -*-
# import pandas and numpy package
import numpy as np
import pandas as pd
# List unique values in a DataFrame column
df['Column Name'].unique()
# To extract a specific column (subset the dataframe), you can use [ ] (brackets) or attribute notation.
df.height
df['height']
# are the same thing!!! (from http://www.stephaniehicks.com/learnPython/pages/pandas.html
# -or-
# http://www.datacarpentry.org/python-ecology-lesson/02-index-slice-subset/)
# Quick overview of DataFrame
df.describe()
# Display data types in DataFrame
df.dtypes
# Change order of columns in DataFrame
df = df[['C', 'B', 'A']] # It will have defaulted to 'A B C' order (I think), see https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
# When adding a new column -- "By default, columns get inserted at the end. The
# insert function is available to insert at a particular location in the columns:"
df.insert(2, 'mean', df.mean(1)) #insert at third column a 'mean' column
# Convert Series datatype to numeric (will error if column has non-numeric values)
pd.to_numeric(df['Column Name'])
# Convert Series datatype to numeric, changing non-numeric values to NaN
pd.to_numeric(df['Column Name'], errors='coerce')
# Use that conversion in a dataframe
df['Column Name'] = df['Column Name'].apply(pd.to_numeric, errors='coerce')
# Grab DataFrame rows where column has certain values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Grab DataFrame rows where column doesn't have certain values
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]
#(`~` inverts the boolean values; it is similar to using a `not` in a conditional expression).
# Grab DataFrame rows where column matches at least part of a string in a list
df = df[df.column.str.contains(pattern)]
# Example OF USE
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
pattern = '|'.join(['one', 'two'])
df = df[df.B.str.contains(pattern)]
# `str.startswith` is related to looking for text in a string in a column (see below)
# Select rows containing certain values from pandas dataframe IN ANY COLUMN
df[df.values == 'X'].dropna(how='all') # this one makes multiple copies of the rows show up if multiple matches occur in the row
df[df.isin(['X'])].dropna(how='all') # BEST; this one works better if multiple occurrences can be in the same row
# based on https://stackoverflow.com/questions/38185688/select-rows-containing-certain-values-from-pandas-dataframe
# Remove / delete rows where a condition or conditions are met
df = df.drop(df[df.score < 50].index)
# can be done in place
df.drop(df[df.score < 50].index, inplace=True)
# use booleans to enforce multiple conditions
df = df.drop(df[(df.score < 50) & (df.score > 20)].index)
# remove all but one column, dropping the rest
sub_df = df[['column_a']]
# similarly, to limit to just a few columns (subset), add multiple columns in the bracketed list
sub_df = df[['column_a','column_b']]
# see more about dropping a column below under 'Delete column from DataFrame'
# Select from DataFrame using criteria from multiple columns (multiple condition expression), i.e., filter / subset on multiple conditions
# (use `|` instead of `&` to do an OR)
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
# other examples
df[(df['nationality'] == "USA") & (df['age'] > 50)] # https://chrisalbon.com/python/data_wrangling/pandas_selecting_rows_on_conditions/ ; note `df['age'] > 50` needs its own parentheses because `&` binds tighter than `>`
df[df['first_name'].notnull() & (df['nationality'] == "USA")] # `df['first_name'].notnull()` is a method call that already returns booleans, while the other condition is a comparison; https://chrisalbon.com/python/data_wrangling/pandas_selecting_rows_on_conditions/
hits_df = blast_df[(blast_df['sstart'] <= midpt)&(midpt <= blast_df['send'])] # detecting if in range / interval example
# SEMI-RELATED: if only a single column is involved and you want all rows where the
# text contains something, like rows with `texta`, `textb`, etc. when searching for `text`,
# you can use `column.str.contains(pattern)`
df = df[df.column.str.contains(pattern)] # See above
#Also SEMI-RELATED: if you need multiple string matches in a single column you can use
# `Grab DataFrame rows where column has certain values` approach (SEE ABOVE)
# or combine to expand on the `Select from DataFrame using criteria from multiple columns`
# with `newdf = df[(df['column_one']>2004) & (df['column_two']==9)]` approach, like:
valuelist1 = ['value1', 'value2', 'value3']
valuelist2 = ['value4', 'value5', 'value6']
newdf = df[(df.column.isin(valuelist1)) & (df.column.isin(valuelist2))]
# using startswith in selection
df = df[df['gene'].str.startswith("snR17")]
# combining with making case not matter by making lower case (or upper), requires complexity that didn't appear obvious to me
df = df[df['gene'].str.lower().str.startswith("snr17")] # solution from https://stackoverflow.com/a/22909357/8508004; they also had a regex solution offered that failed
# (the original answer was a fix for use with `.contains`)
# Also SEMI-RELATED: if using a conditional to have extracted rows go to a
# new dataframe and you want the first row (a.k.a. top row) (or you know there should only be one) and you want a value in that row:
new_df = df[df.gene == "test4"] # conditional narrows to just those with "test4"
new_df.iloc[0].FoldChange # iloc[0] specifies first row and then `.FoldChange` or ["FoldChange"] to select column
# see more on `.iloc` at https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
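# A minimal sketch (toy data assumed) contrasting positional `.iloc` with label-based `.loc`:
toy_df = pd.DataFrame({'FoldChange': [2.5, 0.3]}, index=['gene_a', 'gene_b'])
toy_df.iloc[0].FoldChange # first row selected by integer position --> 2.5
toy_df.loc['gene_a', 'FoldChange'] # same cell, selected by labels --> 2.5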
# I tried using `.iloc` to update a copy of a dataframe but it didn't work, but this approach did, based on
# http://pandas.pydata.org/pandas-docs/stable/indexing.html#evaluation-order-matters:
# copy the dataframe to avoid `SettingWithCopyWarning`, see
# https://www.dataquest.io/blog/settingwithcopywarning/
updated_sites_df = sites_df.copy()
for indx,sites_row in sites_df.iterrows():
    if sites_row.olap_state == 'closest':
        #process row where 'overlapping' gene/feature not identified
        start = sites_row.start
        end = sites_row.end
        id = sites_row.sys_gene_id
        closest,pos_val = identify_closest_gene_or_feature(
            id,int(start),int(end),genes_df,avg_gene_and_feature_size)
        #updated_sites_df.iloc[indx].overlapping_or_closest_gene = closest # even with copying, these cause `SettingWithCopyWarning` and, even more problematic, don't make the needed updates.
        #updated_sites_df.iloc[indx].position = pos_val # same issues as the line above
        # Approach based on http://pandas.pydata.org/pandas-docs/stable/indexing.html#evaluation-order-matters
        # worked to update values in a dataframe, but still showed warning:
        #updated_sites_df['overlapping_or_closest_gene'][indx] = closest #still gives warning, but works
        #updated_sites_df['position'][indx] = pos_val #still gives warning, but works
        # Works with no warning, as prescribed at https://www.dataquest.io/blog/settingwithcopywarning/
        # at end of 'Chained assignment' section
        updated_sites_df.loc[indx,'overlapping_or_closest_gene'] = closest
        updated_sites_df.loc[indx,'position'] = pos_val
# Reorder rows based on values in a column when you know what you want
categorized_residue_positions_df = pd.DataFrame(list(categorization.items()),columns = ['category','residue_positions'])
# That works but I want categories with most conserved as top line and
# `not_conserved` on bottom.
# Because I think the dictionary will have these in arbitrary order, I
# cannot simply base order on what I saw in development. More robust would
# be to extract what the `new_index` order should be:
#print(categorized_residue_positions_df) # FOR DEBUGGING ONLY
default_indx = {}
for i, row in categorized_residue_positions_df.iterrows():
    default_indx[row.category] = i
new_index = ([default_indx['identical'],
              default_indx['strongly_similar'],
              default_indx['weakly_similar'],
              default_indx['not_conserved'],])
categorized_residue_positions_df = categorized_residue_positions_df.reindex(new_index) # based on
# https://stackoverflow.com/a/30010004/8508004
categorized_residue_positions_df = categorized_residue_positions_df.reset_index(drop=True)
#print(categorized_residue_positions_df) # FOR DEBUGGING ONLY
# Delete column from DataFrame
del df['column']
#-or-
df = df.drop('column',1)
# see https://stackoverflow.com/a/18145399/8508004
#-or-
df = df.drop('reports', axis=1)
# see https://chrisalbon.com/python/data_wrangling/pandas_dropping_column_and_rows/, but note
# that, unlike shown there, it now seems you need to assign the result (as in the example above) to see the change
# Add a column to a DataFrame with same contents to each row
df["new_column_label"] = "String_entry"
df["new_column_label2"] = 0
# also works for setting value of all rows in an existing column to same thing, see https://stackoverflow.com/a/44723277/8508004
# Note if you do this after you made a new dataframe from a subset of another, you may see a `SettingWithCopyWarning:` warning.
# because Pandas is just being cautious, see https://stackoverflow.com/questions/42105859/pandas-map-to-a-new-column-settingwithcopywarning
#; could fix with approach [here](https://stackoverflow.com/a/45885310/8508004) if important, like:
# df = df.assign(signal= 'yes')
# See `.append` below for ADDING ROWS.
# Related: you can add a column with different contents to each
# row WITHOUT USING APPLY if you use itertuples or iterrows to
# build a list with the same number of items as the length of
# the dataframe and then add the new column with
df["ids"] = list_of_ids
# see https://www.dataquest.io/blog/settingwithcopywarning/ for better understanding of `SettingWithCopyWarning:` warnings.
# Rename a DataFrame column / rename column
df.rename(columns={'old_name':'new_name'}, inplace=True)
# see https://stackoverflow.com/questions/33727667/pandas-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy-of-a-slice
# because with newer Pandas and Python 3 I am seeing a warning when using `inplace`
# Better(?):
df2 = df.rename(columns={'old':'new'})
#-or-, it even seems to work as
df = df.rename(columns={'old':'new'})
# Rename several DataFrame columns
df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
})
# or use `,inplace=True` without `df =`
# Lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)
# Even more fancy DataFrame column re-naming
# for example, keep only the part of each column name after the last '.'
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
# Loop through rows in a DataFrame
# (if you must)
for index, row in df.iterrows():
    print(index, row['some column'])
# Much faster way to loop through DataFrame rows
# if you can work with tuples (iterate rows more efficiently)
# (h/t hughamacmullaniv)
for row in df.itertuples():
    print(row)
# see more about itertuples below
# Next few examples show how to work with text data in Pandas.
# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
# Slice values in a DataFrame column (aka Series)
df.column.str[0:2]
# Lower-case everything in a DataFrame column
df.column_name = df.column_name.str.lower()
# Get length of data in a DataFrame column
df.column_name.str.len()
# Make a column of a dataframe a Python list (df column --> to list)
lizt = df["col1"].tolist()
# Sort dataframe by multiple columns
df = df.sort_values(['col1','col2','col3'],ascending=[1,1,0])
# see `df = df.reset_index(drop=True)` in relation to this
# Sort on one column
df.sort_values('Col_name1', ascending=False, inplace=True)
# If the column you are trying to sort on is a multi-level /hierarchical column, use
#the full tuple with all levels to reference it, like
# `df.sort_values(('tmn1-5001','p-value'), ascending=False, inplace=True)`
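# A minimal sketch of that, with assumed toy data mirroring the tuple above:
mcols = pd.MultiIndex.from_tuples([('tmn1-5001','p-value'), ('tmn1-5001','score')])
mdf = pd.DataFrame([[0.03, 7], [0.2, 3], [0.001, 9]], columns=mcols)
mdf.sort_values(('tmn1-5001','p-value'), ascending=False, inplace=True)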
# Get top n for each group of columns in a sorted dataframe
# (make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# more on sorting at the useful Jupyter notebook (EXCEPT the method is now `sort_values`),
# http://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb#Sorting-and-Reindexing-DataFrames
# Grab DataFrame rows where specific column is null/notnull
newdf = df[df['column'].isnull()]
# Select from DataFrame using multiple keys of a hierarchical index
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
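# A minimal sketch of `.xs`, with made-up hierarchical index data:
idx = pd.MultiIndex.from_tuples([('a','x'), ('a','y'), ('b','x')],
                                names=('level 1','level 2'))
hdf = pd.DataFrame({'val': [1, 2, 3]}, index=idx)
hdf.xs(('a','x'), level=('level 1','level 2')) # --> the row where level 1 == 'a' and level 2 == 'x'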
# Change all NaNs to None (useful before
# loading to a db)
df = df.where((pd.notnull(df)), None)
# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
# from strings and changing any empty values to None
# (not especially recommended but including here b/c I had to do this in real life one time)
df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)
# Get quick count of rows in a DataFrame
len(df.index)
len(df) # <---I find even quicker.
# change a column into the index of the dataframe:
df = df.set_index('column_name') # see http://stackoverflow.com/questions/10457584/redefining-the-index-in-a-pandas-dataframe-object
# renumber index, useful for after removing items from one dataframe to make another
df = df.reset_index(drop=True)
# use `drop=True` to not keep the old index, see https://stackoverflow.com/a/20491748/8508004
# Pivot data (with flexibility about what
# becomes a column and what stays a row).
# Syntax works on Pandas >= .14
pd.pivot_table(
    df, values='cell_value',
    index=['col1', 'col2', 'col3'], #these stay as columns; will fail silently if any of these cols have null values
    columns=['col4']) #data values in this column become their own column
# example of re-orienting dataframe, based on https://stackoverflow.com/questions/28337117/how-to-pivot-a-dataframe-in-pandas
reoriented_df = pd.pivot_table(count_of_types_df, values = 'count', columns = 'qseqid').reset_index()
reoriented_df = reoriented_df[["G1","V1","M1","M7'","M8","M9'","M11''","M15","M14"]]
reoriented_df["TOTAL"] = reoriented_df.sum(1)
# that was AFTER below had generated counts for BLAST results
count_of_types_df = blast_df['qseqid'].value_counts().reset_index()
count_of_types_df.columns = ['qseqid', 'count']
# Change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric values throughout a DataFrame:
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
# Do find/replace on a string throughout a DataFrame
df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
# to restrict changes to a specific column, you can do
df.the_col = df.the_col.replace({'OLD_TEXT': 'NEW_TEXT'})
# more replace example
df.col_x.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False) # `regex=False` since brackets are regex metacharacters; note the second call needs `.str.` too, so it is also a string replace
df.col_x.str.replace(r'[\[\]]', '')
# Do find/replace on string restricted to column and use regex
# 'ZEB1/ZEB1_cerevisiae_extracted.clustal' ---> 'ZEB1'
df['col_name_here'].replace({r"(ZEB\d)/.*": "\\1"}, regex=True, inplace=True) # see https://stackoverflow.com/a/41473130/8508004
#-or-
#df['col_name_here'].replace({r"(ZEB\d)/.*": r"\1"}, regex=True, inplace=True) # see https://stackoverflow.com/a/41473130/8508004
# Set DataFrame column values based on other column values (h/t: @mlevkov),.i.e., change values
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
df.loc[(df['column1'] == some_value), ['column_to_change']] = new_value
df1.loc[df1['stream'] == 2, 'feat'] = 10
df1.loc[df1['stream'] == 2, ['feat','another_feat']] = 'aaaa'
#conditional value setting of a new column
df.loc[df.col > 0, 'new_col'] = 'positif'
df.loc[~(df.col > 0), 'new_col'] = 'negatif' # parentheses needed since `~` binds tighter than `>`
# -or- (note `~(df.col > 0)` also matches rows where col is 0 or NaN, unlike the line below)
df.loc[df.col < 0, 'new_col'] = 'negatif'
# Clean up missing values in multiple DataFrame columns
df = df.fillna({
'col1': 'missing',
'col2': '99.999',
'col3': '999',
'col4': 'missing',
'col5': 'missing',
'col6': '99'
})
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example)
# (h/t @makmanalp for improving this one!)
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
# Concatenate / combine multiple dataframes, without regards to index, for each grouping
df = pd.concat([df1,df2], ignore_index=True)
# I use this 'concat/ignore_index=True' approach often when 'stacking' two dataframes that have the same columns
# Similarly, `.concat()` is also great for combining into one when the dataframes are in a list and all have the same columns.
df = pd.concat(list_of_dataframes) # An example if you don't care how the indexes are left in the resulting dataframe
# Merge / combine / join / concatenate multiple dataframes
new_df = pd.merge(df1,df2,on='gene')
# For two or more (especially FOR MORE)...
# For cleaner looks you can chain them, https://stackoverflow.com/questions/23668427/pandas-joining-multiple-dataframes-on-columns#comment36377024_23671390
new_df = df1.merge(df2,on='gene').merge(df3,on='gene')
# limit the merging / combining / joining to certain columns of the contributing dataframes
new_df = pd.merge(df1[['col1','col4']],df2[['col1','col4']],on='gene')
#-OR-
new_df = df1[['col1','col4']].merge(df2[['col1','col4']],on='gene')
# Combining merge (with extracting) and renaming columns for better tracking source in new dataframe
df_m = pd.merge(df1[['gene','column_name']].rename(columns = {'column_name' : 'New_name'}), mitoWTRep3_df[['gene','column_name']].rename(columns = {'column_name' : 'New_name2'}), on = 'gene')
# Note this is an example of a way to extract just two of the columns from a dataframe that had more columns, in the course of making the new dataframe.
# Or do the renaming and combining this way:
df = pd.concat([s3, s4, s5], axis=1, keys=['red','blue','yellow'])
# "A fairly common use of the keys argument is to override the column names when creating a new DataFrame based on existing
# Series. Notice how the default behaviour consists on letting the resulting DataFrame inherit the parent Series‘ name, when these existed."
# -or-
pd.concat(dict(df1 = df1, df2 = df2),axis=1) # from https://stackoverflow.com/a/15990537
# from http://pandas-docs.github.io/pandas-docs-travis/merging.html#more-concatenating-with-group-keys
# example there places the two dataframes side-by-side (example there adds multi-level columns to distinguish),
# instead of stacking(see above for stacking)
# -or-
pd.concat((df1, df2),axis=1) # to not make multi-level column names, but place side-by-side, otherwise similar to
# http://pandas-docs.github.io/pandas-docs-travis/merging.html#more-concatenating-with-group-keys
# Set up / Start / initialize a dataframe with certain columns for subsequently adding rows
df = pd.DataFrame(columns=['col1','col2','col3','col4'])
# and add rows to it (one way to do it; see issues about `.append` not being applicable to iterating over a list of dataframes, see `z_issues_and_workarounds.py`)
df = df.append(
    {'col1':'string1','col2':value,'col3':value2,'col4':value3},
    ignore_index=True) # based on http://pandas.pydata.org/pandas-docs/stable/merging.html#appending-rows-to-a-dataframe
# That above is not the recommended way to create a dataframe, i.e., building it a row at a time with append, see https://stackoverflow.com/a/25376997/8508004 (VERY SLOW!),
# but I found it worked when iterating over a list of dataframes, see `z_issues_and_workarounds.py`
df.loc[len(df)]=['Text_for_Col1','Text_for_Col2','Text_for_Col3', a_value]
# Recommended way is at https://stackoverflow.com/a/17496530/8508004 (sketched below), but I don't know how amenable that
# is to where you might iterate over several DataFrames
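# A minimal sketch of that recommended approach (toy row values assumed): accumulate
# plain Python rows in a list and build the dataframe once at the end, instead of
# growing the dataframe row by row.
rows = []
for i in range(3):
    rows.append({'col1': 'string{}'.format(i), 'col2': i, 'col3': i*2, 'col4': i**2})
df = pd.DataFrame(rows, columns=['col1','col2','col3','col4'])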
# Doing calculations with DataFrame columns that have missing values
# In example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
# set column value based on another column value
df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
# apply a function that uses value in a single column to each row of a dataframe, placing result in a new column
df['new col'] = df['col1'].apply(<user_defined_function>)
# I think if you used the same column it would replace it. # based on
# http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/
# "This is useful when cleaning up data - converting formats, altering values etc."
#Similar to last example, but calculating with more than one column
import statistics
df['std dev'] = df[['col1_name','col2_name']].apply(statistics.pstdev, axis=1)
def midpoint(items):
    '''
    takes an iterable of items and returns the midpoint (integer) of the first
    and second values
    '''
    return int((int(items[0])+int(items[1]))/2)
df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)
# apply a function to each row of a dataframe
df = df.apply(<user_defined_function>, axis=1)
# `axis=0` for across columns
# some functions, like sum, mean, max, min, etc. built-in to Pandas and allow shorthand call to axis
df["mean"] = df.mean(1)
# adds a column of the mean across the row to each row in a dataframe
# `axis=0` for down columns
# another example
df_c['mean'] = df[['col1_name','col2_name']].mean(1)
# an example for down the columns
avg_length = df.mean(0).length # see next too
avg_length = df[['length']].mean(0).length # same thing as above but subset first
# Pandas has mode but it is awkward presently about what it requires and returns. In this
# example 'length' is a column name. (based on https://stackoverflow.com/a/52532322/8508004 );
# it wouldn't take axis as input used in this way, and without the `[0]` it returned a series.
the_mode_of_length_column = df.length.mode()[0]
#Use `.apply()` to return multiple columns. Example also illustrates passing additional info
# during use of `.apply()` using `args`, which passes additional positional arguments on to the
# applied function. Returning multiple columns based on https://stackoverflow.com/a/43770075/8508004 .
def example(row_items, info_dict):
    '''
    Toy example.
    Takes a dataframe row that contains a midpoint value and 'num' value within
    it and a dict where keys are midpoints and
    values are a list (for this toy example) of extra info about each midpoint
    (actually correspond to info from rows of a different dataframe).
    Returns a row with multiple new columns added.
    based on https://stackoverflow.com/a/43770075/8508004
    '''
    smallest_difference = float('Inf')
    closest_midpt = None
    for midpt in info_dict:
        if abs(row_items.midpoint-midpt) < smallest_difference:
            smallest_difference = abs(row_items.midpoint-midpt)
            closest_midpt = midpt # track the closest midpoint itself, not just the difference
    row_items['other_item_added_to_sq'] = row_items['num']**2 + info_dict[closest_midpt][0]
    row_items['other_item_added_to_cubed'] = row_items['num']**3 + info_dict[closest_midpt][1]
    row_items['other_item_added_to_4power'] = row_items['num']**4 + info_dict[closest_midpt][2]
    return row_items
df = df.apply(example, args=(other_info_dict,), axis=1)
# Split delimited values in a DataFrame column into two new columns
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
# Split Pandas DataFrame into two random subsets: (from https://twitter.com/python_tip/status/951829597523456000)
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)
# Collapse hierarchical (multilevel/ multi-level) column indexes
df.columns = df.columns.get_level_values(0)
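# A minimal sketch of that, with assumed toy multi-level columns:
mcols = pd.MultiIndex.from_tuples([('sample1','TPM'), ('sample2','TPM')])
mdf = pd.DataFrame([[1.0, 2.0]], columns=mcols)
mdf.columns = mdf.columns.get_level_values(0) # columns are now just ['sample1', 'sample2']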
# Make a dataframe that is count of the frequency of items (moving towards a distribution accounting)
df = df['amount rows shifted'].value_counts().reset_index() # the values in the 'amount rows shifted' column in
# this case were integers, but they could be strings, and maybe even floats if unique; see https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html
df.columns = ['amount rows shifted', 'count'] # FIX OF COLUMN NAMES AFTER, based on https://stackoverflow.com/a/35893235/8508004
# END ABOVE IF YOU JUST NEED A DATAFRAME.
# Note though, if you just want a plot of the amount rows shifted, you can use the below, assuming there is only one column and it corresponds to amount rows shifted:
ax = df.hist();
ax[0][0].set_xlabel("shift") # from https://stackoverflow.com/a/43239901/8508004
ax[0][0].set_ylabel("count") # from https://stackoverflow.com/a/43239901/8508004
ax[0][0].set_title("Distribution of shifts") #just tried it based on matplotlib and how set labels of axes above
# or as shown in current documentation, and combining with matplotlib settings
ax = df.plot.hist(ec=(0.3,0.3,0.3,0.65),legend=False)
ax.set_xlabel("shift")
ax.set_ylabel("frequency")
ax.set_title("Distribution of shifts");
# Convert Django queryset to DataFrame
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
# Create a DataFrame from a Python dictionary
df = pd.DataFrame({ 'Id' : ["Charger","Ram","Pacer","Elantra","Camaro","Porsche 911"],
                    'Speed':[30,35,31,20,25,80]
                  })
# can change order of columns by providing a columns list in order, such as `, columns = ['Speed', 'Id']` between the dictionary closing curly bracket and the DataFrame method closing parenthesis, as shown below
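# For example (same toy data, with an explicit column order; 'Speed' now comes first):
df = pd.DataFrame({ 'Id' : ["Charger","Ram","Pacer","Elantra","Camaro","Porsche 911"],
                    'Speed':[30,35,31,20,25,80]
                  }, columns = ['Speed', 'Id'])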
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
# -or-
# (THIS ONE NEXT IS FROM THE ORIGINAL SNIPPETS REPO BUT SEEMS LIMITED TO A TWO-COLUMN DataFrame!?!?)
a_dictionary = {"April":"Grass", "May":"Flowers","June":"Corn"}
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2']) # limited to two columns here because `list(a_dictionary.items())` yields (key, value) pairs
# Other Dictionary to Dataframe Examples from https://stackoverflow.com/questions/41192401/python-dictionary-to-pandas-dataframe-with-multiple-columns
# i.e. multiple columns
from collections import Counter
d = {'data' : Counter({'important' : 2,
                       'very' : 3}),
     'analytics' : Counter({'boring' : 5,
                            'sleep' : 3})
     }
df = pd.DataFrame(d).stack().reset_index()
df.columns = ['word','category','count']
# -or-
df = pd.DataFrame.from_dict(d) # `from_dict` assumed as the intended method; works for the Counter dict `d` above
# read csv as dataframe
df = pd.read_csv('my_file.csv', usecols=['a', 'c'])
df = pd.read_csv('my_file.csv', usecols=[0, 2])
df = pd.read_csv('my_file.csv', nrows=3)
# These may simply be a result of my misunderstanding, stumbling though non-optimal / non-pythonic solutions, bad coding, or lack of research, but here are some issues I encountered.
# Workarounds are provided when / if I solved them.
# COULD NOT ITERATE OVER A LIST OF DATAFRAMES AND ADD A ROW TO EACH WITH `.append`
# For each dataframe I wanted to make a TOTAL combined entry for an element using components a and b
# It seemed like this should be doable in a loop.
# To do one, I can do this:
'''
elem_meanlength = 1
elem_meaneff = 1
elem_sumTPM = 1
elem_sumNumReads = 1
total_df = total_df.append(
    {'Name':"Elem_total",'Length':elem_meanlength,'EffectiveLength':elem_meaneff,'TPM':elem_sumTPM,'NumReads':elem_sumNumReads},
    ignore_index=True) # based on http://pandas.pydata.org/pandas-docs/stable/merging.html#appending-rows-to-a-dataframe
#print(total_df) # ONLY FOR DEBUGGING
'''
# But it seems to only update a copy when you try to set it up for iterating. It doesn't alter the
# original. Find/replace worked in a loop (see BELOW) but used "inplace".
# Find/replace that worked in loop:
#-----------------
#list_of_dataframes = [total_df, another_df, yet_another_df]
#for each_df in list_of_dataframes:
#    each_df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
#    #print(each_df) # FOR DEBUGGING ONLY
#-----------------
# COULDN'T COME UP WITH SOLUTION IN A TIMELY MANNER AT FIRST BUT LOOKED MORE.
# By searching `pandas append a row to dataframe not a copy` finally found Jun's answer at
# https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360#19368360
# & it looked amenable to looping through several dataframes. Tested:
'''
list_of_dataframes = [total_df, another_df, yet_another_df]
print(total_df) # ONLY FOR DEBUGGING
elem_meanlength = 1
elem_meaneff = 1
elem_sumTPM = 1
elem_sumNumReads = 1
list_of_dataframes[0].loc[len(list_of_dataframes[0])]= ["Elem_total",elem_meanlength,elem_meaneff,elem_sumTPM,elem_sumNumReads]# based on Jun's answer at https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360#19368360
print(list_of_dataframes[0])
print(total_df) # ONLY FOR DEBUGGING
'''
# THE WORKAROUND FOR THAT
# That solution (plus the find/replace) implemented
for indx, each_df in enumerate(list_of_dataframes):
    each_df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
    #print(each_df) # FOR DEBUGGING
    #print(each_df[each_df.Name.str.contains("ID")]) # FOR DEBUGGING, shows matches if "IDa" "IDab", etc.
    elem_meanlength = each_df[each_df.Name.str.contains("ID")].mean(0).Length
    elem_meaneff = each_df[each_df.Name.str.contains("ID")].mean(0).EffectiveLength
    elem_sumTPM = each_df[each_df.Name.str.contains("ID")].sum(0).TPM
    elem_sumNumReads = each_df[each_df.Name.str.contains("ID")].sum(0).NumReads
    list_of_dataframes[indx].loc[len(list_of_dataframes[indx])] = ["Elem_total",elem_meanlength,elem_meaneff,elem_sumTPM,elem_sumNumReads] # based on Jun's answer at https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360
# BUT DON'T USE THIS FOR A REAL, REAL LOT OF DATAFRAMES OR A LOT OF LARGE ONES. SUPER SLOW. See https://stackoverflow.com/a/17496530/8508004 for the recommended way (sketched earlier in this file); I don't know if it is amenable to iterating over a list of DataFrames
# CANNOT USE `sample` as a column name if want to be able to call that column using attribute notation
# because `pandas.DataFrame.sample` is a function on the DataFrame.
# Either change the column names using `df.rename(columns={'old_name':'new_name'}, inplace=True)`
# -or use standard notation like this (compare with the example of `.str.contains(pattern)` earlier in this snippets file):
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'sample': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
print(df)
pattern = '|'.join(['one', 'two'])
df = df[df['sample'].str.contains(pattern)]
df['sample'].str.contains(pattern)
# if working with column names that contain spaces, use bracket notation to select and
# not attribute notation, unless you want to change column names first (see `df.rename(columns={'old':'new'})`)
val = df[df.col3.str.contains(r'text\dmoretext')].mean(0)['source values']