Useful Pandas Snippets
Forked from fomightez/useful_pandas_snippets.py
# -*- coding: utf-8 -*-
# import the pandas and numpy packages
import numpy as np
import pandas as pd
# List unique values in a DataFrame column
df['Column Name'].unique()
# To extract a specific column (subset the dataframe), you can use [ ] (brackets) or attribute notation.
df.height
df['height']
# are the same thing!!! (from http://www.stephaniehicks.com/learnPython/pages/pandas.html
# -or-
# http://www.datacarpentry.org/python-ecology-lesson/02-index-slice-subset/)
# Quick overview of DataFrame
df.describe()
# Display data types in DataFrame
df.dtypes
# Change order of columns in DataFrame
df = df[['C', 'B', 'A']] # It will have defaulted to 'A B C' order (I think), see https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
# When adding a new column -- "By default, columns get inserted at the end. The
# insert function is available to insert at a particular location in the columns:"
df.insert(2, 'mean', df.mean(1)) # insert a 'mean' column as the third column
# Convert Series datatype to numeric (will error if column has non-numeric values)
pd.to_numeric(df['Column Name'])
# Convert Series datatype to numeric, changing non-numeric values to NaN
pd.to_numeric(df['Column Name'], errors='coerce')
# Use that conversion in a dataframe
df['Column Name'] = df['Column Name'].apply(pd.to_numeric, errors='coerce')
# Grab DataFrame rows where column has certain values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Grab DataFrame rows where column doesn't have certain values
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]
# (`~` inverts the boolean values; it is similar to using a `not` in a conditional expression.)
# Grab DataFrame rows where column matches at least part of a string in a list
df = df[df.column.str.contains(pattern)]
# Example of use
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
pattern = '|'.join(['one', 'two'])
df = df[df.B.str.contains(pattern)]
# `str.startswith` is related to looking for text in a string in a column (see below)
# Select rows containing certain values from pandas dataframe IN ANY COLUMN
df[df.values == 'X'].dropna(how='all') # this one makes multiple copies of a row show up if the value occurs multiple times in that row
df[df.isin(['X'])].dropna(how='all') # BEST; this one works better if multiple occurrences can be in the same row
# based on https://stackoverflow.com/questions/38185688/select-rows-containing-certain-values-from-pandas-dataframe
# Remove / delete rows where a condition or conditions are met
df = df.drop(df[df.score < 50].index)
# can be done in place
df.drop(df[df.score < 50].index, inplace=True)
# use booleans to enforce multiple conditions
df = df.drop(df[(df.score < 50) & (df.score > 20)].index)
# remove all but one column, dropping the rest
sub_df = df[['column_a']]
# similarly, to limit to just a few columns (subset), add multiple columns in the bracketed list
sub_df = df[['column_a','column_b']]
# see more about dropping a column below under 'Delete column from DataFrame'
# Select from DataFrame using criteria from multiple columns (multiple condition expression), i.e., filter / subset on multiple conditions
# (use `|` instead of `&` to do an OR)
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
# other examples
df[(df['nationality'] == "USA") & (df['age'] > 50)] # note each condition needs its own parentheses because `&` binds tighter than `>`; https://chrisalbon.com/python/data_wrangling/pandas_selecting_rows_on_conditions/
df[df['first_name'].notnull() & (df['nationality'] == "USA")] # df['first_name'].notnull() is already Boolean, so it needs no comparison; https://chrisalbon.com/python/data_wrangling/pandas_selecting_rows_on_conditions/
hits_df = blast_df[(blast_df['sstart'] <= midpt) & (midpt <= blast_df['send'])] # detecting if in a range / interval, for example
# SEMI-RELATED: if a single column is involved and the text entries share a pattern,
# like you want all rows that contain `text`, e.g., rows with `texta`, `textb`, etc.,
# you can use `column.str.contains(pattern)`
df = df[df.column.str.contains(pattern)] # See above
# Also SEMI-RELATED: if you need multiple string matches in a single column you can use
# the `Grab DataFrame rows where column has certain values` approach (SEE ABOVE)
# or combine it to expand on the `Select from DataFrame using criteria from multiple columns`
# approach with `newdf = df[(df['column_one']>2004) & (df['column_two']==9)]`, like:
valuelist1 = ['value1', 'value2', 'value3']
valuelist2 = ['value4', 'value5', 'value6']
newdf = df[(df.column.isin(valuelist1)) & (df.column.isin(valuelist2))]
# using startswith in selection
df = df[df['gene'].str.startswith("snR17")]
# to combine that with making case not matter, lower-case (or upper-case) the column first;
# the needed chaining wasn't obvious to me
df = df[df['gene'].str.lower().str.startswith("snr17")] # solution from https://stackoverflow.com/a/22909357/8508004; a regex solution offered there failed
# The original answer was a fix for using this with `.contains`
# Also SEMI-RELATED: if you are using a conditional to extract rows to a new dataframe
# and you want the first row (a.k.a. the top row) (or you know there should only be one) and you want a value in that row:
new_df = df[df.gene == "test4"] # conditional narrows to just those with "test4"
new_df.iloc[0].FoldChange # iloc[0] specifies the first row and then `.FoldChange` or ["FoldChange"] selects the column
# see more on `.iloc` at https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
# I tried using `.iloc` to update a copy of a dataframe but it didn't work; this approach did, based on
# http://pandas.pydata.org/pandas-docs/stable/indexing.html#evaluation-order-matters:
# copy the dataframe to avoid `SettingWithCopyWarning`, see
# https://www.dataquest.io/blog/settingwithcopywarning/
updated_sites_df = sites_df.copy()
for indx,sites_row in sites_df.iterrows():
    if sites_row.olap_state == 'closest':
        # process row where 'overlapping' gene/feature not identified
        start = sites_row.start
        end = sites_row.end
        id = sites_row.sys_gene_id
        closest,pos_val = identify_closest_gene_or_feature(
            id,int(start),int(end),genes_df,avg_gene_and_feature_size)
        #updated_sites_df.iloc[indx].overlapping_or_closest_gene = closest # even with copying, these cause `SettingWithCopyWarning` and, even more problematic, don't make the needed updates.
        #updated_sites_df.iloc[indx].position = pos_val # even with copying, these cause `SettingWithCopyWarning` and, even more problematic, don't make the needed updates.
        # Approach based on http://pandas.pydata.org/pandas-docs/stable/indexing.html#evaluation-order-matters
        # worked to update values in a dataframe, but still showed the warning:
        #updated_sites_df['overlapping_or_closest_gene'][indx] = closest # still gives warning, but works
        #updated_sites_df['position'][indx] = pos_val # still gives warning, but works
        # Works with no warning, as prescribed at https://www.dataquest.io/blog/settingwithcopywarning/
        # at the end of the 'Chained assignment' section
        updated_sites_df.loc[indx,'overlapping_or_closest_gene'] = closest
        updated_sites_df.loc[indx,'position'] = pos_val
# Reorder rows based on values in a column when you know the order you want
df = pd.DataFrame(list(categorization.items()),columns = ['category','residue_positions'])
# That works, but I want the categories with the most conserved as the top line and
# `not_conserved` on the bottom.
# Because the dictionary will have these in arbitrary order, I
# cannot simply base the order on what I saw in development. More robust is
# to extract what the `new_index` order should be:
#print(df) # FOR DEBUGGING ONLY
default_indx = {}
for i, row in df.iterrows():
    default_indx[row.category] = i
new_index = ([default_indx['identical'],
              default_indx['strongly_similar'],
              default_indx['weakly_similar'],
              default_indx['not_conserved'],])
df = df.reindex(new_index) # based on
# https://stackoverflow.com/a/30010004/8508004
df = df.reset_index(drop=True)
#print(df) # FOR DEBUGGING ONLY
# Delete column from DataFrame
del df['column']
#-or-
df = df.drop('column', axis=1)
# see https://stackoverflow.com/a/18145399/8508004
#-or-
df = df.drop('reports', axis=1)
# see https://chrisalbon.com/python/data_wrangling/pandas_dropping_column_and_rows/, but note
# that, unlike shown there, it now seems you need to assign the result (as in the examples above) to see the change
# Add a column to a DataFrame with the same contents in each row
df["new_column_label"] = "String_entry"
df["new_column_label2"] = 0
# also works for setting the value of all rows in an existing column to the same thing, see https://stackoverflow.com/a/44723277/8508004
# Note if you do this after you made a new dataframe from a subset of another, you may see a `SettingWithCopyWarning:` warning,
# because Pandas is just being cautious, see https://stackoverflow.com/questions/42105859/pandas-map-to-a-new-column-settingwithcopywarning
# ; could fix with the approach [here](https://stackoverflow.com/a/45885310/8508004) if important, like:
# df = df.assign(signal= 'yes')
# See `.append` below for ADDING ROWS.
# Related: you can add a column with different contents in each
# row WITHOUT USING APPLY if you use itertuples or iterrows to
# build a list with the same number of items as the length of
# the dataframe and then add it as a new column (see the sketch below)
df["ids"] = list_of_ids
# see https://www.dataquest.io/blog/settingwithcopywarning/ for a better understanding of `SettingWithCopyWarning:` warnings.
# Rename a DataFrame column / rename column
df.rename(columns={'old_name':'new_name'}, inplace=True)
# see https://stackoverflow.com/questions/33727667/pandas-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy-of-a-slice
# because with newer Pandas and Python 3 I am seeing a warning when using `inplace`
# Better(?):
df2 = df.rename(columns={'old':'new'})
#-or-, it even seems to work as
df = df.rename(columns={'old':'new'})
# Rename several DataFrame columns
df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
})
# or use `,inplace=True` without `df =`
# Lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)
# Even more fancy DataFrame column re-naming,
# e.g., keep only the part of each column name after the last '.'
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
# Loop through rows in a DataFrame
# (if you must)
for index, row in df.iterrows():
    print(index, row['some column'])
# Much faster way to loop through DataFrame rows
# if you can work with tuples (iterate rows more efficiently)
# (h/t hughamacmullaniv)
for row in df.itertuples():
    print(row)
# see more about itertuples below
# Next few examples show how to work with text data in Pandas.
# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
# Slice values in a DataFrame column (aka Series)
df.column.str[0:2]
# Lower-case everything in a DataFrame column
df.column_name = df.column_name.str.lower()
# Get length of data in a DataFrame column
df.column_name.str.len()
# Make a column of a dataframe a Python list (df column --> to list)
lizt = df["col1"].tolist()
# Sort dataframe by multiple columns
df = df.sort_values(['col1','col2','col3'],ascending=[1,1,0])
# see `df = df.reset_index(drop=True)` in relation to this
# Sort on one column
df.sort_values('Col_name1', ascending=False, inplace=True)
# If the column you are trying to sort on is a multi-level / hierarchical column, use
# the full tuple with all levels to reference it, like
# `df.sort_values(('tmn1-5001','p-value'), ascending=False, inplace=True)`
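# A minimal sketch of that multi-level column sort (the data and the
# ('tmn1-5001', ...) column tuples are hypothetical, just for illustration):
import pandas as pd
df = pd.DataFrame([[0.03, 1.2], [0.001, 0.8]],
                  columns=pd.MultiIndex.from_tuples(
                      [('tmn1-5001', 'p-value'), ('tmn1-5001', 'fold change')]))
df.sort_values(('tmn1-5001', 'p-value'), ascending=False, inplace=True)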
# Get top n for each group of columns in a sorted dataframe
# (make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# more on sorting at the useful Jupyter notebook below, EXCEPT note `sort` there is now `sort_values`:
# http://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb#Sorting-and-Reindexing-DataFrames
# Grab DataFrame rows where a specific column is null/notnull
newdf = df[df['column'].isnull()]
# Select from DataFrame using multiple keys of a hierarchical index
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
# Change all NaNs to None (useful before
# loading to a db)
df = df.where((pd.notnull(df)), None)
# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
# from strings and changing any empty values to None
# (not especially recommended but including here b/c I had to do this in real life one time)
df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)
# Get quick count of rows in a DataFrame
len(df.index)
len(df) # <---I find this even quicker.
# change a column into the index of the dataframe:
df = df.set_index('column_name') # see http://stackoverflow.com/questions/10457584/redefining-the-index-in-a-pandas-dataframe-object
# renumber the index, useful after removing items from one dataframe to make another
df = df.reset_index(drop=True)
# use `drop=True` to not keep the old index, see https://stackoverflow.com/a/20491748/8508004
# Pivot data (with flexibility about what
# becomes a column and what stays a row).
# Syntax works on Pandas >= 0.14
pd.pivot_table(
    df,values='cell_value',
    index=['col1', 'col2', 'col3'], # these stay as columns; will fail silently if any of these cols have null values
    columns=['col4']) # data values in this column become their own column
# example of re-orienting a dataframe, based on https://stackoverflow.com/questions/28337117/how-to-pivot-a-dataframe-in-pandas
reoriented_df = pd.pivot_table(count_of_types_df, values = 'count', columns = 'qseqid').reset_index()
reoriented_df = reoriented_df[["G1","V1","M1","M7'","M8","M9'","M11''","M15","M14"]]
reoriented_df["TOTAL"] = reoriented_df.sum(1)
# that was AFTER the below had generated counts for BLAST results
count_of_types_df = blast_df['qseqid'].value_counts().reset_index()
count_of_types_df.columns = ['qseqid', 'count']
# Change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric values throughout a DataFrame:
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
# Do find/replace on a string throughout a DataFrame
df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
# to restrict changes to a specific column, you can do
df.the_col = df.the_col.replace({'OLD_TEXT': 'NEW_TEXT'})
# more replace examples (note each bracket replacement needs its own `.str`)
df.col_x.str.replace(r'\[', '(').str.replace(r'\]', ')')
df.col_x.str.replace(r'[\[\]]', '')
# Do find/replace on a string restricted to a column, using regex
# 'ZEB1/ZEB1_cerevisiae_extracted.clustal' ---> 'ZEB1'
df['col_name_here'].replace({r"(ZEB\d)/.*": "\\1"}, regex=True, inplace=True) # see https://stackoverflow.com/a/41473130/8508004
#-or
#df['col_name_here'].replace({r"(ZEB\d)/.*": r"\1"}, regex=True, inplace=True) # see https://stackoverflow.com/a/41473130/8508004
# Set DataFrame column values based on other column values (h/t: @mlevkov), i.e., change values
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
df.loc[(df['column1'] == some_value), ['column_to_change']] = new_value
df1.loc[df1['stream'] == 2, 'feat'] = 10
df1.loc[df1['stream'] == 2, ['feat','another_feat']] = 'aaaa'
# conditional value setting of a new column
df.loc[df.col > 0, 'new_col'] = 'positif'
df.loc[~(df.col > 0), 'new_col'] = 'negatif' # note the parentheses; a bare `~` binds tighter than `>` (this version also matches 0)
df.loc[df.col < 0, 'new_col'] = 'negatif' # alternative that leaves rows equal to 0 untouched
# Clean up missing values in multiple DataFrame columns
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example)
# (h/t @makmanalp for improving this one!)
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
# Concatenate / combine multiple dataframes, without regard to index, for each grouping
df = pd.concat([df1,df2], ignore_index=True)
# I use this 'concat/ignore_index=True' approach often when 'stacking' two dataframes that have the same columns
# Similarly, `.concat()` is also great for combining dataframes into one when they are in a list and all have the same columns.
df = pd.concat(list_of_dataframes) # an example if you don't care how the indexes are left in the resulting dataframe
# Merge / combine / join / concatenate multiple dataframes
new_df = pd.merge(df1,df2,on='gene')
# For two or more (especially FOR MORE)...
# For cleaner looks you can chain them, https://stackoverflow.com/questions/23668427/pandas-joining-multiple-dataframes-on-columns#comment36377024_23671390
new_df = df1.merge(df2,on='gene').merge(df3,on='gene')
# limit the merging / combining / joining to certain columns of the contributing dataframes
new_df = pd.merge(df1[['col1','col4']],df2[['col1','col4']],on='gene')
#-OR-
new_df = df1[['col1','col4']].merge(df2[['col1','col4']],on='gene')
# Combining merge (with extracting) and renaming columns for better tracking of the source in the new dataframe
df_m = pd.merge(df1[['gene','column_name']].rename(columns = {'column_name' : 'New_name'}), mitoWTRep3_df[['gene','column_name']].rename(columns = {'column_name' : 'New_name2'}), on = 'gene')
# Note this is an example of a way to extract just two of the columns from a dataframe that had more columns than those two, to make the new dataframe.
# Or do the renaming and combining this way:
df = pd.concat([s3, s4, s5], axis=1, keys=['red','blue','yellow'])
# "A fairly common use of the keys argument is to override the column names when creating a new DataFrame based on existing
# Series. Notice how the default behaviour consists on letting the resulting DataFrame inherit the parent Series' name, when these existed."
# -or-
pd.concat(dict(df1 = df1, df2 = df2),axis=1) # from https://stackoverflow.com/a/15990537
# from http://pandas-docs.github.io/pandas-docs-travis/merging.html#more-concatenating-with-group-keys
# example there places the two dataframes side-by-side (example there adds multi-level columns to distinguish),
# instead of stacking (see above for stacking)
# -or-
pd.concat((df1, df2),axis=1) # to not make multi-level column names, but place side-by-side, otherwise similar to
# http://pandas-docs.github.io/pandas-docs-travis/merging.html#more-concatenating-with-group-keys
# Set up / Start / initialize a dataframe with certain columns for subsequently adding rows
df = pd.DataFrame(columns=['col1','col2','col3','col4'])
# and add rows to it (one way to do it; on issues with `.append` when iterating over a list of dataframes, see `z_issues_and_workarounds.py`)
df = df.append(
    {'col1':'string1','col2':value,'col3':value2,'col4':value3},
    ignore_index=True) # based on http://pandas.pydata.org/pandas-docs/stable/merging.html#appending-rows-to-a-dataframe
# Building a dataframe a row at a time with append like that is not the recommended way, see https://stackoverflow.com/a/25376997/8508004 (VERY SLOW!),
# but I found it worked when iterating over a list of dataframes, see `z_issues_and_workarounds.py`
df.loc[len(df)]=['Text_for_Col1','Text_for_Col2','Text_for_Col3', a_value]
# The recommended way is at https://stackoverflow.com/a/17496530/8508004 (build a list of rows, then
# make the DataFrame once at the end), but I don't know how amenable that
# is to cases where you iterate over several DataFrames; a sketch follows:
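# A minimal sketch of that recommended pattern (collect plain Python rows in
# a list, then construct the DataFrame once); the row values here are hypothetical:
import pandas as pd
rows = []
for i in range(3):
    rows.append({'col1': 'string{}'.format(i), 'col2': i, 'col3': i * 2, 'col4': i ** 2})
df = pd.DataFrame(rows, columns=['col1', 'col2', 'col3', 'col4'])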
# Doing calculations with DataFrame columns that have missing values
# In the example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
# set a column value based on another column value
df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
# apply a function that uses the value in a single column to each row of a dataframe, placing the result in a new column
df['new col'] = df['col1'].apply(<user_defined_function>)
# I think if you used the same column name it would replace that column. # based on
# http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/
# "This is useful when cleaning up data - converting formats, altering values etc."
# Similar to the last example, but calculating with more than one column
import statistics
df['std dev'] = df[['col1_name','col2_name']].apply(statistics.pstdev, axis=1)
def midpoint(items):
    '''
    takes an iterable of items and returns the midpoint (integer) of the first
    and second values
    '''
    return int((int(items[0])+int(items[1]))/2)
df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)
# apply a function to each row of a dataframe
df = df.apply(<user_defined_function>, axis=1)
# `axis=0` for across columns
# some functions, like sum, mean, max, min, etc., are built in to Pandas and allow a shorthand call with the axis
df["mean"] = df.mean(1)
# adds a column of the mean across the row to each row in a dataframe
# `axis=0` for down columns
# another example
df_c['mean'] = df[['col1_name','col2_name']].mean(1)
# an example for down the columns
avg_length = df.mean(0).length # see next too
avg_length = df[['length']].mean(0).length # same thing as above but subset first
# Pandas has mode but it is presently awkward about what it requires and returns. In this
# example 'length' is a column name. (based on https://stackoverflow.com/a/52532322/8508004 );
# it wouldn't take axis as input used this way, and without the `[0]` it returned a Series.
the_mode_of_length_column = df.length.mode()[0]
# Use `.apply()` to return multiple columns. Example also illustrates passing additional info
# during use of `.apply()` using `args`. Returning multiple columns based on https://stackoverflow.com/a/43770075/8508004 .
# `args` passes additional positional arguments to the applied function.
def example(row_items, info_dict):
    '''
    Toy example.
    Takes a dataframe row that contains a midpoint value and a 'num' value within
    it and a dict where keys are midpoints and
    values are a list (for this toy example) of extra info about each midpoint
    (actually correspond to info from rows of a different dataframe).
    Returns the row with multiple new columns added.
    based on https://stackoverflow.com/a/43770075/8508004
    '''
    smallest_difference = float('Inf')
    closest_midpt = None
    for midpt in info_dict:
        if abs(row_items.midpoint - midpt) < smallest_difference:
            smallest_difference = abs(row_items.midpoint - midpt)
            closest_midpt = midpt # track which midpoint was closest, for use below
    row_items['other_item_added_to_sq'] = row_items['num']**2 + info_dict[closest_midpt][0]
    row_items['other_item_added_to_cubed'] = row_items['num']**3 + info_dict[closest_midpt][1]
    row_items['other_item_added_to_4power'] = row_items['num']**4 + info_dict[closest_midpt][2]
    return row_items
df = df.apply(example, args=(other_info_dict,), axis=1)
# Split delimited values in a DataFrame column into two new columns
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
# Split a Pandas DataFrame into two random subsets: (from https://twitter.com/python_tip/status/951829597523456000)
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)
# Collapse hierarchical (multilevel / multi-level) column indexes
df.columns = df.columns.get_level_values(0)
# Make a dataframe that is a count of the frequency of items (moving towards a distribution accounting)
df = df['amount rows shifted'].value_counts().reset_index() # the values in the
# column ('amount rows shifted' in this case) were integers, but they could be strings or even floats, see https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html
df.columns = ['amount rows shifted', 'count'] # FIX OF COLUMN NAMES AFTER, based on https://stackoverflow.com/a/35893235/8508004
# END ABOVE IF YOU JUST NEED A DATAFRAME.
# Note though, if you just want a plot of the amount of rows shifted, you can use the below, assuming there is only one column and it corresponds to the amount rows shifted:
ax = df.hist();
ax[0][0].set_xlabel("shift") # from https://stackoverflow.com/a/43239901/8508004
ax[0][0].set_ylabel("count") # from https://stackoverflow.com/a/43239901/8508004
ax[0][0].set_title("Distribution of shifts") # just tried it based on matplotlib and how the axis labels were set above
# or as shown in current documentation, and combining with matplotlib settings
ax = df.plot.hist(ec=(0.3,0.3,0.3,0.65),legend=False)
ax.set_xlabel("shift")
ax.set_ylabel("frequency")
ax.set_title("Distribution of shifts");
# Convert Django queryset to DataFrame
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
# Create a DataFrame from a Python dictionary
df = pd.DataFrame({ 'Id' : ["Charger","Ram","Pacer","Elantra","Camaro","Porsche 911"],
                    'Speed':[30,35,31,20,25,80]
})
# can change the order of columns by providing a columns list in the order wanted, such as `, columns = ['Speed', 'Id']` between the dictionary's closing curly bracket and the DataFrame method's closing parenthesis, like so:
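# For example (same toy data as above, with an explicit column order):
df = pd.DataFrame({'Id': ["Charger","Ram","Pacer","Elantra","Camaro","Porsche 911"],
                   'Speed': [30,35,31,20,25,80]},
                  columns=['Speed', 'Id']) # 'Speed' now comes before 'Id'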
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
# - or -
# (THIS NEXT ONE IS FROM THE ORIGINAL SNIPPETS REPO, BUT IT IS LIMITED TO A TWO-COLUMN
# DataFrame, SINCE `dict.items()` YIELDS KEY-VALUE PAIRS)
a_dictionary = {"April":"Grass", "May":"Flowers","June":"Corn"}
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
# Other dictionary-to-dataframe examples, from https://stackoverflow.com/questions/41192401/python-dictionary-to-pandas-dataframe-with-multiple-columns
# i.e., multiple columns
from collections import Counter
d= {'data' : Counter({ 'important' : 2,
                       'very' : 3}),
    'analytics' : Counter({ 'boring' : 5,
                            'sleep' : 3})
}
df = pd.DataFrame(d).stack().reset_index()
df.columns = ['word','category','count']
# -or-
df = pd.DataFrame.from_dict(d) # (the original line was truncated; presumably `from_dict`, which makes the dict keys the columns)
# read csv as dataframe
df = pd.read_csv('my_file.csv', usecols=['a', 'c'])
df = pd.read_csv('my_file.csv', usecols=[0, 2])
df = pd.read_csv('my_file.csv', nrows=3)
z_issues_and_workarounds.py
# These may simply be a result of my misunderstanding, stumbling through non-optimal / non-pythonic solutions, bad coding, or lack of research, but here are some issues I encountered.
# Workarounds are provided when / if I solved them.
# COULD NOT ITERATE OVER A LIST OF DATAFRAMES AND ADD A ROW TO EACH WITH `.append`
# For each dataframe I wanted to make a TOTAL combined entry for an element using components a and b.
# It seemed like this should be doable in a loop.
# To do one, I can do this:
'''
elem_meanlength = 1
elem_meaneff = 1
elem_sumTPM = 1
elem_sumNumReads = 1
total_df = total_df.append(
    {'Name':"Elem_total",'Length':elem_meanlength,'EffectiveLength':elem_meaneff,'TPM':elem_sumTPM,'NumReads':elem_sumNumReads},
    ignore_index=True) # based on http://pandas.pydata.org/pandas-docs/stable/merging.html#appending-rows-to-a-dataframe
#print(total_df) # ONLY FOR DEBUGGING
'''
# But `.append` seems to only update a copy when I tried to set it up for iterating; it doesn't alter the
# original. Find/replace worked in a loop (see BELOW) but used "inplace".
# Find/replace that worked in a loop:
#-----------------
#list_of_dataframes = [total_df, another_df, yet_another_df]
#for each_df in list_of_dataframes:
#    each_df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
#    #print(each_df) # FOR DEBUGGING ONLY
#-----------------
# COULDN'T COME UP WITH A SOLUTION IN A TIMELY MANNER AT FIRST, BUT LOOKED MORE.
# By searching `pandas append a row to dataframe not a copy` I finally found Jun's answer at
# https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360#19368360
# & it looked amenable to looping through several dataframes. Tested:
'''
list_of_dataframes = [total_df, another_df, yet_another_df]
print(total_df) # ONLY FOR DEBUGGING
elem_meanlength = 1
elem_meaneff = 1
elem_sumTPM = 1
elem_sumNumReads = 1
list_of_dataframes[0].loc[len(list_of_dataframes[0])]= ["Elem_total",elem_meanlength,elem_meaneff,elem_sumTPM,elem_sumNumReads] # based on Jun's answer at https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360#19368360
print(list_of_dataframes[0])
print(total_df) # ONLY FOR DEBUGGING
'''
# THE WORKAROUND FOR THAT
# That solution (plus the find/replace) implemented:
for indx, each_df in enumerate(list_of_dataframes):
    each_df.replace({'OLD_TEXT': 'NEW_TEXT'}, regex=True, inplace = True)
    #print(each_df) # FOR DEBUGGING
    #print(each_df[each_df.Name.str.contains("ID")]) # FOR DEBUGGING, shows matches if "IDa", "IDab", etc.
    elem_meanlength = each_df[each_df.Name.str.contains("ID")].mean(0).Length
    elem_meaneff = each_df[each_df.Name.str.contains("ID")].mean(0).EffectiveLength
    elem_sumTPM = each_df[each_df.Name.str.contains("ID")].sum(0).TPM
    elem_sumNumReads = each_df[each_df.Name.str.contains("ID")].sum(0).NumReads
    list_of_dataframes[indx].loc[len(list_of_dataframes[indx])]= ["Elem_total",elem_meanlength,elem_meaneff,elem_sumTPM,elem_sumNumReads] # based on Jun's answer at https://stackoverflow.com/questions/19365513/how-to-add-an-extra-row-to-a-pandas-dataframe/19368360
# BUT DON'T USE THIS FOR A REAL, REAL LOT OF DATAFRAMES OR A LOT OF LARGE ONES. SUPER SLOW. See https://stackoverflow.com/a/17496530/8508004 for the recommended way (build a list of rows, then make the DataFrame once), though I don't know if it is amenable to iterating over a list of DataFrames.
# CANNOT USE `sample` as a column name if you want to be able to call that column using attribute notation,
# because `pandas.DataFrame.sample` is a method on the DataFrame.
# Either change the column name using `df.rename(columns={'old_name':'new_name'}, inplace=True)`
# -or use standard bracket notation like this (compare with the example of `.str.contains(pattern)` in the snippets file):
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'sample': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
print (df)
pattern = '|'.join(['one', 'two'])
df = df[df['sample'].str.contains(pattern)]
df['sample'].str.contains(pattern)
# if working with column names that contain spaces, use bracket notation to select and
# not attribute notation, unless you want to change the column names first (see `df.rename(columns={'old':'new'})`)
val = df[df.col3.str.contains(r'text\dmoretext')].mean(0)['source values']