Skip to content

Instantly share code, notes, and snippets.

@fomightez
Last active October 30, 2022 07:11
Show Gist options
  • Save fomightez/0ff6c709ab99bd626cd67fcab03d3ef2 to your computer and use it in GitHub Desktop.
Save fomightez/0ff6c709ab99bd626cd67fcab03d3ef2 to your computer and use it in GitHub Desktop.
useful argparse snippets
# from `get_seq_following_seq_from_FASTA.py`
if __name__ == "__main__" and '__file__' in globals():
    """ This is executed when run from the command line """
    # Code with just `if __name__ == "__main__":` alone will be run if pasted
    # into a notebook. The addition of ` and '__file__' in globals()` is based
    # on https://stackoverflow.com/a/22923872/8508004
    # See also https://stackoverflow.com/a/22424821/8508004 for an option to
    # provide arguments when prototyping a full script in the notebook.
    ###-----------------for parsing command line arguments-------------------###
    import argparse
    import sys  # `sys.argv` and `sys.exit` are used for the no-argument check

    parser = argparse.ArgumentParser(
        prog='get_seq_following_seq_from_FASTA.py',
        description=(
            "get_seq_following_seq_from_FASTA.py "
            "takes a sequence pattern string, a sequence file (FASTA-format), and a "
            "record id and extracts a sequence of specified size following the "
            "sequence pattern. Importantly, the regular expression search term "
            "syntax is acceptable in the provided sequence pattern, although "
            "anything dealing with case will be ignored. (The FASTA-formatted "
            "sequence file is assumed by default to be "
            "a multi-FASTA, i.e., multiple sequences in the provided file, "
            "although it definitely doesn't have to be. In case it is only a "
            "single sequence, the record id becomes moot, see below.) A sequence "
            "string of the specified length will be returned. Redirect the output "
            "to a file if that is what is needed. "
            "**** Script by Wayne Decatur "
            "(fomightez @ github) ***"
        ),
    )

    # Four required positional arguments, in the order they must be given.
    parser.add_argument(
        "sequence_file",
        help=(
            "Name of sequence file to "
            "use as input. Must be FASTA format. Can be a "
            "multi-FASTA file, i.e., multiple sequences in FASTA format in one "
            "file."
        ),
        metavar="SEQUENCE_FILE",
    )
    parser.add_argument(
        "record_id",
        help=(
            "Specific identifier of sequence "
            "entry in sequence file to mine. If the provided sequence file only "
            "contains one sequence, that sequence will be mined and what is provided "
            "for this parameter will be ignored. In other words, if the sequence "
            "file is not a multi-FASTA file, you don't need to determine the "
            "identifier and can instead just enter `blahblah` or any other "
            "nonsensical string in this spot."
        ),
        metavar="RECORD_ID",
    )
    parser.add_argument(
        "pattern",
        help=(
            "Sequence or sequence pattern to use "
            "to locate site after which to get the sequence. Regular expressions "
            "are accepted here; however any information about case will be ignored "
            "as the provided sequence pattern and sequence will both be converted "
            "to lower case to check for a match."
        ),
        metavar="PATTERN",
    )
    parser.add_argument(
        "amount_to_get",
        type=int,  # argparse converts the text to int and rejects non-integers
        help=(
            "Number (integer) of "
            "residues "
            "to retrieve following the match to the sequence. The length of this "
            "sequence is to be given in common terms, where the first item is "
            "referenced as `1`, and so a provided argument of "
            "`1` would mean a single residue following the match would be returned."
        ),
        metavar="NUMBER_TO_GET",
    )

    # Optional boolean flag; absent means False.
    parser.add_argument(
        '-ld', '--leave_dashes',
        help=(
            "Add this flag when "
            "calling the script in "
            "order to be able to use gaps (represented as dashes) in the pattern "
            "required to match. I.E., for matching with an aligned FASTA file. "
            "(***ATYPICAL.***)"
        ),
        action="store_true",
    )

    # Display the full help (instead of argparse's terse usage error) when the
    # script is called with no arguments at all, because the positional
    # arguments are required.
    # from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    sequence_file = args.sequence_file
    record_id = args.record_id
    seq_to_find = args.pattern
    amount_to_get = args.amount_to_get
    # `filter_dashes` is simply the inverse of the `--leave_dashes` flag.
    filter_dashes = not args.leave_dashes
# from plot_expression_across_chromosomes.py
# Build the command line parser for `plot_expression_across_chromosomes.py`.
# NOTE: `y_cutoff` is referenced in the `--no_limits` help text and must
# already be defined before this point (it is not defined in this snippet).
parser = argparse.ArgumentParser(
    prog='plot_expression_across_chromosomes.py',
    description=(
        "plot_expression_across_chromosomes.py plots a ratio of "
        "expression values across chromosomes or scaffolds of a genome to highlight "
        "regions of deviation. Besides the options listed here, there are several "
        "`USER ADJUSTABLE VALUES` inside the script that can be edited for easy "
        "customization. A similar plot is called a Manhattan plot and this "
        "implementation borrows the plotting approach and some of the features from "
        "Brent Pedersen's awesome `manhattan-plot.py` script. "
        "**** Script by Wayne Decatur "
        "(fomightez @ github) ***"
    ),
)

# Required positionals. `argparse.FileType('r')` makes argparse open the named
# file for reading, so `args.annotation` / `args.data` are file objects.
parser.add_argument(
    "annotation",
    help=(
        "Name of file containing the genome "
        "annotation. REQUIRED. This is needed to determine the order of individual "
        "data points along the chromosome and how to display the data across "
        "chromosomes or scaffolds."
    ),
    type=argparse.FileType('r'), metavar="ANNOTATION_FILE",
)
parser.add_argument(
    "data",
    help=(
        "Name of file containing the summarized data "
        "to plot, such as mean TPM or RPKM, etc. in tab-delimited form. REQUIRED. "
        "See my script `plot_expression_across_chromosomes_from_raw.py` if you want "
        "to supply the individual `raw` data files with the level metric for each "
        "sample and/or replicate."
    ),
    type=argparse.FileType('r'), metavar="DATA_FILE",
)

# Comma-separated list passed as one string; based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
parser.add_argument(
    '-cols', '--columns', action='store', type=str,
    default='1,2,3',
    help=(
        "columns for gene, wild-type (baseline state) "
        "expression value, experimental condition expression value, in that order. "
        "This flag is used to specify the data in the summary file to be plotted. "
        "Separate the column identifiers by commas, without spaces. "
        "Default is `1,2,3`, where `1` indicates the first column, i.e., how "
        "you'd refer to the columns in natural language (no zero-indexing). "
    ),
)
parser.add_argument(
    "-l", "--lines",
    help=(
        "add this flag to plot the expression level ratio value as lines "
        "extending from the x-axis rather than points in space. (The resulting "
        "aesthetic may resemble a city skyline for which the `manhattan plot` is "
        "named.)"
    ),
    action="store_true",
)

# Comma-separated list passed as one string; based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
parser.add_argument(
    '-chr', '--chrs', action='store', type=str,
    help=(
        "use this flag to limit plotting of the data to particular "
        "chromosomes or scaffolds you specify immediately following this flag. "
        "Separate the chromosome or scaffold identifiers by commas, without spaces. "
        "Example use in a command is `--chrs I,IV,XVI`. "
        "Default when this optional flag is not called is to plot that data for all "
        "chromosomes or scaffolds. "
    ),
)
parser.add_argument(
    "-nl", "--no_log",
    help=(
        "add this flag to keep the expression level ratio to be plotted in the "
        "common base 10 instead of converting to log2."
    ),
    action="store_true",
)
parser.add_argument(
    "-nlim", "--no_limits",
    help=(
        "add this flag to not impose a limit of above and below {} in plot window "
        "when converting to log2. The cutoff can also be adjusted under "
        "`user-adjustable settings` in the script. Issuing this flag has no effect "
        "if all values are within +/- the cutoff interval or `--no_log` is used."
    ).format(y_cutoff),
    action="store_true",
)
parser.add_argument(
    "-s", "--smooth",
    help=(
        "add this flag to display a smoothing curve fit to the data points "
        "(LOWESS) on a per chromosome basis. This option can enhance visualization "
        "of deviations characteristic of aneuploidy and copy number variation across "
        "the genome, both within and between chromosomes. Additionally, a "
        "simplistically-based assessment will be made for aneuploidy at the "
        "chromosome or scaffold level and a notice will be made as the program is "
        "running if aneuploidy at the chromosome or scaffold level seems indicated "
        "by this simple metric. Further examination is warranted regardless of "
        "the result of this automated assessment."
    ),
    action="store_true",
)
parser.add_argument(
    '-ed', '--exp_desig', action='store', type=str,
    default='experimental',
    help=(
        "Allows changing the text used in y-axis "
        "label to reference experimental sample. Following `--exp_desig` type what "
        "you'd like to read there instead of `experimental`."
    ),
)
parser.add_argument(
    '-bd', '--base_desig', action='store', type=str,
    # Raw string: `\m` is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning. The resulting value is unchanged.
    default=r'wild\mathrm{-}type',
    help=(
        "Allows changing the text used in y-axis "
        "label to reference wild-type or baseline sample. Following `--base_desig` "
        "type what you'd like to read there instead of `wild-type`."
    ),
)
parser.add_argument(
    "-svg", "--save_vg",
    help=(
        "add this flag to save as vector graphics "
        "(**RECOMMENDED FOR PUBLICATION***) instead of default png. Not default or "
        "saved alongside default because file size can get large due to the large "
        "number of points."
    ),
    action="store_true",
)
parser.add_argument(
    "-ndh", "--no_data_header",
    help=(
        "add this flag if there is no data header or no first line of column names "
        "in the data file. Otherwise, it is assumed there is and any item read as "
        "the first gene identifier from the first line won't be highlighted as "
        "missing from annotation. "
        "IMPORTANTLY, this only affects feedback provided as script is run. If the "
        "first line resembles data, i.e., numbers in specified columns, it will be "
        "automagically parsed as if data. Remove the header or column labels line "
        "from your summary data file on the off-chance this causes issues in your "
        "resulting plot."
    ),
    action="store_true",
)
parser.add_argument(
    '-ac', '--advance_color', action='store', type=int,
    default=0,  # an actual int, matching `type=int`
    help=(
        "**FOR ADVANCED USE.*** Allows for advancing the color "
        "selection iterator the specified number of times. The idea is it allows "
        "the ability to control the color of the chromosome when specifying "
        "a chromosome or scaffolds to plot so you could make the color match the "
        "one used when all chromosomes are plotted if needed. Supply the number to "
        "advance after the flag on the command line. For example, `-ac 4`."
    ),
)
# Also show the full help text if no arguments are provided, because at least
# one input file is needed.
# from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()

# Positional inputs.
# NOTE(review): the name is spelled `annotaton_file` (sic); it is kept as-is
# because code outside this snippet may refer to it.
annotaton_file = args.annotation
data_file = args.data

# `--columns` arrives as one comma-separated string; convert each piece to int.
data_columns_to_grab = list(map(int, args.columns.split(',')))

# Plain flags and text options copied straight into script-level names.
no_log = args.no_log
no_data_header = args.no_data_header
lines = args.lines
no_limits = args.no_limits
display_smooth = args.smooth
save_vg = args.save_vg
exp_designation = args.exp_desig
baseline_designation = args.base_desig
advance_color_increments = args.advance_color

# `--chrs` must end up as a list for passing to Pandas `isin()`. `str.split`
# already returns a one-item list when there is no comma, so a single call
# covers both the one-item and the several-item case. When the option is not
# given (or is empty) the falsy value, e.g. `None`, is kept unchanged.
limit_to_chrs = args.chrs.split(',') if args.chrs else args.chrs
# Optional settings for output naming and for the data column to read.
# `suffix_for_saving` and `df_save_as_name` supply both the defaults and the
# values shown in the help text; they must be defined before this point (they
# are not defined in this snippet).
parser.add_argument(
    '-os', '--output_suffix', action='store', type=str,
    default=suffix_for_saving,
    help=(
        "OPTIONAL: Set a suffix for including in file "
        "name of output. "
        "If none provided, '{}' will be used."
    ).format(suffix_for_saving),
)
parser.add_argument(
    '-col', '--column', action='store', type=int,
    default=4,  # an actual int, matching `type=int`
    help=(
        "column for expression metric in data file. Assumes a "
        "value of four to match fourth tab-separated column in Salmon-quantified "
        "RNA-Seq data."
    ),
)
parser.add_argument(
    '-dfo', '--df_output', action='store', type=str,
    default=df_save_as_name,
    help=(
        "OPTIONAL: Set file name for saving pickled "
        "dataframe. If none provided, '{}' will be used. To force no dataframe to "
        "be saved, enter `-dfo no_pickling` without quotes as output file "
        "(ATYPICAL)."
    ).format(df_save_as_name),
)
def valid_dibase(arg_string):
    """Validate a two-letter nucleotide pair for use as an argparse `type=`.

    The argument is accepted when it is exactly two characters long and both
    characters, after upper-casing, are members of `nt_set` (a collection of
    allowed letters that must be defined elsewhere in the script).

    Returns:
        `arg_string` unchanged (original case preserved) when valid.

    Raises:
        argparse.ArgumentTypeError: when the argument is not valid; argparse
            reports the attached message to the user as a usage error.
    """
    if len(arg_string) == 2 and set(arg_string.upper()).issubset(nt_set):
        return arg_string
    msg = ("Not a valid set of letters representing two nucleotides: "
           "'{}'. Try something like `GC` or `AT`.".format(arg_string))
    # Raise with the explanatory message, not the bare argument, so the user
    # is told what was wrong and what a valid value looks like.
    raise argparse.ArgumentTypeError(msg)
parser.add_argument("two_bases", type = valid_dibase, help="Two \
letters representing one set of bases to check for imbalance vs. the \
other two remaining out of the possible four from `GATC`. For example, \
providing `GC` will result in checking for imbalance of nucleotides \
`G` and `C` in close prox
# Optional integer settings for the analysis window and the window overlap.
# `chunk_size` and `overlap_specified` supply both the defaults and the values
# shown in the help text; they must be defined before this point (they are
# not defined in this snippet).
parser.add_argument(
    '-bl', '--block_size', action='store', type=int,
    default=chunk_size,
    help=(
        "OPTIONAL: Use the `--block_size` flag followed "
        "by an integer to provide a value to use as the span size (window of "
        "basepairs) to analyze instead of the default of '{}'."
    ).format(chunk_size),
)
parser.add_argument(
    '-ov', '--overlap_size', action='store', type=int,
    default=overlap_specified,
    help=(
        "OPTIONAL: Use the `--overlap_size` "
        "flag followed by an integer to specify the amount of overlap to use "
        "between the "
        "analysis windows instead of the default of '{}'."
    ).format(overlap_specified),
)
# Optional positional arguments: `nargs='?'` lets each one be omitted, in
# which case its `default` is used. `input_file_name_default` and
# `output_file_name_default` must be defined before this point (they are not
# defined in this snippet).
parser.add_argument(
    "input", nargs='?',
    help=(
        "**OPTIONAL**Name of the file "
        "generated by Salmon "
        "when run with your transcriptome of interest. Usually, this is "
        "'" + input_file_name_default + "' & "
        "if no input file name is provided then this will be used by "
        "default."
    ),
    default=input_file_name_default, metavar="INPUT_FILE",
)
parser.add_argument(
    "output", nargs='?',
    help=(
        "**OPTIONAL**Name of file to "
        "save results. If BOTH input "
        "and output file are not provided, '" + output_file_name_default + "', will "
        "be used."
    ),
    default=output_file_name_default, metavar="OUTPUT_FILE",
)
# See
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments
# and
# https://docs.python.org/2/library/argparse.html#nargs for use of `nargs='?'`
# to make input and output file names optional. Note that the square brackets
# shown in the usage output signify optional according to
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments#comment40460395_4480202
# , but because they are placed under positional arguments I added clarifying
# text to the help description.
# IF MODIFYING THIS SCRIPT FOR USE ELSEWHERE AND DON'T NEED/WANT THE INPUT AND
# OUTPUT FILES TO BE OPTIONAL, remove `nargs` (& default?) BUT KEEP WHERE NOT
# USING `argparse.FileType` AND USING `with open` AS CONSIDERED MORE PYTHONIC.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment