Last active
October 30, 2022 07:11
-
-
Save fomightez/0ff6c709ab99bd626cd67fcab03d3ef2 to your computer and use it in GitHub Desktop.
useful argparse snippets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# from `get_seq_following_seq_from_FASTA.py`
if __name__ == "__main__" and '__file__' in globals():
    """ This is executed when run from the command line """
    # Code with just `if __name__ == "__main__":` alone will be run if pasted
    # into a notebook. The addition of ` and '__file__' in globals()` is based
    # on https://stackoverflow.com/a/22923872/8508004
    # See also https://stackoverflow.com/a/22424821/8508004 for an option to
    # provide arguments when prototyping a full script in the notebook.
    ###-----------------for parsing command line arguments-------------------###
    import argparse
    # `sys` is needed below for `sys.argv` / `sys.exit`; imported locally,
    # like `argparse`, so this block does not rely on an import made elsewhere.
    import sys

    # Help text is assembled with implicit string concatenation so that no
    # source-code indentation ends up inside the strings.
    parser = argparse.ArgumentParser(
        prog='get_seq_following_seq_from_FASTA.py',
        description=(
            "get_seq_following_seq_from_FASTA.py "
            "takes a sequence pattern string, a sequence file (FASTA-format), "
            "and a record id and extracts a sequence of specified size "
            "following the sequence pattern. Importantly, the regular "
            "expression search term syntax is acceptable in the provided "
            "sequence pattern, although anything dealing with case will be "
            "ignored. (The FASTA-formatted sequence file is assumed by "
            "default to be a multi-FASTA, i.e., multiple sequences in the "
            "provided file, although it definitely doesn't have to be. In "
            "case it is only a single sequence, the record id becomes moot, "
            "see below.) A sequence string of the specified length will be "
            "returned. Redirect the output to a file if that is what is "
            "needed. **** Script by Wayne Decatur "
            "(fomightez @ github) ***"))

    # Required positional arguments, in the order they are given on the
    # command line.
    parser.add_argument(
        "sequence_file", metavar="SEQUENCE_FILE",
        help=("Name of sequence file to use as input. Must be FASTA format. "
              "Can be a multi-FASTA file, i.e., multiple sequences in FASTA "
              "format in one file."))
    parser.add_argument(
        "record_id", metavar="RECORD_ID",
        help=("Specific identifier of sequence entry in sequence file to "
              "mine. If the provided sequence file only contains one "
              "sequence, that sequence will be mined and what is provided "
              "for this parameter will be ignored. In other words, if the "
              "sequence file is not a multi-FASTA file, you don't need to "
              "determine the identifier and can instead just enter "
              "`blahblah` or any other nonsensical string in this spot."))
    parser.add_argument(
        "pattern", metavar="PATTERN",
        help=("Sequence or sequence pattern to use to locate site after "
              "which to get the sequence. Regular expressions are accepted "
              "here; however any information about case will be ignored as "
              "the provided sequence pattern and sequence will both be "
              "converted to lower case to check for a match."))
    parser.add_argument(
        "amount_to_get", type=int, metavar="NUMBER_TO_GET",
        help=("Number (integer) of residues to retrieve following the match "
              "to the sequence. The length of this sequence is to be given "
              "in common terms, where the first item is referenced as `1`, "
              "and so with a provided argument of `1` a single residue "
              "following the match would be returned."))

    # Optional flag; `store_true` makes `args.leave_dashes` a boolean.
    parser.add_argument(
        '-ld', '--leave_dashes', action="store_true",
        help=("Add this flag when calling the script in order to be able to "
              "use gaps (represented as dashes) in the pattern required to "
              "match. I.E., for matching with an aligned FASTA file. "
              "(***ATYPICAL.***)"))

    # Show the full help instead of argparse's terse error when the script is
    # called with no arguments at all, since the positional arguments are
    # required. From
    # http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    sequence_file = args.sequence_file
    record_id = args.record_id
    seq_to_find = args.pattern
    amount_to_get = args.amount_to_get
    # Dashes are filtered unless the user explicitly asked to leave them.
    filter_dashes = not args.leave_dashes
# from plot_expression_across_chromosomes.py
# Builds the command-line interface and parses it into `args`.
# Relies on `argparse`, `sys`, and `y_cutoff` being defined earlier in the
# script (not shown in this snippet).
# Help text is assembled with implicit string concatenation so that no
# source-code indentation ends up inside the strings.
parser = argparse.ArgumentParser(
    prog='plot_expression_across_chromosomes.py',
    description=(
        "plot_expression_across_chromosomes.py plots a ratio of "
        "expression values across chromosomes or scaffolds of a genome to "
        "highlight regions of deviation. Besides the options listed here, "
        "there are several `USER ADJUSTABLE VALUES` inside the script that "
        "can be edited for easy customization. A similar plot is called a "
        "Manhattan plot and this implementation borrows the plotting "
        "approach and some of the features from Brent Pedersen's awesome "
        "`manhattan-plot.py` script. **** Script by Wayne Decatur "
        "(fomightez @ github) ***"))

# Required positional input files; `argparse.FileType('r')` makes argparse
# open them for reading, so `args.annotation` / `args.data` are file objects.
parser.add_argument(
    "annotation", type=argparse.FileType('r'), metavar="ANNOTATION_FILE",
    help=("Name of file containing the genome annotation. REQUIRED. This is "
          "needed to determine the order of individual data points along "
          "the chromosome and how to display the data across chromosomes or "
          "scaffolds."))
parser.add_argument(
    "data", type=argparse.FileType('r'), metavar="DATA_FILE",
    help=("Name of file containing the summarized data to plot, such as "
          "mean TPM or RPKM, etc. in tab-delimited form. REQUIRED. See my "
          "script `plot_expression_across_chromosomes_from_raw.py` if you "
          "want to supply the individual `raw` data files with the level "
          "metric for each sample and/or replicate."))

# Comma-separated list passed as a single string; approach based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
parser.add_argument(
    '-cols', '--columns', action='store', type=str, default='1,2,3',
    help=("columns for gene, wild-type (baseline state) expression value, "
          "experimental condition expression value, in that order. This "
          "flag is used to specify the data in the summary file to be "
          "plotted. Separate the column identifiers by commas, without "
          "spaces. Default is `1,2,3`, where `1` indicates the first "
          "column, i.e., how you'd refer to the columns in natural language "
          "(no zero-indexing). "))
parser.add_argument(
    "-l", "--lines", action="store_true",
    help=("add this flag to plot the expression level ratio value as lines "
          "extending from the x-axis rather than points in space. (The "
          "resulting aesthetic may resemble a city skyline for which the "
          "`manhattan plot` is named.)"))

# Comma-separated list passed as a single string; approach based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
parser.add_argument(
    '-chr', '--chrs', action='store', type=str,
    help=("use this flag to limit plotting of the data to particular "
          "chromosomes or scaffolds you specify immediately following this "
          "flag. Separate the chromosome or scaffold identifiers by commas, "
          "without spaces. Example use in a command is `--chrs I,IV,XVI`. "
          "Default when this optional flag is not called is to plot that "
          "data for all chromosomes or scaffolds. "))
parser.add_argument(
    "-nl", "--no_log", action="store_true",
    help=("add this flag to keep the expression level ratio to be plotted "
          "in the common base 10 instead of converting to log2."))
parser.add_argument(
    "-nlim", "--no_limits", action="store_true",
    help=("add this flag to not impose a limit of above and below {} in "
          "plot window when converting to log2. The cutoff can also be "
          "adjusted under `user-adjustable settings` in the script. Issuing "
          "this flag has no effect if all values are within +/- the cutoff "
          "interval or `--no_log` is used.".format(y_cutoff)))
parser.add_argument(
    "-s", "--smooth", action="store_true",
    help=("add this flag to display a smoothing curve fit to the data "
          "points (LOWESS) on a per chromosome basis. This option can "
          "enhance visualization of deviations characteristic of aneuploidy "
          "and copy number variation across the genome, both within and "
          "between chromosomes. Additionally, a simplistically-based "
          "assessment will be made for aneuploidy at the chromosome or "
          "scaffold level and a notice will be made as the program is "
          "running if aneuploidy at the chromosome or scaffold level seems "
          "indicated by this simple metric. Further examination is "
          "warranted regardless of the result of this automated "
          "assessment."))
parser.add_argument(
    '-ed', '--exp_desig', action='store', type=str, default='experimental',
    help=("Allows changing the text used in y-axis label to reference "
          "experimental sample. Following `--exp_desig` type what you'd "
          "like to read there instead of `experimental`."))
# Raw string: `\m` is not a valid escape sequence, so the non-raw form
# triggers a warning on newer Pythons. The resulting value is unchanged.
parser.add_argument(
    '-bd', '--base_desig', action='store', type=str,
    default=r'wild\mathrm{-}type',
    help=("Allows changing the text used in y-axis label to reference "
          "wild-type or baseline sample. Following `--base_desig` type what "
          "you'd like to read there instead of `wild-type`."))
parser.add_argument(
    "-svg", "--save_vg", action="store_true",
    help=("add this flag to save as vector graphics (**RECOMMENDED FOR "
          "PUBLICATION***) instead of default png. Not default or saved "
          "alongside default because file size can get large due to the "
          "large number of points."))
parser.add_argument(
    "-ndh", "--no_data_header", action="store_true",
    help=("add this flag if there is no data header or no first line of "
          "column names in the data file. Otherwise, it is assumed there is "
          "and any item read as the first gene identifier from the first "
          "line won't be highlighted as missing from annotation. "
          "IMPORTANTLY, this only affects feedback provided as script is "
          "run. If the first line resembles data, i.e., numbers in "
          "specified columns, it will be automagically parsed as if data. "
          "Remove the header or column labels line from your summary data "
          "file on the off-chance this causes issues in your resulting "
          "plot."))
# Default given as an int to match `type=int` rather than relying on argparse
# converting a string default.
parser.add_argument(
    '-ac', '--advance_color', action='store', type=int, default=0,
    help=("**FOR ADVANCED USE.*** Allows for advancing the color selection "
          "iterator the specified number of times. The idea is it allows "
          "the ability to control the color of the chromosome when "
          "specifying a chromosome or scaffolds to plot so you could make "
          "the color match the one used when all chromosomes are plotted, "
          "if needed. Supply the number to advance after the flag on the "
          "command line. For example, `-ac 4`."))

# Show the full help instead of argparse's terse error when the script is
# called with no arguments at all, since at least the input files are needed.
# From
# http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
# Copy the parsed command-line values from `args` into the plain names used
# by the rest of the script.

# Input files.
# NOTE(review): `annotaton_file` is spelled as in the original; later code
# (not visible here) may reference this exact name, so it is left unchanged.
annotaton_file = args.annotation
data_file = args.data

# Column identifiers arrive as one comma-separated string; turn them into
# a list of ints.
data_columns_to_grab = list(map(int, args.columns.split(',')))

# Boolean flags and simple values.
no_log = args.no_log
no_limits = args.no_limits
no_data_header = args.no_data_header
lines = args.lines
display_smooth = args.smooth
save_vg = args.save_vg
advance_color_increments = args.advance_color
exp_designation = args.exp_desig
baseline_designation = args.base_desig

# When `--chrs` was supplied, always produce a list (needed for passing to
# Pandas `isin()`); `split(',')` already yields a one-item list when there is
# no comma. When it was not supplied, `limit_to_chrs` keeps the value of
# `args.chrs` itself (`None` by default).
limit_to_chrs = args.chrs.split(',') if args.chrs else args.chrs
# Optional-argument excerpts. They use `parser`, `suffix_for_saving`, and
# `df_save_as_name`, which are defined elsewhere in the originating script(s)
# and are not visible in this snippet.

# Suffix used in the output file name; the current default is echoed in the
# help text.
parser.add_argument(
    '-os', '--output_suffix',
    action='store',
    type=str,
    default=suffix_for_saving,
    help=("OPTIONAL: Set a suffix for including in file "
          "name of output. "
          "If none provided, '{}' will be used.").format(suffix_for_saving),
)

# Column of the data file holding the expression metric.
parser.add_argument(
    '-col', '--column',
    action='store',
    type=int,
    default='4',
    help=("column for expression metric in data file. Assumes a "
          "value of four to match fourth tab-separated column in "
          "Salmon-quantified RNA-Seq data."),
)

# File name for the pickled dataframe; the current default is echoed in the
# help text.
parser.add_argument(
    '-dfo', '--df_output',
    action='store',
    type=str,
    default=df_save_as_name,
    help=("OPTIONAL: Set file name for saving pickled "
          "dataframe. If none provided, '{}' will be used. To force no "
          "dataframe to be saved, enter `-dfo no_pickling` without quotes "
          "as output file (ATYPICAL).").format(df_save_as_name),
)
def valid_dibase(arg_string):
    """Validate a two-letter nucleotide pair; for use as an argparse `type`.

    Accepts `arg_string` when it is exactly two characters long and every
    character, once upper-cased, is a member of `nt_set` (defined elsewhere
    in the script; not visible in this snippet).

    Returns:
        `arg_string` unchanged (original case preserved).

    Raises:
        argparse.ArgumentTypeError: if the check fails. The exception carries
        an explanatory message, which argparse shows to the user.
    """
    if len(arg_string) == 2 and set(arg_string.upper()).issubset(nt_set):
        return arg_string
    msg = ("Not a valid set of letters representing two nucleotides: "
           "'{}'. Try something like `GC` or `AT`.".format(arg_string))
    # Raise with the explanatory message, not the bare argument, so the user
    # is told what was wrong and what a valid value looks like.
    raise argparse.ArgumentTypeError(msg)
| parser.add_argument("two_bases", type = valid_dibase, help="Two \ | |
| letters representing one set of bases to check for imbalance vs. the \ | |
| other two remaining out of the possible four from `GATC`. For example, \ | |
| providing `GC` will result in checking for imbalance of nucleotides \ | |
| `G` and `C` in close prox | |
# Optional window-size settings. They use `parser`, `chunk_size`, and
# `overlap_specified`, which are defined elsewhere in the originating script
# and are not visible in this snippet. The current defaults are echoed in the
# help text.
parser.add_argument(
    '-bl', '--block_size', action='store', type=int, default=chunk_size,
    help=("OPTIONAL: Use the `--block_size` flag followed by an integer to "
          "provide a value to use as the span size (window of basepairs) to "
          "analyze instead of the default of '{}'.".format(chunk_size)))
parser.add_argument(
    '-ov', '--overlap_size', action='store', type=int,
    default=overlap_specified,
    help=("OPTIONAL: Use the `--overlap_size` flag followed by an integer "
          "to specify the amount of overlap to use between the analysis "
          "windows instead of the default of "
          "'{}'.".format(overlap_specified)))
# Optional positional file names. They use `parser`,
# `input_file_name_default`, and `output_file_name_default`, which are defined
# elsewhere in the originating script and are not visible in this snippet.
# `nargs='?'` lets each positional be omitted, in which case its `default`
# is used.
parser.add_argument(
    "input", nargs='?', default=input_file_name_default,
    metavar="INPUT_FILE",
    help=("**OPTIONAL**Name of the file generated by Salmon when run with "
          "your transcriptome of interest. Usually, this is '{}' & if no "
          "input file name is provided then this will be used by "
          "default.".format(input_file_name_default)))
parser.add_argument(
    "output", nargs='?', default=output_file_name_default,
    metavar="OUTPUT_FILE",
    help=("**OPTIONAL**Name of file to save results. If BOTH input and "
          "output file are not provided, '{}', will be "
          "used.".format(output_file_name_default)))
# See
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments
# and
# https://docs.python.org/2/library/argparse.html#nargs for use of `nargs='?'`
# to make input and output file names optional. Note that the square brackets
# shown in the usage output signify optional according to
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments#comment40460395_4480202
# , but because they are placed under positional arguments I added clarifying
# text to the help description.
# IF MODIFYING THIS SCRIPT FOR USE ELSEWHERE AND DON'T NEED/WANT THE INPUT AND
# OUTPUT FILES TO BE OPTIONAL, remove `nargs` (& default?) BUT KEEP WHERE NOT
# USING `argparse.FileType` AND USING `with open` AS CONSIDERED MORE PYTHONIC.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment