Last active
April 21, 2020 02:51
-
-
Save fomightez/324b7446dc08e56c83fa2d7af2b89a33 to your computer and use it in GitHub Desktop.
examples of my frequently used Python functions and slight variations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # generate output file name | |
def generate_output_file_name(id_of_seq_element, id_of_seq_scanned):
    '''
    Build the name of the tab-separated output file for a sequence scan.

    The name has the form "{prefix}_{element id}_{scanned id}.tsv". The
    prefix is the module-level `output_file_name_prefix`, which can be
    adjusted under the section ' USER ADJUSTABLE VALUES '.

    If `id_of_seq_element` is empty or None, the module-level sequence
    `element` itself is used as the identifier, limited to its first
    `limit_of_name` characters.

    Specific examples
    =================
    Calling function with
        ("elem1","chrmt")
    returns
        "seq_account_elem1_chrmt.tsv"

    Calling function with
        (None,"chrmt")
    returns
        "seq_account_GAATTC_chrmt.tsv"
    if `GAATTC` happened to be the provided sequence element.
    '''
    if id_of_seq_element:
        elem_id = id_of_seq_element
    else:
        # Slicing leaves elements shorter than the limit untouched, so no
        # separate length check is needed.
        elem_id = element[:limit_of_name]
    # Format with the resolved identifier. Passing `id_of_seq_element` here
    # would put the text "None" in the name whenever no id was supplied.
    return "{prefix}_{elem_id}_{seq}.tsv".format(
        prefix=output_file_name_prefix, elem_id=elem_id,
        seq=id_of_seq_scanned)
def generate_output_file_name(file_name):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. The generated name is the original name with the
    module-level `suffix_for_saving` inserted in front of the extension.
    When the original name has no extension, `.pkl` is used.

    Specific example
    =================
    Calling function with
        ("mito_transcripts_dataframeplusSUTs.pkl")
    returns
        "mito_transcripts_dataframeplusSUTs_n_XUTs.pkl"
    when `suffix_for_saving` is "_n_XUTs".
    '''
    # Splitting approach from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    main_part_of_name, file_extension = os.path.splitext(file_name)
    # Decide on the extension `splitext` found, not on whether a '.' occurs
    # anywhere in the name: a dot in a directory name or a leading-dot file
    # name is not an extension and still needs the default one.
    if not file_extension:
        file_extension = ".pkl"
    return main_part_of_name + suffix_for_saving + file_extension
def generate_output_file_name(file_name):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. A name that already has an extension (any extension) is
    returned unchanged; a name without one gets `.pkl` appended.

    Specific examples
    =================
    Calling function with
        ("test.pkl")
    returns
        "test.pkl"

    Calling function with
        ("test")
    returns
        "test.pkl"
    '''
    # Extension detection from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    _, file_extension = os.path.splitext(file_name)
    if file_extension:
        return file_name
    # With no extension, the root returned by `splitext` is the whole name,
    # so appending to `file_name` directly is equivalent.
    return file_name + ".pkl"
| # FASTA READING | |
# Read a sequence from a FASTA file using Biopython, treating `source` as a
# file path.
# Opening the file in a `with` block follows "Sequence Input" at
# http://biopython.org/wiki/SeqIO ; the rest is modelled on
# https://www.biostars.org/p/209383/, with plain text mode because the `rU`
# mode is deprecated.
with open(source) as handle:
    # Every record rebinds `fasta_seq`, so after the loop it holds the
    # sequence of the last record in the file.
    for record in SeqIO.parse(handle, "fasta"):
        fasta_seq = record.seq
| # Reading a single sequence from URL using Biopython | |
| from Bio import SeqIO | |
# FASTA file fetched by the example below.
source = (
    "https://downloads.yeastgenome.org/sequence/S288C_reference/"
    "chromosomes/fasta/chrmt.fsa"
)
def get_seq_from_URL(url):
    '''
    Takes a URL pointing at a FASTA file with a single record and returns
    the sequence of that record.

    The response body is decoded as UTF-8 and handed to `SeqIO.read()`.
    Per the Biopython documentation, `SeqIO.read()` is meant for input that
    holds exactly one record and treats anything else as an error (a
    ValueError is raised).

    Returns the `seq` attribute of the parsed record.
    '''
    from contextlib import closing
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    # Import that works on both Python 2 and 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` releases the connection once the body has been read; it is
    # used instead of a bare `with urlopen(...)` because the Python 2
    # response object is not a context manager.
    with closing(urlopen(url)) as response:
        fasta_text = response.read().decode(encoding='UTF-8')
    # The text is wrapped in `StringIO()` so Biopython sees a handle; passing
    # the raw text makes it try to treat the content as a (very long) file
    # name.
    record = SeqIO.read(StringIO(fasta_text), "fasta")
    return record.seq
# Demo: fetch the sequence at `source` and report its length.
t = get_seq_from_URL(source)
print(len(t))
| # Reading potentially multiple sequences from URL but only taking first one using Biopython | |
| from Bio import SeqIO | |
# FASTA file fetched by the example below.
source = (
    "https://downloads.yeastgenome.org/sequence/S288C_reference/"
    "chromosomes/fasta/chrmt.fsa"
)
def get_seq_from_URL(url):
    '''
    Takes a URL pointing at a FASTA file and returns the sequence of the
    first record in it.

    The response body is decoded as UTF-8 and parsed with `SeqIO.parse()`;
    only the first record is consumed, so any further records in the file
    are ignored instead of being treated as an error (which is what
    `SeqIO.read()` would do).

    Returns the `seq` attribute of the first record. If the input holds no
    records at all, `next()` raises StopIteration.
    '''
    from contextlib import closing
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    # Import that works on both Python 2 and 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` releases the connection once the body has been read; it is
    # used instead of a bare `with urlopen(...)` because the Python 2
    # response object is not a context manager.
    with closing(urlopen(url)) as response:
        fasta_text = response.read().decode(encoding='UTF-8')
    # The text is wrapped in `StringIO()` so Biopython sees a handle; passing
    # the raw text makes it try to treat the content as a (very long) file
    # name.
    fasta_iterator = SeqIO.parse(StringIO(fasta_text), "fasta")
    # Taking the first FASTA-formatted entry with `next()` is based on
    # http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
    record = next(fasta_iterator)
    return record.seq
# Demo: fetch the sequence at `source` and report its length.
t = get_seq_from_URL(source)
print(len(t))
| # Reading a single sequence from URL not using Biopython | |
| from Bio import SeqIO | |
# FASTA file fetched by the example below.
source = (
    "https://downloads.yeastgenome.org/sequence/S288C_reference/"
    "chromosomes/fasta/chrmt.fsa"
)
def get_seq_from_URL(url):
    '''
    Takes a URL, downloads its content, decodes it as UTF-8 and prints the
    resulting text.

    No FASTA parsing is done here: the description line and line breaks are
    still part of the printed text, so it needs more parsing before it is a
    usable sequence.

    Function returns None.
    '''
    from contextlib import closing
    # Import that works on both Python 2 and 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` releases the connection once the body has been read; it is
    # used instead of a bare `with urlopen(...)` because the Python 2
    # response object is not a context manager.
    with closing(urlopen(url)) as response:
        needs_more_parsing = response.read().decode(encoding='UTF-8')
    print(needs_more_parsing)
# Demo: download the text at `source`; the function prints it itself.
get_seq_from_URL(source)
| #make and save tab-separated values / tab-delimited TSV/ .tab output file | |
def make_and_save_karyotype(chromosomes_and_length, species_code):
    '''
    Takes a dictionary of chromosome identifiers and length and makes a
    karyotype file with that information, written to the path held in the
    module-level `output_file_name`.

    Each chromosome becomes one tab-separated line. Result will look like
    this at start of output file:
    chr - Sc-chrIV chrIV 0 1531933 black
    chr - Sc-chrXV chrXV 0 1091291 black
    ...

    Lines are separated by a newline and the last line has no trailing
    newline. A short confirmation is written to stderr afterwards.

    Function returns None.
    '''
    karyotype_lines = [
        "chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
        "\t{length}\tblack".format(
            species_code=species_code, chrom=chrom, length=length)
        for chrom, length in chromosomes_and_length.items()
    ]
    # Joining puts the newline between lines only, so no index bookkeeping
    # is needed to keep the last line free of a trailing newline.
    with open(output_file_name, 'w') as output_file:
        output_file.write("\n".join(karyotype_lines))
    sys.stderr.write("\n\nThe karyotype file for {} chromosomes has been saved "
        "as a file named"
        " '{}'.".format(len(chromosomes_and_length), output_file_name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment