Skip to content

Instantly share code, notes, and snippets.

@fomightez
Last active April 21, 2020 02:51
Show Gist options
  • Save fomightez/324b7446dc08e56c83fa2d7af2b89a33 to your computer and use it in GitHub Desktop.
Save fomightez/324b7446dc08e56c83fa2d7af2b89a33 to your computer and use it in GitHub Desktop.
examples of my frequently used Python functions and slight variations
# generate output file name
def generate_output_file_name(id_of_seq_element,id_of_seq_scanned):
    '''
    Takes an identifier for the sequence element and an identifier for the
    sequence scanned and returns a string for the name of the output file.
    The generated name is based on a prefix that can be adjusted under the
    section ' USER ADJUSTABLE VALUES ' (`output_file_name_prefix`), plus the
    provided text in the function call.

    If there is no `id_of_seq_element` specified (None or empty), the
    sequence element itself (module-level `element`) will be used, limited to
    the first number of bases specified in `limit_of_name`.

    Specific examples
    =================
    (assuming the prefix is set to "seq_account")
    Calling function with
        ("elem1","chrmt")
    returns
        "seq_account_elem1_chrmt.tsv"

    Calling function with
        (None,"chrmt")
    returns
        "seq_account_GAATTC_chrmt.tsv"
    if `GAATTC` happened to be the provided sequence element.
    '''
    if id_of_seq_element:
        elem_id = id_of_seq_element
    else:
        # Slicing already leaves an element shorter than the limit untouched,
        # so no separate length check is needed.
        elem_id = element[:limit_of_name]
    # Use the resolved `elem_id` here; passing `id_of_seq_element` directly
    # would put "None" in the name whenever no identifier was supplied.
    return "{prefix}_{elem_id}_{seq}.tsv".format(
        prefix=output_file_name_prefix,elem_id=elem_id,
        seq=id_of_seq_scanned)
def generate_output_file_name(file_name):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. The generated name is the original file name with the
    module-level `suffix_for_saving` inserted just ahead of the file
    extension. If the original name has no extension, ".pkl" is used.

    Specific example
    =================
    (assuming `suffix_for_saving` is set to "_n_XUTs")
    Calling function with
        ("mito_transcripts_dataframeplusSUTs.pkl")
    returns
        "mito_transcripts_dataframeplusSUTs_n_XUTs.pkl"
    '''
    main_part_of_name, file_extension = os.path.splitext(
        file_name) #from http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    # Decide based on what `splitext` found rather than on a '.' anywhere in
    # the name: a dot in a directory part (e.g. "run.v2/frame") or a leading
    # dot is not an extension, and such names still need the ".pkl" default.
    if not file_extension:
        file_extension = ".pkl"
    return main_part_of_name + suffix_for_saving + file_extension
def generate_output_file_name(file_name):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file, making sure the name carries a file extension. A name that
    already has an extension is returned unchanged; otherwise ".pkl" is
    appended.

    Specific examples
    =================
    Calling function with
        ("test.pkl")
    returns
        "test.pkl"

    Calling function with
        ("test")
    returns
        "test.pkl"
    '''
    # Extension detection via `os.path.splitext`, from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    extension = os.path.splitext(file_name)[1]
    # With no extension, the "main part" from splitext is the whole name.
    return file_name if extension else file_name + ".pkl"
# FASTA READING
# Read sequence from FASTA file using Biopython
# Read sequence, treating source as a filepath.
# The `with` block follows the "Sequence Input" section of
# http://biopython.org/wiki/SeqIO ; the rest of the backbone is based on
# https://www.biostars.org/p/209383/, adjusted because `rU` mode is deprecated.
# Each record overwrites `fasta_seq`, so after the loop it holds the sequence
# of the last record in the file.
with open(source, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        fasta_seq = record.seq
# Reading a single sequence from URL using Biopython
from Bio import SeqIO
# URL of the FASTA file that the example call after the function fetches.
source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
def get_seq_from_URL(url):
    '''
    Takes a URL pointing at FASTA-formatted text holding a single record and
    returns the sequence of that record (the `.seq` of the record parsed by
    Biopython's `SeqIO.read`).

    The download is decoded as UTF-8 before parsing.
    '''
    from contextlib import closing
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` makes sure the connection is released once the text has
    # been read, and works for both the Python 2 and Python 3 response objects.
    with closing(urlopen(url)) as html:
        fasta_text = html.read().decode(encoding='UTF-8')
    # Use of SeqIO.read here based on
    # http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html ,
    # "This style is useful when you expect a single record only (and would
    # consider multiple records an error). For example, when dealing with
    # GenBank files for bacterial genomes or chromosomes, there is normally
    # only a single record. Alternatively, use this with a handle when
    # downloading a single record from the internet."
    # I found I needed `StringIO()` or got issues with trying to handle long
    # file name.
    record = SeqIO.read(StringIO(fasta_text), "fasta")
    return record.seq
# Example use: fetch the sequence at `source` and report its length.
t = get_seq_from_URL(source)
print (len(t))
# Reading potentially multiple sequences from URL but only taking first one using Biopython
from Bio import SeqIO
# URL of the FASTA file that the example call after the function fetches.
source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
def get_seq_from_URL(url):
    '''
    Takes a URL pointing at FASTA-formatted text and returns the sequence of
    the first record in it (the `.seq` of the first record yielded by
    Biopython's `SeqIO.parse`). Any further records are ignored.

    The download is decoded as UTF-8 before parsing. If the text holds no
    records at all, the `next()` call raises `StopIteration`.
    '''
    from contextlib import closing
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` makes sure the connection is released once the text has
    # been read, and works for both the Python 2 and Python 3 response objects.
    with closing(urlopen(url)) as html:
        fasta_text = html.read().decode(encoding='UTF-8')
    # I found I needed `StringIO()` or got issues with trying to handle long
    # file name.
    fasta_iterator = SeqIO.parse(StringIO(fasta_text), "fasta")
    # Use of `next()` on next line to get first FASTA-formatted sequence is
    # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
    # I think difference from `SeqIO.read()` in this approach is that it won't
    # give an error if more than one entry is in the html.
    record = next(fasta_iterator)
    return record.seq
# Example use: fetch the sequence at `source` and report its length.
t = get_seq_from_URL(source)
print (len(t))
# Reading a single sequence from URL not using Biopython
# NOTE(review): the function that follows does not reference `SeqIO`, so this
# import looks unneeded for the Biopython-free variant -- confirm before removing.
from Bio import SeqIO
# URL of the FASTA file that the example call after the function fetches.
source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
def get_seq_from_URL(url):
    '''
    Takes a URL, downloads the content, decodes it as UTF-8 and prints the
    resulting text as-is. No FASTA parsing is done here (the text still
    "needs more parsing" to extract the sequence).

    Function returns None.
    '''
    from contextlib import closing
    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` makes sure the connection is released once the text has
    # been read, and works for both the Python 2 and Python 3 response objects.
    with closing(urlopen(url)) as html:
        needs_more_parsing = html.read().decode(encoding='UTF-8')
    print(needs_more_parsing)
# Example use: download the text at `source` and print it.
get_seq_from_URL(source)
#make and save tab-separated values / tab-delimited TSV/ .tab output file
def make_and_save_karyotype(chromosomes_and_length, species_code):
    '''
    Takes a dictionary of chromosome identifiers and length, plus a species
    code, and writes a karyotype file with one tab-separated line per
    chromosome to the path held in the module-level `output_file_name`.

    Result will look like this at start of output file:
    chr - Sc-chrIV chrIV 0 1531933 black
    chr - Sc-chrXV chrXV 0 1091291 black
    ...

    Lines are separated by newlines with no newline after the last one. A
    note about the saved file is written to stderr when done.

    Function returns None.
    '''
    with open(output_file_name, 'w') as karyotype_file:
        # Joining with "\n" puts a line break between entries while leaving
        # the final line without a trailing newline.
        karyotype_file.write("\n".join(
            "chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
            "\t{length}\tblack".format(
                species_code=species_code,chrom=name, length=size)
            for name, size in chromosomes_and_length.items()))
    chromosome_count = len(chromosomes_and_length)
    sys.stderr.write( "\n\nThe karyotype file for {} chromosomes has been saved "
        "as a file named"
        " '{}'.".format(chromosome_count,output_file_name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment