fomightez · April 21, 2020 02:51
diff --git a/my_python_functions.py b/my_python_functions.py




 # generate output file name
 def generate_output_file_name(id_of_seq_element,id_of_seq_scanned):
    '''
    Takes an identifier for the sequence element and an identifier for the
    sequence scanned, and returns a string for the name of the output file.
    The generated name is based on a prefix that can be adjusted under the
    section ' USER ADJUSTABLE VALUES ', plus the provided text in the
    function call.
    If there is no `id_of_seq_element` specified (it is `None` or otherwise
    empty), the sequence element itself will be used, limited to the first
    number of bases specified in `limit_of_name`.

    Note: `element`, `limit_of_name`, and `output_file_name_prefix` are read
    from the enclosing (module) scope; they are not defined in this function.

    Specific examples (assuming the prefix is set to "seq_account")
    =================
    Calling function with
        ("elem1","chrmt")
    returns
        "seq_account_elem1_chrmt.tsv"

    Calling function with
        (None,"chrmt")
    returns
        "seq_account_GAATTC_chrmt.tsv"
    if `GAATTC` happened to be the provided sequence element.
    '''
    if id_of_seq_element:
        elem_id = id_of_seq_element
    else:
        # Slicing past the end is harmless, so this covers both the case
        # where the element is longer than the limit and where it is not.
        elem_id = element[:limit_of_name]
    # Use the resolved `elem_id` here, not `id_of_seq_element`; otherwise the
    # fallback above is ignored and "None" ends up in the file name.
    return "{prefix}_{elem_id}_{seq}.tsv".format(
        prefix=output_file_name_prefix,elem_id=elem_id,
        seq=id_of_seq_scanned)
  
  
  def generate_output_file_name(file_name):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. The generated name is based on the original file
    name, with the text in `suffix_for_saving` inserted just before the file
    extension. If the file name has no extension, the suffix is appended and
    ".pkl" is used as the extension.

    Note: `suffix_for_saving` is read from the enclosing (module) scope; it is
    not defined in this function.

    Specific example (assuming `suffix_for_saving` is "_n_XUTs")
    =================
    Calling function with
        ("mito_transcripts_dataframeplusSUTs.pkl")
    returns
        "mito_transcripts_dataframeplusSUTs_n_XUTs.pkl"
    '''
    main_part_of_name, file_extension = os.path.splitext(
        file_name) #from http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    # Test the extension that `splitext` found rather than looking for any
    # '.' in the name: a dot in a directory part (e.g. "run.1/data") or a
    # leading dot (e.g. ".data") is not an extension, and such names should
    # still get ".pkl" added.
    if file_extension:
        return main_part_of_name + suffix_for_saving + file_extension
    return file_name + suffix_for_saving + ".pkl"
  
  
  def generate_output_file_name(file_name):
    '''
    Takes a file name as an argument and returns string for the name of the
    output file. A name that already carries an extension is returned
    unchanged; a name without one gets ".pkl" added.

    Specific examples
    =================
    Calling function with
        ("test.pkl")
    returns
        "test.pkl"

    Calling function with
        ("test")
    returns
        "test.pkl"
    '''
    # Splitting approach from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    stem, extension = os.path.splitext(file_name)
    # An empty extension string means nothing was split off the name.
    return stem + ".pkl" if not extension else file_name
    
    
    


    
    
    
 # FASTA READING    
 # Read sequence from FASTA file using Biopython
 # Read sequence, treating source as a filepath.
 # Use of `with` on next line based on http://biopython.org/wiki/SeqIO , 
 # under "Sequence Input". Otherwise, backbone based on 
 # https://www.biostars.org/p/209383/, and the fact that `rU` mode is deprecated.
 # Open the file at the path held in `source` and parse it as FASTA.
 # Each pass through the loop overwrites `fasta_seq`, so after the loop it
 # holds the sequence of the LAST record in the file. If the file contains no
 # records, `fasta_seq` is never assigned here.
 with open(source, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # print(record.seq) # for debugging
        fasta_seq = record.seq



 # Reading a single sequence from URL using Biopython
 from Bio import SeqIO

 source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
 def get_seq_from_URL(url):
    '''
    Takes a URL that points to FASTA-formatted text holding a single record
    and returns the sequence of that record.

    The text is downloaded, decoded as UTF-8, and parsed with Biopython's
    `SeqIO.read()`. That style is meant for when a single record only is
    expected; per the Biopython documentation it raises `ValueError` if there
    are no records or more than one.

    Returns the `.seq` attribute of the parsed record.
    '''
    from contextlib import closing
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` makes sure the connection is closed once the text is read;
    # it is used because Python 2's response object cannot be used directly
    # in a `with` statement.
    with closing(urlopen(url)) as response:
        fasta_text = response.read().decode(encoding='UTF-8')
    # Use of SeqIO.read here based on
    # http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html ,
    # "Alternatively, use this with a handle when downloading a single record
    # from the internet."
    # I found I needed `StringIO()` or got issues with trying to handle long file name.
    record = SeqIO.read(StringIO(fasta_text), "fasta")
    return record.seq

 t = get_seq_from_URL(source)
 print (len(t))


 # Reading potentially multiple sequences from URL but only taking first one using Biopython
 from Bio import SeqIO

 source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
 def get_seq_from_URL(url):
    '''
    Takes a URL that points to FASTA-formatted text and returns the sequence
    of the FIRST record found there.

    The text is downloaded, decoded as UTF-8, and parsed with Biopython's
    `SeqIO.parse()`. Only the first record is taken; any further records are
    ignored. If there are no records at all, the `next()` call raises
    `StopIteration`.

    Returns the `.seq` attribute of the first record.
    '''
    from contextlib import closing
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` makes sure the connection is closed once the text is read;
    # it is used because Python 2's response object cannot be used directly
    # in a `with` statement.
    with closing(urlopen(url)) as response:
        fasta_text = response.read().decode(encoding='UTF-8')
    # I found I needed `StringIO()` or got issues with trying to handle long file name.
    fasta_iterator = SeqIO.parse(StringIO(fasta_text), "fasta")
    # Use of `next()` on next line to get first FASTA-formatted sequence is
    # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
    # I think difference from `SeqIO.read()` in this approach is that it won't
    # give an error if more than one entry is in the html.
    record = next(fasta_iterator)
    return record.seq

 t = get_seq_from_URL(source)
 print (len(t))


 # Reading a single sequence from URL not using Biopython
 from Bio import SeqIO

 source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
 def get_seq_from_URL(url):
    '''
    Takes a URL, downloads what it points to, and prints the text decoded as
    UTF-8.

    No FASTA parsing is done here; the text is printed exactly as downloaded
    (description line included), so it still needs more parsing to get at the
    sequence itself.

    Function returns None.
    '''
    from contextlib import closing
    # Getting html originally for just Python 3, adapted from
    # https://stackoverflow.com/a/17510727/8508004 and then updated to
    # handle Python 2 and 3 according to same link.
    try:
        # For Python 3.0 and later
        from urllib.request import urlopen
    except ImportError:
        # Fall back to Python 2's urllib2
        from urllib2 import urlopen
    # `closing()` makes sure the connection is closed once the text is read;
    # it is used because Python 2's response object cannot be used directly
    # in a `with` statement.
    with closing(urlopen(url)) as response:
        needs_more_parsing = response.read().decode(encoding='UTF-8')
    print(needs_more_parsing)

 get_seq_from_URL(source)




 #make and save tab-separated values / tab-delimited TSV/ .tab output file
 def make_and_save_karyotype(chromosomes_and_length, species_code):
    '''
    Takes a dictionary of chromosome identifiers and length, plus a species
    code, and writes a karyotype file with one line per chromosome. The fields
    on each line are separated by tabs, and there is no newline after the
    final line.

    The file is saved under the name held in `output_file_name`, which is read
    from the enclosing (module) scope. A note about the save is sent to
    stderr afterwards.

    Result will look like this at start of output file:
    chr - Sc-chrIV chrIV 0 1531933 black
    chr - Sc-chrXV chrXV 0 1091291 black
    ...

    Function returns None.
    '''
    line_template = "chr\t-\t{species_code}-{chrom}\t{chrom}\t0\t{length}\tblack"
    with open(output_file_name, 'w') as output_file:
        karyotype_lines = [
            line_template.format(
                species_code=species_code, chrom=chrom, length=length)
            for chrom, length in chromosomes_and_length.items()]
        # Joining puts the newline only BETWEEN lines, so the last line is
        # left without a trailing newline character.
        output_file.write("\n".join(karyotype_lines))
    chromosome_count = len(chromosomes_and_length)
    sys.stderr.write(
        "\n\nThe karyotype file for {} chromosomes has been saved "
        "as a file named '{}'.".format(chromosome_count, output_file_name))




	# generate output file name
	def generate_output_file_name(id_of_seq_element,id_of_seq_scanned):
	    '''
	    Takes an identifier for the sequence element and an identifier for the
	    sequence scanned, and returns a string for the name of the output file.
	    The generated name is based on a prefix that can be adjusted under the
	    section ' USER ADJUSTABLE VALUES ', plus the provided text in the
	    function call.
	    If there is no `id_of_seq_element` specified (it is `None` or otherwise
	    empty), the sequence element itself will be used, limited to the first
	    number of bases specified in `limit_of_name`.

	    Note: `element`, `limit_of_name`, and `output_file_name_prefix` are read
	    from the enclosing (module) scope; they are not defined in this function.

	    Specific examples (assuming the prefix is set to "seq_account")
	    =================
	    Calling function with
	        ("elem1","chrmt")
	    returns
	        "seq_account_elem1_chrmt.tsv"

	    Calling function with
	        (None,"chrmt")
	    returns
	        "seq_account_GAATTC_chrmt.tsv"
	    if `GAATTC` happened to be the provided sequence element.
	    '''
	    if id_of_seq_element:
	        elem_id = id_of_seq_element
	    else:
	        # Slicing past the end is harmless, so this covers both the case
	        # where the element is longer than the limit and where it is not.
	        elem_id = element[:limit_of_name]
	    # Use the resolved `elem_id` here, not `id_of_seq_element`; otherwise the
	    # fallback above is ignored and "None" ends up in the file name.
	    return "{prefix}_{elem_id}_{seq}.tsv".format(
	        prefix=output_file_name_prefix,elem_id=elem_id,
	        seq=id_of_seq_scanned)


	def generate_output_file_name(file_name):
	    '''
	    Takes a file name as an argument and returns string for the name of the
	    output file. The generated name is based on the original file
	    name, with the text in `suffix_for_saving` inserted just before the file
	    extension. If the file name has no extension, the suffix is appended and
	    ".pkl" is used as the extension.

	    Note: `suffix_for_saving` is read from the enclosing (module) scope; it is
	    not defined in this function.

	    Specific example (assuming `suffix_for_saving` is "_n_XUTs")
	    =================
	    Calling function with
	        ("mito_transcripts_dataframeplusSUTs.pkl")
	    returns
	        "mito_transcripts_dataframeplusSUTs_n_XUTs.pkl"
	    '''
	    main_part_of_name, file_extension = os.path.splitext(
	        file_name) #from http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
	    # Test the extension that `splitext` found rather than looking for any
	    # '.' in the name: a dot in a directory part (e.g. "run.1/data") or a
	    # leading dot (e.g. ".data") is not an extension, and such names should
	    # still get ".pkl" added.
	    if file_extension:
	        return main_part_of_name + suffix_for_saving + file_extension
	    return file_name + suffix_for_saving + ".pkl"


	def generate_output_file_name(file_name):
	    '''
	    Takes a file name as an argument and returns string for the name of the
	    output file. A name that already carries an extension is returned
	    unchanged; a name without one gets ".pkl" added.

	    Specific examples
	    =================
	    Calling function with
	        ("test.pkl")
	    returns
	        "test.pkl"

	    Calling function with
	        ("test")
	    returns
	        "test.pkl"
	    '''
	    main_part_of_name, file_extension = os.path.splitext(
	        file_name) #from
	    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
	    if file_extension:
	        return file_name
	    else:
	        return main_part_of_name + ".pkl"








	# FASTA READING
	# Read sequence from FASTA file using Biopython
	# Read sequence, treating source as a filepath.
	# Use of `with` on next line based on http://biopython.org/wiki/SeqIO ,
	# under "Sequence Input". Otherwise, backbone based on
	# https://www.biostars.org/p/209383/, and the fact that `rU` mode is deprecated.
	# Open the file at the path held in `source` and parse it as FASTA.
	# Each pass through the loop overwrites `fasta_seq`, so after the loop it
	# holds the sequence of the LAST record in the file. If the file contains no
	# records, `fasta_seq` is never assigned here.
	with open(source, "r") as handle:
	    for record in SeqIO.parse(handle, "fasta"):
	        # print(record.seq) # for debugging
	        fasta_seq = record.seq



	# Reading a single sequence from URL using Biopython
	from Bio import SeqIO

	source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
	def get_seq_from_URL(url):
	    '''
	    Takes a URL that points to FASTA-formatted text holding a single record
	    and returns the sequence of that record.

	    The text is downloaded, decoded as UTF-8, and parsed with Biopython's
	    `SeqIO.read()`. That style is meant for when a single record only is
	    expected; per the Biopython documentation it raises `ValueError` if there
	    are no records or more than one.

	    Returns the `.seq` attribute of the parsed record.
	    '''
	    from contextlib import closing
	    try:
	        from StringIO import StringIO
	    except ImportError:
	        from io import StringIO

	    # Getting html originally for just Python 3, adapted from
	    # https://stackoverflow.com/a/17510727/8508004 and then updated to
	    # handle Python 2 and 3 according to same link.
	    try:
	        # For Python 3.0 and later
	        from urllib.request import urlopen
	    except ImportError:
	        # Fall back to Python 2's urllib2
	        from urllib2 import urlopen
	    # `closing()` makes sure the connection is closed once the text is read;
	    # it is used because Python 2's response object cannot be used directly
	    # in a `with` statement.
	    with closing(urlopen(url)) as response:
	        fasta_text = response.read().decode(encoding='UTF-8')
	    # Use of SeqIO.read here based on
	    # http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html ,
	    # "Alternatively, use this with a handle when downloading a single record
	    # from the internet."
	    # I found I needed `StringIO()` or got issues with trying to handle long file name.
	    record = SeqIO.read(StringIO(fasta_text), "fasta")
	    return record.seq

	t = get_seq_from_URL(source)
	print (len(t))


	# Reading potentially multiple sequences from URL but only taking first one using Biopython
	from Bio import SeqIO

	source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
	def get_seq_from_URL(url):
	    '''
	    Takes a URL that points to FASTA-formatted text and returns the sequence
	    of the FIRST record found there.

	    The text is downloaded, decoded as UTF-8, and parsed with Biopython's
	    `SeqIO.parse()`. Only the first record is taken; any further records are
	    ignored. If there are no records at all, the `next()` call raises
	    `StopIteration`.

	    Returns the `.seq` attribute of the first record.
	    '''
	    from contextlib import closing
	    try:
	        from StringIO import StringIO
	    except ImportError:
	        from io import StringIO

	    # Getting html originally for just Python 3, adapted from
	    # https://stackoverflow.com/a/17510727/8508004 and then updated to
	    # handle Python 2 and 3 according to same link.
	    try:
	        # For Python 3.0 and later
	        from urllib.request import urlopen
	    except ImportError:
	        # Fall back to Python 2's urllib2
	        from urllib2 import urlopen
	    # `closing()` makes sure the connection is closed once the text is read;
	    # it is used because Python 2's response object cannot be used directly
	    # in a `with` statement.
	    with closing(urlopen(url)) as response:
	        fasta_text = response.read().decode(encoding='UTF-8')
	    # I found I needed `StringIO()` or got issues with trying to handle long file name.
	    fasta_iterator = SeqIO.parse(StringIO(fasta_text), "fasta")
	    # Use of `next()` on next line to get first FASTA-formatted sequence is
	    # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
	    # I think difference from `SeqIO.read()` in this approach is that it won't
	    # give an error if more than one entry is in the html.
	    record = next(fasta_iterator)
	    return record.seq

	t = get_seq_from_URL(source)
	print (len(t))


	# Reading a single sequence from URL not using Biopython
	from Bio import SeqIO

	source = "https://downloads.yeastgenome.org/sequence/S288C_reference/chromosomes/fasta/chrmt.fsa"
	def get_seq_from_URL(url):
	    '''
	    Takes a URL, downloads what it points to, and prints the text decoded as
	    UTF-8.

	    No FASTA parsing is done here; the text is printed exactly as downloaded
	    (description line included), so it still needs more parsing to get at the
	    sequence itself.

	    Function returns None.
	    '''
	    from contextlib import closing
	    # Getting html originally for just Python 3, adapted from
	    # https://stackoverflow.com/a/17510727/8508004 and then updated to
	    # handle Python 2 and 3 according to same link.
	    try:
	        # For Python 3.0 and later
	        from urllib.request import urlopen
	    except ImportError:
	        # Fall back to Python 2's urllib2
	        from urllib2 import urlopen
	    # `closing()` makes sure the connection is closed once the text is read;
	    # it is used because Python 2's response object cannot be used directly
	    # in a `with` statement.
	    with closing(urlopen(url)) as response:
	        needs_more_parsing = response.read().decode(encoding='UTF-8')
	    print(needs_more_parsing)

	get_seq_from_URL(source)




	#make and save tab-separated values / tab-delimited TSV/ .tab output file
	def make_and_save_karyotype(chromosomes_and_length, species_code):
	    '''
	    Takes a dictionary of chromosome identifiers and length, plus a species
	    code, and writes a karyotype file with one line per chromosome. The fields
	    on each line are separated by tabs, and there is no newline after the
	    final line.

	    The file is saved under the name held in `output_file_name`, which is read
	    from the enclosing (module) scope. A note about the save is sent to
	    stderr afterwards.

	    Result will look like this at start of output file:
	    chr - Sc-chrIV chrIV 0 1531933 black
	    chr - Sc-chrXV chrXV 0 1091291 black
	    ...

	    Function returns None.
	    '''
	    line_template = "chr\t-\t{species_code}-{chrom}\t{chrom}\t0\t{length}\tblack"
	    # prepare output file for saving so it will be open and ready
	    with open(output_file_name, 'w') as output_file:
	        karyotype_lines = [
	            line_template.format(
	                species_code=species_code, chrom=chrom, length=length)
	            for chrom, length in chromosomes_and_length.items()]
	        # Joining puts the newline only BETWEEN lines, so the last line is
	        # left without a trailing newline character.
	        output_file.write("\n".join(karyotype_lines))
	    sys.stderr.write( "\n\nThe karyotype file for {} chromosomes has been saved "
	        "as a file named"
	        " '{}'.".format(len(chromosomes_and_length),output_file_name))