Useful snippets for Jupyter notebooks
# Use `%%capture` to hush 'noisy' stdout and stderr streams, but still get the `%%time` report afterwards
%%capture out_stream | |
%%time | |
---rest of a cell that does something with LOTS of output-- | |
#In cell after, put following to get time of completion from that: | |
#time it took to run cell above | |
for x in out_stream.stdout.split("\n")[-3:]: | |
print(x) | |
# Use `%%capture` to hush 'noisy' stdout and stderr streams, but still get certain output after | |
%%capture out_stream | |
---rest of a cell that does something with LOTS of output with anything to keep tagged with `#x#x#x#x#x` at start of every line -- | |
#In cell after, put following to get filtered output: | |
# output specifically tagged stderr lines captured from above cell | |
tag_used = "#x#x#x#x#x" | |
filtered_out = "" | |
for x in out_stream.stderr.split("\n"): | |
if x.startswith(tag_used): | |
filtered_out += x[len(tag_used):]+"\n" | |
# Feedback | |
sys.stderr.write("{}".format(filtered_out)) | |
# Use `%%capture` to capture stdout and stderr streams and send the output to a file
%%capture out_stream | |
---rest of a cell that does something with output-- | |
#In cell after, put following: | |
%store out_stream.stdout >output_from_cell.txt #based on https://stackoverflow.com/a/32731418/8508004 | |
# (In an answer to a Jupyter Discourse post, I added more background on using this and options for how | |
# you could add showing the captured text in the notebook, too. See | |
# https://discourse.jupyter.org/t/how-to-write-the-output-from-previous-cell-to-a-csv-file/10319/2?u=fomightez ) | |
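# If you also want to replay what was hushed, the captured object can do that; a minimal
# sketch (assumes the `out_stream` captured by `%%capture` above):
out_stream.show()          # replays the captured stdout/stderr (and rich output) in the notebook
#print(out_stream.stdout)  # or just print the captured stdout text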
# Use `with io.capture_output() as captured:` to suppress output from only what is in the `with` block | |
# This comes from down below on that same page as the `%%capture` cell magic (https://stackoverflow.com/a/52559560/8508004) | |
from IPython.utils import io | |
with io.capture_output() as captured: | |
MyFunction() | |
# Use `%store` and a multi-line (triple-quoted) string to write multi-line text to a file
s='''#!/bin/bash | |
pdb=$1 | |
for chain in $(grep "^ATOM" $pdb | cut -b 22 | sort -u) | |
do | |
sed -n "/^.\{21\}$chain/p" $pdb > ${pdb%.pdb}_$chain.pdb | |
done''' | |
%store s >split_into_chains.sh | |
# clean out directory of all but one file | |
from shlex import quote | |
pathname_of_file_to_keep = quote("notebooks/Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb") | |
name_of_file_to_keep = quote("Generating later Circos tutorial notebooks from extracted markdown via notedown and papermill.ipynb") | |
# based on Olivier Dulac's comment at https://unix.stackexchange.com/questions/153862/remove-all-files-directories-except-for-one-file | |
%cd .. | |
!cp $pathname_of_file_to_keep . | |
!rm -rf notebooks | |
!mkdir notebooks | |
!mv $name_of_file_to_keep notebooks/ | |
%cd notebooks | |
# Make a directory (folder) if it doesn't already exist
import os | |
directory_for_archive = "original_html" | |
if not os.path.isdir(directory_for_archive): | |
!mkdir {directory_for_archive} | |
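# A minimal alternative sketch for the same thing without shelling out: `os.makedirs()`
# with `exist_ok=True` (standard library only; the directory name is just the one from above)
import os
directory_for_archive = "original_html"
os.makedirs(directory_for_archive, exist_ok=True)  # no error if it already exists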
# Get a file if not yet retrieved / check if file exists | |
import os | |
file_needed = "get_seq_from_multiFASTA_with_match_in_description.py" | |
if not os.path.isfile(file_needed): | |
#!curl -OL https://raw.githubusercontent.com/fomightez/sequencework/master/Extract_from_FASTA/{file_needed} | |
os.system(f"curl -OL https://raw.githubusercontent.com/fomightez/sequencework/master/Extract_from_FASTA/{file_needed}") | |
# Get a list of files if not yet retrieved, checking if file exists already first | |
import os | |
files_needed = ["hhsearch_q9bsu1_uniclust_w_ss_pfamA_30.hhr", | |
"2uvo_hhblits.hhr", | |
"2uvo_hhsearch.hhr", | |
"hhpred_9590198.hhr"] | |
url_prefix = "https://raw.githubusercontent.com/biopython/biopython/master/Tests/HHsuite/" | |
for file_needed in files_needed: | |
if not os.path.isfile(file_needed): | |
!curl -OL {url_prefix+file_needed} | |
# Check if list of file already uploaded and if not, prompt for them | |
import os | |
import sys | |
files_needed = ["genome_1.fa","genome_2.fa","genome_3.fa","genome_4.fa",] | |
files_still_needed = [] | |
for fn in files_needed: | |
if not os.path.isfile(fn): | |
files_still_needed.append(fn) | |
if files_still_needed: | |
sys.stderr.write("\nThe following sequences still need uploading:\n - {}\n".format("\n - ".join(files_still_needed))) | |
sys.exit(1) | |
else: | |
sys.stderr.write("\nSequences needed all present.") | |
# Check for a file that comes inside an archive; if the file isn't there but the archive is, unpack
# the archive, and if the archive isn't found either, ask for it. Particularly useful in Binder sessions
# to make sure needed files are around and ready to run cells. HALTS NOTEBOOK CELL PROCESSING IF NOT.
# first check `an_archive_example.tar.gz` uploaded if it wasn't already extracted | |
import os | |
unpacked_example = os.path.join("directory_containing_file_when_unpacked","your_alignment_file.clustal") | |
file_needed = "an_archive_example.tar.gz" | |
import sys | |
if os.path.isfile(unpacked_example): | |
sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed)) | |
elif os.path.isfile(file_needed): | |
!tar xzf {file_needed} | |
else: | |
sys.stderr.write("\n\n*****************ERROR**************************\n" | |
"The file '{0}' is needed.\n" | |
"Upload '{0}' to this Jupyter session and re-run this cell.\n" | |
"*****************ERROR**************************\n".format(file_needed)) | |
sys.exit(1) | |
# Ask for an archive and unpack and extract enclosed dataframe | |
file_required = "collected_candidate_21S-containing_seqs.tar.gz" | |
dataframe_to_read = "extracted_21S-containing_seq_info_df.pkl" | |
import os | |
import sys | |
import pandas as pd | |
if os.path.isfile(file_required): | |
!tar -xzf {file_required} | |
mitolsu_frag_df = pd.read_pickle(dataframe_to_read) | |
sys.stderr.write("\nFile with sequences ('{}') observed and" | |
" unpacked.".format(file_required)) | |
sys.stderr.write("\nDataframe '{}' read in" | |
".".format(dataframe_to_read)) | |
else: | |
sys.stderr.write("\nFile with sequences '{}' not seen and so nothing done" | |
". Seems wrong.".format(file_required)) | |
sys.exit(1) | |
# for when the archive being checked for contains a dataframe to bring into memory in the notebook:
import os | |
file_needed = "an_archive_example.tar.gz" | |
unpacked_goal = "info_df.pkl" | |
import sys | |
import pandas as pd | |
if os.path.isfile(unpacked_goal): | |
sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed)) | |
#bring the details in | |
try: | |
len(previous_details_df) > 2 | |
except NameError as e: | |
previous_details_df = pd.read_pickle(unpacked_goal) | |
sys.stderr.write("\nData in '{}' read in.\n".format(unpacked_goal)) | |
elif os.path.isfile(file_needed): | |
!tar xzf {file_needed} | |
previous_details_df = pd.read_pickle(unpacked_goal) | |
sys.stderr.write("\nData in '{}' read in.\n".format(unpacked_goal)) | |
else: | |
sys.stderr.write("\n\n*****************ERROR**************************\n" | |
"The file '{0}' is needed.\n" | |
"Upload '{0}' to this Jupyter session and re-run this cell.\n" | |
"*****************ERROR**************************\n".format(file_needed)) | |
sys.exit(1) | |
# for when the archive being checked for contains several dataframes and a list to bring into memory in the notebook (SEE JUST BELOW FOR A MORE GENERAL VERSION WITH ONLY DATAFRAMES):
import os | |
file_needed = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz" | |
unpacked_goal = "disruptor_hit_num_tallies_by_id_df.pkl" | |
df_n_fnstr_dict = { | |
"largest_disr_num_by_id_df": "largest_disr_num_by_id_df", | |
"mito_promoter_matches_df": "df", | |
"mito_promoter_hit_num_tallies_by_id_df": "largest_hit_num_by_id_df", | |
"disruptor_matches_df": "disrupt_df", | |
"disruptor_hit_num_tallies_by_id_df": "largest_disr_num_by_id_df", | |
"grich_matches_df": "grich_df", | |
"grich_hit_num_tallies_by_id_df": "largest_grich_num_by_id_df", | |
"endgrich_matches_df": "end_grich_df", | |
"endgrich_hit_num_tallies_by_id_df": "largest_endgrich_num_by_id_df", | |
"twenty_nineATrich_seq_matches_df": "twenty_nine_df", | |
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": "largest_ATrich_num_by_id_df", | |
} | |
def read_in_data(df_n_fnstr_dict): | |
#df_fns = ["{}.pkl".format(x) for x in df_n_fnstr_dict.keys()] | |
df_n_fnstr_dict = {"{}.pkl".format(k):v for k,v in df_n_fnstr_dict.items()} | |
g = globals() #based on `how to use a string to make a python variable.md` | |
for k,v in df_n_fnstr_dict.items(): | |
g[v] = pd.read_pickle(k) | |
sys.stderr.write("\nData in '{}' read in; produced `{}`.".format(k,v)) | |
import json | |
with open('genomes_list.json', 'r') as f: | |
g["genomes"] = json.load(f) | |
sys.stderr.write("\nGenomes list read back in as `genomes`.") | |
import sys | |
import pandas as pd | |
if os.path.isfile(unpacked_goal): | |
sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed)) | |
#bring the data into memory, if it isn't already | |
try: | |
len(globals()[list(df_n_fnstr_dict.items())[0][1]]) > 2 | |
except (NameError,KeyError) as e: | |
read_in_data(df_n_fnstr_dict) | |
elif os.path.isfile(file_needed): | |
!tar xzf {file_needed} | |
read_in_data(df_n_fnstr_dict) | |
else: | |
sys.stderr.write("\n\n*****************ERROR**************************\n" | |
"The file '{0}' is needed.\n" | |
"Upload '{0}' to this Jupyter session and re-run this cell.\n" | |
"*****************ERROR**************************\n".format(file_needed)) | |
sys.exit(1) | |
## MORE GENERAL VERSION OF THAT LAST ONE THAT DOESN'T INCLUDE ANY LIST TO READ IN | |
import os | |
file_needed = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz" | |
unpacked_goal = "disruptor_hit_num_tallies_by_id_df.pkl" | |
df_n_fnstr_dict = { | |
"largest_disr_num_by_id_df": "largest_disr_num_by_id_df", | |
"mito_promoter_matches_df": "df", | |
"mito_promoter_hit_num_tallies_by_id_df": "largest_hit_num_by_id_df", | |
"disruptor_matches_df": "disrupt_df", | |
"disruptor_hit_num_tallies_by_id_df": "largest_disr_num_by_id_df", | |
"grich_matches_df": "grich_df", | |
"grich_hit_num_tallies_by_id_df": "largest_grich_num_by_id_df", | |
"endgrich_matches_df": "end_grich_df", | |
"endgrich_hit_num_tallies_by_id_df": "largest_endgrich_num_by_id_df", | |
"twenty_nineATrich_seq_matches_df": "twenty_nine_df", | |
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": "largest_ATrich_num_by_id_df", | |
} | |
def read_in_pickles(df_n_fnstr_dict): | |
#df_fns = ["{}.pkl".format(x) for x in df_n_fnstr_dict.keys()] | |
df_n_fnstr_dict = {"{}.pkl".format(k):v for k,v in df_n_fnstr_dict.items()} | |
g = globals() #based on `how to use a string to make a python variable.md` | |
for k,v in df_n_fnstr_dict.items(): | |
g[v] = pd.read_pickle(k) | |
sys.stderr.write("\nData in '{}' read in; produced `{}`.".format(k,v)) | |
import sys | |
import pandas as pd | |
if os.path.isfile(unpacked_goal): | |
sys.stderr.write("\nAppears '{}' has already been unpacked.\n".format(file_needed)) | |
#bring the data into memory, if it isn't already | |
try: | |
len(globals()[list(df_n_fnstr_dict.items())[0][1]]) > 2 | |
except (NameError,KeyError) as e: | |
read_in_pickles(df_n_fnstr_dict) | |
elif os.path.isfile(file_needed): | |
!tar xzf {file_needed} | |
read_in_pickles(df_n_fnstr_dict) | |
else: | |
sys.stderr.write("\n\n*****************ERROR**************************\n" | |
"The file '{0}' is needed.\n" | |
"Upload '{0}' to this Jupyter session and re-run this cell.\n" | |
"*****************ERROR**************************\n".format(file_needed)) | |
sys.exit(1) | |
# check single file uploaded | |
file_required = "collected_seqs.tar.gz" # usually in another cell | |
import os | |
import sys | |
try: | |
os.path.isfile(file_required) | |
except NameError: | |
file_required = "collected_seqs.tar.gz" | |
if os.path.isfile(file_required): | |
!tar -xzf collected_seqs.tar.gz | |
!mv collected_seqs/* . | |
!rm -rf collected_seqs
sys.stderr.write("\nFile with sequences ('{}') observed and" | |
" unpacked.".format(file_required)) | |
else: | |
sys.stderr.write("\nFile with sequences '{}' not seen and so nothing done" | |
". Seems wrong.".format(file_required)) | |
sys.exit(1) | |
# Check single file uploaded with check on size | |
file_required = ""0_332yeast_genomesFROMshenETal2018.zip" # usually in another cell | |
size_expected = 2.902e+09 # in bytes # usually in another cell | |
# Upload the file prior to running this cell | |
import os | |
import sys | |
try: | |
os.path.isfile(file_required) | |
except NameError: | |
file_required = "0_332yeast_genomesFROMshenETal2018.zip" | |
if os.path.isfile(file_required): | |
# make sure it is as large as it should be since it takes so long to upload
f_size = os.path.getsize(file_required) # based on https://stackoverflow.com/a/2104083/8508004 | |
if f_size >= size_expected: | |
!mkdir genomes | |
!unzip -q 0_332yeast_genomesFROMshenETal2018.zip | |
!unzip -q 0_332yeast_genomes/332_genome_assemblies.zip | |
!mv *.fas genomes/. | |
sys.stderr.write("\nGenomes archive ('{}') observed and" | |
" unpacked.".format(file_required)) | |
else: | |
sys.stderr.write("\nGenomes archive ('{}') observed but is not" | |
" fully uploaded\nWait and run this cell again.".format(file_required)) | |
else: | |
sys.stderr.write("\nGenomes archive '{}' not seen and so nothing done" | |
". Seems wrong.".format(file_required)) | |
sys.exit(1) | |
#someone else's take on some of these concepts is in post at https://twitter.com/radekosmulski/status/1129116929589940232 | |
# check multiple files uploaded | |
import os | |
import sys | |
import pandas as pd | |
try: | |
type(files_required) | |
except NameError: | |
print("Setting `files_required`") | |
files_required = ["PB_n_1011_collection_df.pkl","other_all_stretchesN_df.pkl"] | |
for file_required in files_required: | |
if os.path.isfile(file_required): | |
if file_required == files_required[0]: | |
all_df = pd.read_pickle(file_required) | |
else: | |
other_df = pd.read_pickle(file_required) | |
sys.stderr.write("\nFile '{}' observed and" | |
" unpickled.".format(file_required)) | |
else: | |
sys.stderr.write("\nFile'{}' not seen and so nothing done" | |
".\nSeems wrong!??!\n\n".format(file_required)) | |
sys.exit(1) | |
#Check if a large remote archive has already been retrieved and unpacked. If not,
# take care of whatever is left to do to use the result. (For example, if the directory was
# set up via Cyverse to already have the archive, there is no need to retrieve it now,
# but it still needs unpacking.)
import os | |
import sys | |
archive_fn = "1011Assemblies.tar.gz" | |
archive_url = "http://1002genomes.u-strasbg.fr/files/1011Assemblies.tar.gz" | |
num_files_in_archive = 1011 | |
genomes_dir = 'GENOMES_ASSEMBLED' | |
expected_unpacked_fn = genomes_dir+"/"+"YBV.re.fa" | |
def unpack_and_delete_lrg_archive(archive_fn): | |
!tar -xzf {archive_fn} | |
if len(os.listdir(genomes_dir)) >= num_files_in_archive: | |
!rm {archive_fn} | |
sys.stderr.write("\nFile with genomes ('{}') observed and" | |
" unpacked.".format(archive_fn)) | |
if os.path.isfile(expected_unpacked_fn): | |
sys.stderr.write("\n**Nothing Done. Genomes from '{}' already obtained &" | |
" unpacked.**".format(archive_fn)) | |
else: | |
if os.path.isfile( | |
archive_fn) and not os.path.isfile(expected_unpacked_fn): | |
unpack_and_delete_lrg_archive(archive_fn) | |
if not os.path.isfile( | |
archive_fn) and not os.path.isfile(expected_unpacked_fn): | |
sys.stderr.write("\nGenome sequences not seen, and so obtaining" | |
" '{}'".format(archive_fn)) | |
#!curl -O {archive_url} | |
os.system(f"curl -O {archive_url}") | |
unpack_and_delete_lrg_archive(archive_fn) | |
else: | |
sys.stderr.write("\nSomething seems wrong.") | |
sys.exit(1) | |
# Pickle files function
import pickle
def pickle_dict(d,file_name):
with open(file_name, "wb") as f: | |
pickle.dump(d, f) | |
pickle_dict(di_dict, "di_dict.pkl") | |
#Unpickle the files with a function if a pickled version is present
import os
import pickle
def unpickle_dict(file_name):
with open(file_name, "rb") as f: | |
return pickle.load(f) | |
if os.path.exists(main_pickled_dict_file_to_check_for): | |
di_dict = unpickle_dict(main_pickled_dict_file_to_check_for) | |
print(f"Loaded data from {main_pickled_dict_file_to_check_for}.") | |
# Manage files with `fnmatch` (see just above about whether file uploaded, too) | |
# Basic fnmatch use | |
import fnmatch | |
for file in os.listdir(genomes_dir): | |
if fnmatch.fnmatch(file, '*.re.fa'): | |
!perl patmatch_1.2/unjustify_fasta.pl {genomes_dir}/{file} | |
#os.remove(os.path.join(genomes_dir, file)) #left over from development | |
output = !perl patmatch_1.2/patmatch.pl -c {promoter_pattern} {genomes_dir}/{file}.prepared | |
os.remove(os.path.join(genomes_dir, file+".prepared")) #delete file made for PatMatch | |
df = patmatch_results_to_df(output.n, pattern=promoter_pattern, name="promoter") | |
# more fnmatch basic use | |
tag_to_add ="1G03" | |
import os | |
import sys | |
import fnmatch | |
model_pattern = "model_*.pdb" | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, model_pattern): | |
os.rename(file, tag_to_add + file) | |
# fnmatch use combined with checking if a related file exists yet
# categorize those annotated already and those missed | |
import os | |
import sys | |
import fnmatch | |
extension_to_check = ".fa" | |
extension_to_see_if_exists = ".new" | |
num_checked = 0 | |
not_annotated = [] | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+extension_to_check): | |
num_checked += 1 | |
#print (file) | |
first_part_filen = file.rsplit(extension_to_check,1)[0] | |
# check if corresponding `.new` file exists | |
annotated_file = file+extension_to_see_if_exists | |
#print(annotated_file) | |
if os.path.isfile(annotated_file): | |
pass | |
else: | |
not_annotated.append(file) | |
print ("No {} file?".format(annotated_file)) | |
# Feedback | |
sys.stderr.write("{} sequences files checked; {} lack corresponding, " | |
"\nannotated `.new` files.".format(num_checked,len(not_annotated))) | |
sys.stderr.write("\nThe variable `not_annotated` lists the sequences missing annotated files.") | |
#fnmatch to make a list of files and then do something with related files (see below for how to use `glob.glob()` if you just need a list and
# aren't doing something with the names of the files as they are encountered). (I added an example covering both needs, iterating on many
# files and renaming them, at https://discourse.jupyter.org/t/rename-files-using-a-for/17144/2?u=fomightez .) (An example using glob or fnmatch to get base file names is at https://www.biostars.org/p/9539595/#9548023 ; in relation to base names, keep in mind `.stem` from Path, see https://stackoverflow.com/a/47496703/8508004 .)
import os | |
import sys | |
import fnmatch | |
extension_to_handle = ".gff3" | |
name_part_to_match = "mito.gff3" | |
associated_mito_noms= [] | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+name_part_to_match): | |
#print (file) | |
first_part_filen = file.rsplit(extension_to_handle,1)[0] | |
associated_mito_noms.append(first_part_filen) | |
# Now delete any files that end in `mito.fa` that are not in the list of the annotation files | |
extension_to_handle = ".fa" | |
name_part_to_match = "_mito.fa" | |
removed = 0 | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+name_part_to_match): | |
first_part_filen = file.rsplit(extension_to_handle,1)[0] | |
if (first_part_filen) not in associated_mito_noms: | |
os.remove(file) | |
removed += 1 | |
sys.stderr.write("\n{} files ending in `{}` removed" | |
".".format(removed,name_part_to_match)) | |
# use fnmatch and glob in a notebook to iterate on all `.py` Python script files in a directory and run them, even subsequent ones made by the
# scripts 'dynamically' in the course of running (was to answer a StackOverflow question, see https://stackoverflow.com/a/75087369/8508004 )
import os | |
import fnmatch | |
import glob | |
executed_scripts = [] | |
extension_to_match = ".py" | |
def execute_script(s): | |
%run {s} | |
while set(executed_scripts) != set(glob.glob(f"*{extension_to_match}")): | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+extension_to_match): | |
if file not in executed_scripts: | |
execute_script(file) | |
executed_scripts.append(file) | |
# glob use to just get list of file pathnames like part of what is done just above: | |
import glob | |
name_part_to_match = "mito.gff3" | |
associated_files = glob.glob(f"*{name_part_to_match}")
# recursive search for CSV files in the current directory OR subdirectories
csv_files = glob.glob("**/*.csv", recursive=True) | |
# I didn't find the explanation of `**` in the Python documentation (https://docs.python.org/3/library/glob.html) very clear until I read
# https://www.geeksforgeeks.org/how-to-use-glob-function-to-find-files-recursively-in-python/ ,
# but for matching files with an extension in the current directory or subdirectories, the example
# code in the documentation is more concise and results in paths that are easier to read and use.
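# A hedged pathlib-based sketch of the same recursive search (stdlib only; `Path.rglob('*.csv')`
# behaves like `glob.glob("**/*.csv", recursive=True)` here):
from pathlib import Path
csv_paths = [str(p) for p in Path('.').rglob('*.csv')]  # all CSVs in this directory tree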
# fnmatch use example with checking for a file it will produce and then only running if that isn't there. It will unpack
# a multi-entry FASTA file into a single file for each entry and rename them all to end in `.mito.fa`,
# leaving any original FASTA file already there ending in ".fa" alone.
import os | |
import sys | |
import fnmatch | |
example_produced_file = "NCYC3594.mito.fa" | |
if not os.path.isfile(example_produced_file): #so won't run again if already ran | |
name_part_to_match = ".fa" | |
name_part_to_expand_to = ".mito.fa" | |
old_files_with_ext = [] | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+name_part_to_match): | |
old_files_with_ext.append(file) | |
files_to_not_touch_despite_match = old_files_with_ext | |
seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa" | |
!faidx --split-files {seq_file} | |
new_fasta = [] | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+name_part_to_match) and file not in files_to_not_touch_despite_match: | |
new_fasta.append(file) | |
#fix name if it needs fixing | |
for file in new_fasta: | |
if not fnmatch.fnmatch(file, '*'+name_part_to_expand_to): | |
new_file_name = file.split(".fa")[0] + name_part_to_expand_to | |
!mv {file} {new_file_name} | |
# fnmatch use Example with reading and modifying the current matched file | |
import fnmatch | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*mt.fsa'): | |
print (file) | |
# !blastn -query {file} -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -out {file[:-9]}x.chrmt.comp.txt | |
blast_result = !blastn -query {file} -db S288c.mt.genome.fa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" | |
blast_df = blast_to_df(blast_result.n, pickle_df=False) | |
#... | |
print(start_loc) | |
with open(file) as handle: | |
mito_seq = SeqIO.read(handle, "fasta") | |
# fix, based on where it says "i.e. shift the starting point on this plasmid," @ | |
#http://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html | |
left = mito_seq[:start_loc] # use one less than what matches '1' in | |
# those cases because of zero indexing in Python; gets handled by that | |
# subtraction above where `start_loc` is defined
right = mito_seq[start_loc:] | |
adj_mito_seq = right + left | |
# write result after fix | |
sys.stderr.write("\n\nFile with adjusted 'start' saved as " | |
"'{}'.".format(generate_new_name(file),)) | |
SeqIO.write( | |
adj_mito_seq, generate_new_name(file), "fasta"); | |
# Use fnmatch but skip files whose extension continues beyond what is searched (e.g., `.fai` indexes) and exclude a specific file
fn_to_check = "pep.fa" | |
sequences = "" | |
import os | |
import fnmatch | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+fn_to_check): | |
if not file.endswith(".fai") and file != "DBVPG6044.mt.pep.fa": | |
sequences += get_seq_from_multiFASTA_with_match_in_description( | |
file,gene_to_match, return_record_as_string=True) | |
# Use fnmatch to change the names of files with specific extensions (in a subdirectory) to different extensions | |
#Change name of genome files from ending in `.fas` to ending in `.genome.fa` ; rename files, renaming files | |
genomes_dir = "genomes" | |
old_extension = ".fas" | |
new_extension = ".genome.fa" | |
import os | |
import fnmatch | |
for file in os.listdir(genomes_dir):
    if fnmatch.fnmatch(file, '*'+ old_extension):
        !mv {genomes_dir}/{file} {genomes_dir}/{file.split(old_extension)[0]+new_extension}
# Package up a lot of various data sources and output streams (if trying to pack up
# just files, keep in mind the nbzip module; if you want a whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress that example with `tar xf archive.tar`)
python_data_to_grab = { | |
'FILE_NAME_TO_GENERATE_A.md':(name_of_listA,"DESCRIPTION_TAG_to_put_on_top_line_of_file:"), | |
'FILE_NAME_TO_GENERATE_B.md': (name_of_listB,"TAG_to_put_on_top_line_of_file:"), | |
'FILE_NAME_TO_GENERATE_C.md':(out_stream.stderr,"TAG_to_put_on_top_line_of_file:"), | |
} | |
# PYTHON 2.7 VERSION | |
import sys | |
import contextlib | |
data_tag = "some_descriptive_string_here_about_info" | |
# fix for python 2 based on https://stackoverflow.com/a/44226422/8508004 | |
@contextlib.contextmanager | |
def redirect_stdout(target): | |
original = sys.stdout | |
sys.stdout = target | |
yield | |
sys.stdout = original | |
for file_name_to_use,py_obj_info in python_data_to_grab.items(): | |
py_obj,data_name = py_obj_info | |
with open(file_name_to_use, 'w') as f: | |
with redirect_stdout(f): | |
print(data_name + " =") | |
print(str(py_obj)) | |
# package up the files
!mkdir pertinent_data_for{data_tag}
for each_file in python_data_to_grab.keys():
    !mv {each_file} pertinent_data_for{data_tag}/.
!tar czf pertinent_data_for{data_tag}.tar.gz pertinent_data_for{data_tag}/
sys.stderr.write("Useful information for the set saved as "
    "`pertinent_data_for{}.tar.gz`".format(data_tag))
# IT'D BE BETTER TO incorporate `%store` in above, I think. see https://stackoverflow.com/a/32731418/8508004 & above here | |
# identify several files via file names and fnmatch and package up without placing in a directory first | |
# (see under 'Collecting all the results' in `Annotating mito sequences extracted from XXXX collection with MFannot and converting annotation file to gff3.ipynb` if need example with putting into directory first just using bash shell commands or | |
# search `!mv {each_file} pertinent_data/.` here for something similar) | |
archive_file_name = "annotations_for_four_putative_mitos_from332.tar.gz" | |
import os | |
import sys | |
import fnmatch | |
dl_files = [] | |
name_part_to_match = ".new" | |
for file in os.listdir('.'): | |
if fnmatch.fnmatch(file, '*'+name_part_to_match): | |
#print (file) | |
#first_part_filen = file.rsplit(extension_to_handle,1)[0] | |
dl_files.append(file) | |
!tar czf {archive_file_name} {" ".join(dl_files)} | |
sys.stderr.write("***************************DONE***********************************\n" | |
"'{}' generated. Download it.\n" | |
"***************************DONE***********************************".format(archive_file_name)) | |
#Note that when I added the `--transform` flag to making a tar, it disrupted passing Python variables into shell commands, and so even after consulting Claude and trying some things
#I found it easier to just hardcode the name, even though that is blatantly redundant and breaks DRY, like so:
archive_file_name = "results_for_merged_set.tar.gz" | |
!tar czf {archive_file_name} --transform 's/^\./results_for_merged_set/' {" ".join(list_of_files)} | |
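# A possible alternative sketch that sidesteps the shell-quoting issue entirely: use the
# stdlib `tarfile` module and its `arcname` parameter to nest the files under a directory
# name inside the archive (assumes the same `list_of_files` as above):
import tarfile
archive_file_name = "results_for_merged_set.tar.gz"
with tarfile.open(archive_file_name, "w:gz") as tar:
    for fn in list_of_files:
        tar.add(fn, arcname="results_for_merged_set/" + fn)  # place each file under that folder inside the archive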
#package up several files for download (if trying to pack up just files, keep in mind the nbzip module as an alternative; if you want a whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress that example with `tar xf archive.tar`)
# make one file for downloading | |
archive_file_name = "collected_files.tar.gz" | |
dl_files = [f + "_tag.fa" for f in file_list]
!tar czf {archive_file_name} {" ".join(dl_files)} | |
sys.stderr.write("*****************DONE***********************************\n" | |
"'{}' generated. Download it.\n" | |
"*****************DONE***********************************".format(archive_file_name)) | |
#package up several files, plus files made from captured output streams, for download (keep in mind the nbzip module as an alternative; if you want a whole subdirectory hierarchy recursively, use https://stackoverflow.com/a/48141707/8508004 ; uncompress that example with `tar xf archive.tar`)
# make one file for downloading | |
archive_file_name = "collected_files.tar.gz" | |
dl_files = [f + "_tag.fa" for f in file_list]
# save & add the additional information files to collect | |
output_txt_filename_a = 'seqs_filtered_info.txt' | |
output_txt_filename_b = 'seqs_filtered.txt' | |
%store captured_stream_a.stderr >{output_txt_filename_a} #based on https://stackoverflow.com/a/32731418/8508004 | |
%store captured_stream_b.stdout >{output_txt_filename_b} #based on https://stackoverflow.com/a/32731418/8508004 | |
dl_files += [output_txt_filename_a, output_txt_filename_b] # or if really only one, `dl_files.append(filtered_out)` | |
!tar czf {archive_file_name} {" ".join(dl_files)} | |
sys.stderr.write("*****************DONE***********************************\n" | |
"{} generated. Download it.\n" | |
"*****************DONE***********************************".format(archive_file_name)) | |
# note: based on https://stackoverflow.com/a/32731418/8508004, `%store` above replaces writing the captured streams out by hand, which would look like this:
with open(output_txt_filename_a, 'w') as output_handler: | |
output_handler.write(captured_stream_a.stderr) | |
with open(output_txt_filename_b, 'w') as output_handler: | |
output_handler.write(captured_stream_b.stdout) | |
# Package up several dataframes and sequences | |
#Archive the CTD sequences (FASTA format) collected and any dataframes made | |
# Pickle each dataframe and also save as `tsv` for possible use elsewhere | |
strd_dataframes_fn_list = [] | |
def pickle_df_and_store_as_table(dataframe, prefix): | |
''' | |
Take a dataframe and a filename prefix and save a pickled form of that | |
dataframe and a text tabular data version (tab-separated values).
Returns the names of the pickled and text files.
''' | |
dataframe.to_pickle(prefix + ".pkl") | |
dataframe.to_csv(prefix + ".tsv", sep='\t',index = False) | |
return prefix + ".pkl", prefix + ".tsv" | |
# To automate the dataframe handling, make a dictionary with the filename prefix to use as each key
# and the associated dataframe as the value
df_n_fn_dict = { | |
"CTD_seq_of_protein_orthologs": CTD_seq_df, | |
"first_heptad_of_protein_orthologs": first_7_df, | |
"heptads_ofCTD_seq_of_protein_orthologs": repeat_df, | |
"main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df, | |
"fraction_matching_consensus_per_CTD": fraction_consensus_df, | |
} | |
import pandas as pd | |
for prefix, dataframe in df_n_fn_dict.items(): | |
#pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix) | |
strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix)) | |
# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling | |
CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json" | |
import json | |
with open(CTD_seqs_fn_list_storedfn, 'w') as f: | |
json.dump(CTD_seqs_fn_list, f) | |
# see my useful python snippets for reading json back in | |
#for ease in aligning or other uses later, save all the CTDs as a concatenated file
cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa"
# !cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn} # faster, but not as good as awk if the files don't already have newlines at the end;
# that just results in the lines of the files as one long run-on that won't work for aligning
!awk 1 {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn} #based on https://stackoverflow.com/a/25030513/8508004
archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn] | |
archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz" | |
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command | |
sys.stderr.write("\nCollected CTD sequences" | |
" and tables of details gathered and saved as " | |
"`{}`.".format(archive_file_name)) | |
# Package up several dataframes and a list of genomes (see just below for only with dataframes) | |
# Pickle each dataframe and also save as `tsv` for possible use elsewhere | |
strd_dataframes_fn_list = [] | |
# store `genomes` as json since lighter-weight and more portable than pickling | |
# for easy json dumping for many list use when archiving: | |
file_names_for_lists_dict = { | |
"genomes_list.json":genomes, | |
} | |
import json | |
for fn, lizt in file_names_for_lists_dict.items(): | |
with open(fn, 'w') as f: | |
json.dump(lizt, f) | |
def pickle_df_and_store_as_table(dataframe, prefix): | |
''' | |
Take a dataframe and a filename prefix and save a pickled form of that | |
dataframe and a text tabular data version (tab-separated values).
Returns the names of the pickled and text files.
''' | |
dataframe.to_pickle(prefix + ".pkl") | |
dataframe.to_csv(prefix + ".tsv", sep='\t',index = False) | |
return prefix + ".pkl", prefix + ".tsv" | |
# To automate the dataframe handling, make a dictionary with the filename prefix to use as each key
# and the associated dataframe as the value
df_n_fn_dict = { | |
"mito_promoter_matches_df": df, | |
"mito_promoter_hit_num_tallies_by_id_df": largest_hit_num_by_id_df, | |
"disruptor_matches_df": disrupt_df, | |
"disruptor_hit_num_tallies_by_id_df": largest_disr_num_by_id_df, | |
"grich_matches_df": grich_df, | |
"grich_hit_num_tallies_by_id_df": largest_grich_num_by_id_df, | |
"endgrich_matches_df": end_grich_df, | |
"endgrich_hit_num_tallies_by_id_df": largest_endgrich_num_by_id_df, | |
"twenty_nineATrich_seq_matches_df": twenty_nine_df, | |
"twenty_nineATrich_seq_hit_num_tallies_by_id_df": largest_ATrich_num_by_id_df, | |
} | |
import pandas as pd | |
for prefix, dataframe in df_n_fn_dict.items(): | |
#pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix) | |
strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix)) | |
archiving_fn_list = strd_dataframes_fn_list + list(file_names_for_lists_dict.keys()) | |
archive_file_name = "Counts_promoter_motifs_among1011_21S_candidates_where_no_mito_prev_identified.tar.gz" | |
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command | |
sys.stderr.write("\nCollected dataframes" | |
" and tables of details gathered and saved as " | |
"`{}`.".format(archive_file_name)) | |
# for archiving just several dataframes with automated handling | |
archive_file_name = "dataframes_archived.tar.gz" | |
strd_dataframes_fn_list = [] | |
def pickle_df_and_store_as_table(dataframe, prefix): | |
''' | |
Take a dataframe and a filename prefix and save a pickled form of that | |
dataframe and a text tabular data version (tab-separated values).
Returns the names of the pickled and text files.
''' | |
dataframe.to_pickle(prefix + ".pkl") | |
dataframe.to_csv(prefix + ".tsv", sep='\t',index = False) | |
return prefix + ".pkl", prefix + ".tsv" | |
# To automate the dataframe handling, make a dictionary with the filename prefix to use as each key
# and the associated dataframe as the value
df_n_fn_dict = { | |
"df": df, | |
"another_df": another_df, | |
"yet_another_df": yet_another_df, | |
} | |
import pandas as pd | |
for prefix, dataframe in df_n_fn_dict.items(): | |
#pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix) | |
strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix)) | |
archiving_fn_list = strd_dataframes_fn_list | |
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command | |
sys.stderr.write("\nCollected dataframes" | |
" gathered and saved as " | |
"`{}`.".format(archive_file_name)) | |
# for easy json dumping for many list use when archiving: | |
file_names_for_lists_dict = { | |
"annotation_fns.json":annot_fns, | |
"genome_fnss.json":genomes_for_anot_fns, | |
"fn_pairings.json":file_pairs, | |
} | |
import json | |
for fn, lizt in file_names_for_lists_dict.items(): | |
with open(fn, 'w') as f: | |
json.dump(lizt, f) | |
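# A minimal sketch of reading one of those json lists back in later (same filename as above):
import json
with open("annotation_fns.json") as f:
    annot_fns = json.load(f)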
# Use curl to get a FASTA file from OCA and remove the HTML tags (may need `!pip install BS4` first)
# Get FASTA file for the non yeast one | |
import os | |
#!curl -o 1x0t_A.fa http://oca.weizmann.ac.il/oca-bin/send-seq?1x0t_A | |
os.system("curl -o 1x0t_A.fa http://oca.weizmann.ac.il/oca-bin/send-seq?1x0t_A") | |
# remove HTML to leave actual FASTA | |
# based on https://stackoverflow.com/a/21577649/8508004 and https://unix.stackexchange.com/a/64747 | |
import sys | |
from bs4 import BeautifulSoup | |
oca_file_to_fix = "1x0t_A.fa" | |
soup = BeautifulSoup(open(oca_file_to_fix), "html.parser") | |
for pre in soup.findAll("pre"): | |
fasta = pre.contents | |
%store fasta[0] >{oca_file_to_fix} | |
# NOTE ABOUT THE READING PART OF THIS NEXT BLOCK: the more modern, Pythonic way
# is to leave out the `,'r'` part. See https://stackabuse.com/read-a-file-line-by-line-in-python/ under
# 'Read a File Line-by-Line with a for Loop - Most Pythonic Approach'. Note also that it is
# best to use `.strip()` (or possibly slice with `[:-1]`) to remove the line ending if you are going to
# rearrange lines, because the last line usually lacks a newline character and you can get a weird
# merge if you alter the order. (A tiny strip-and-rearrange sketch follows the block below.)
# add identifiers to each `chr` so results for each strain clear later | |
chromosome_id_prefix = "chr" | |
def add_strain_id_to_description_line(file,strain_id): | |
''' | |
Takes a file and edits every description line to add | |
strain_id after the caret. | |
Saves the fixed file | |
''' | |
import sys | |
output_file_name = "temp.txt" | |
# prepare output file for saving so it will be open and ready | |
with open(output_file_name, 'w') as output_file: | |
# read in the input file | |
with open(file, 'r') as input_handler: # OR SEE NOTE ABOVE THIS CODE BLOCK HOW DON'T NEED `, 'r'` anymore. | |
# prepare to give feeback later or allow skipping to certain start | |
lines_processed = 0 | |
for line in input_handler: | |
lines_processed += 1 | |
if line.startswith(">"): | |
rest_o_line = line.split(">") | |
new_line = ">"+strain_id + rest_o_line[1] | |
else: | |
new_line = line | |
# Send text to output | |
output_file.write(new_line) | |
# replace the original file with edited | |
!mv temp.txt {file} | |
# Feedback | |
sys.stderr.write("\n{} chromosome identifiers tagged.".format(file)) | |
for s in yue_et_al_strains: | |
add_strain_id_to_description_line(s+".genome.fa",s) | |
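# The tiny strip-and-rearrange sketch promised in the note above this block (the file name
# here is hypothetical; the point is just to drop line endings before reordering lines):
with open("example_lines.txt") as fh:
    lines = [line.strip() for line in fh]  # no `,'r'` needed; strip the line endings
reordered = lines[1:] + lines[:1]          # e.g., move the first line to the end
text_out = "\n".join(reordered) + "\n"     # add the newlines back only when reassembling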
# A find / replace similar to the last example but pure Python (no IPython magics or shell use)
# (See `testing_repeat_number_by_looping_bendit_analysis.ipynb` for practical use of this to change a script on a loop to monitor effect on outcome) | |
script_name = "donut_plot_with_subgroups_from_dataframe.py" | |
def change_original_title(s): | |
''' | |
Change the plot title to the provided text. | |
''' | |
with open(script_name, 'r') as thefile: | |
script=thefile.read() | |
script = script.replace('BREAKDOWN', s) | |
with open(script_name, 'w') as output_file: | |
output_file.write(script) | |
change_original_title("NEW TITLE GOES HERE") | |
# Note for making substitutions, Python now lets you use f-strings (formatted string literals) to substitute
# variables into strings by name, but Python strings also have 'Template strings' built in (Ex. `import string; t = string.Template('Hello, $name!'); print(t.substitute(name='World'))`)
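# A minimal sketch of those two substitution options side by side (nothing here is specific
# to this gist; just the standard library):
import string
name = "World"
print(f"Hello, {name}!")               # f-string (Python 3.6+)
t = string.Template('Hello, $name!')
print(t.substitute(name=name))         # Template string from the standard library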
# Collect list of image files in a directory | |
# Run this in notebook that is in directory along with the folder containing | |
# images, i.e., is in the level above the actual images | |
import os | |
import sys | |
try: | |
from pathlib import Path | |
except ImportError: | |
from pathlib2 import Path | |
img_folder = "Untitled Folder" | |
img_file_extensions = [".png",".jpg",".jpeg"] | |
list_imgs_in_directory = [] | |
for file in os.listdir(img_folder): | |
#print (file) | |
if Path(file).suffix in img_file_extensions: | |
list_imgs_in_directory.append(file) | |
len(list_imgs_in_directory) | |
#Pathlib in Python 2 or 3 example: | |
try: | |
from pathlib import Path | |
except ImportError: | |
from pathlib2 import Path | |
# list all files in a directory | |
[item for item in Path('.').glob('*')] # based on | |
# https://jefftriplett.com/2017/pathlib-is-wonderful/ | |
# list final file extension , see 'Path.suffix' at | |
#https://docs.python.org/3/library/pathlib.html | |
[item.suffix for item in Path('.').glob('*')] | |
# list the final suffixes if there is more than one - see 'Path.suffixes' at | |
#https://docs.python.org/3/library/pathlib.html | |
# Collect list of image files in a directory and display them in a Jupyter | |
# notebook cell | |
# Run this in notebook that is in directory along with the folder containing | |
# images, i.e., is in the level above the actual images | |
import os | |
import sys | |
try: | |
from pathlib import Path | |
except ImportError: | |
from pathlib2 import Path | |
from IPython.display import Image | |
from IPython.display import display | |
img_folder = "Untitled Folder" | |
img_file_extensions = [".png",".jpg",".jpeg"] | |
list_imgs = [] | |
for file in os.listdir(img_folder): | |
#print (file) | |
if Path(file).suffix in img_file_extensions: | |
list_imgs.append(Path(img_folder,file)) | |
imgl = [Image(filename=str(x)) for x in list_imgs] #had to cast the | |
# path object to a string or else `display.py` was giving error | |
# `'PosixPath' object has no attribute 'split'`;seems `display.py` not able to | |
# handle path objects yet. | |
display(*imgl) | |
# Collect list of image files in a directory and display them in a Jupyter | |
# notebook cell WITH FILE NAMES SHOWN BELOW EACH | |
# Run this in notebook that is in directory along with the folder containing | |
# images, i.e., is in the level above the actual images | |
import os | |
import sys | |
try: | |
from pathlib import Path | |
except ImportError: | |
from pathlib2 import Path | |
from IPython.display import Image | |
from IPython.display import display | |
img_folder = "Untitled Folder" | |
img_file_extensions = [".png",".jpg",".jpeg"] | |
list_imgs = [] | |
for file in os.listdir(img_folder): | |
#print (file) | |
if Path(file).suffix in img_file_extensions: | |
list_imgs.append(Path(img_folder,file)) | |
for i in list_imgs: | |
display(Image(filename=str(i))) | |
print("ABOVE: {}".format(i.name)) | |
#slide carousel-like example to show a subset of images that changes every five seconds (from `demo_palette.ipynb` in pymol-binder) with HTML labels for each image to make the text stand out: | |
import IPython.display as ipd | |
import time | |
import os | |
import sys | |
import random | |
def display_subset(): | |
img = {} | |
for x in random.sample(range(shuffles_to_do), 3): | |
img[x] = ipd.Image(filename="img_{}.png".format(x)) | |
ipd.display(img[x]) | |
ipd.display(ipd.HTML('ABOVE: <font size=5><b>img_{}.png</b></font>'.format(x))) | |
time.sleep(5) | |
ipd.clear_output(wait=True) | |
while True: | |
display_subset() | |
# Subset / restrict to a random sampling of items in a list, based on https://pynative.com/python-random-sample/
# Good for doing right before EVERYTHING GETS PROCESSED, to pick a subset for testing instead
# of defining one specifically
import random | |
genomes = random.sample(population=genomes, k=15) | |
# Run a function every 8 minutes | |
%load https://gist.githubusercontent.com/fomightez/b012e51ebef6ec58c1515df3ee0c850a/raw/300da6c67ceeaf5384a3e500648b993345c361cb/run_every_eight_mins.py | |
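# I haven't reproduced that gist's code here; a minimal, hedged sketch of the general idea
# (call the function, then sleep, in a loop) that doesn't depend on the loaded script:
import time
def do_task():
    print("running task")      # placeholder for the real work
while True:
    do_task()
    time.sleep(8 * 60)          # wait eight minutes between runs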
# RELOAD for when you are using `from python_file_containing_function import a_function` (Python 3) | |
# Reload a function into a notebook after editing the script file in editor of running session; | |
# this allows calling the function in the notebook whereas if just reload the script won't | |
import importlib | |
import python_file_containing_function; importlib.reload(python_file_containing_function); from python_file_containing_function import a_function | |
# above line from https://stackoverflow.com/a/11724154/8508004 | |
# RELOAD for when you are using `import python_file` (Python 3) | |
# Reload a script into a notebook after editing the script file in editor of running session; | |
# note it is much more easily done then the case where using `from foo import foo`, but | |
# `from foo import foo` makes it easier to work in a notebook in many ways. | |
import importlib; import python_file; importlib.reload(python_file)
# Create a download link in Jupyter notebook; from | |
# https://medium.com/ibm-data-science-experience/how-to-upload-download-files-to-from-notebook-in-my-local-machine-6a4e65a15767 | |
# <-- Haven't tried it yet but it might be handy | |
# for idea I am working on for making animations from pymol files using jmol or any where where I suggest | |
# downloading an archive of results | |
from IPython.display import HTML
import base64
def create_download_link( df, title = "Download CSV file", filename = "data.csv"): | |
csv = df.to_csv() | |
b64 = base64.b64encode(csv.encode()) | |
payload = b64.decode() | |
html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>' | |
html = html.format(payload=payload,title=title,filename=filename) | |
return HTML(html) | |
create_download_link(df) | |
# For handling archive files to make a clickable download link, I found the section 'Create and download CSV/zip file' at https://blog.softhints.com/jupyter-ipython-download-files/ ; however, the code seems incomplete, as I don't see how they make the zip file in conjunction with sending it through as the payload. (I assume `create_download_files()` was triggered elsewhere already.) And, a minor thing, why not return `HTML(html)` in that code block?
# Maybe some of the answers here could help reverse that ZipFile approach so it works for downloading to local:
# https://stackoverflow.com/questions/5710867/downloading-and-unzipping-a-zip-file-without-writing-to-disk
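# A hedged sketch of one way to finish that idea: build the zip in memory with the stdlib
# `zipfile` module and hand the bytes to a data-URI link (the file names and function name
# here are hypothetical, not from that blog post):
import base64
import io
import zipfile
from IPython.display import HTML
def create_zip_download_link(file_names, archive_name="results.zip", title="Download zip"):
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fn in file_names:
            zf.write(fn)  # add each existing file to the in-memory zip
    payload = base64.b64encode(buf.getvalue()).decode()
    html = ('<a download="{fn}" href="data:application/zip;base64,{payload}" '
            'target="_blank">{title}</a>').format(fn=archive_name, payload=payload, title=title)
    return HTML(html)
#create_zip_download_link(["data.csv"])  # example call with a hypothetical file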
# Related to the topic of making downloadable links from Jupyter pages, I found https://stackoverflow.com/questions/26497912/trigger-file-download-within-ipython-notebook | |
# and | |
# https://stackoverflow.com/questions/24437661/retrieving-files-from-remote-ipython-notebook-server/24439480#24439480 about | |
# FileLink / FileLinks; however, in JupyterLab, if it is a gif or png that JupyterLab renders, it opens it in the application
# instead of allowing download. And if it is a tarball that it doesn't render and you click on it, instead of offering to download
# it, it says it isn't UTF-8 encoded.
# Fortunately when in Voila apps, you can list the files with the following: | |
from IPython.display import FileLink, FileLinks | |
FileLinks(".") | |
# And in VOILA those can be right-clicked on and downloaded to a local drive from those links using `Save link as...`.
# However, a better, related solution for Voila, because it makes a pop-up automatically without needing the user to use `Save link as...`, is:
%%html | |
<a href="SVM_Confusion_Matrix.jpg" download="SVM_Confusion_Matrix.jpg">Click HERE to Download SVM image</a> | |
# Using Panel (installable via pip) in a notebook (NOT VOILA) you can make a download file, too: | |
import panel as pn | |
pn.extension() | |
# Create option to download SVM Confusion Matrix Graphic | |
pn.widgets.FileDownload( | |
file="SVM_Confusion_Matrix.jpg", | |
embed=False, | |
name="Save SVM Confusion Matrix image" | |
) |