Ken-Kuroki’s gists

Ken-Kuroki / tfidf.py

Created August 26, 2019 02:39

Calculate TF-IDF from a count matrix

	import numpy as np
	from sklearn.preprocessing import normalize

	def tf_idf(X):
	    """Return L2-normalised TF-IDF weights for a document-term count matrix.

	    Corresponds to ``smooth_idf=True`` and ``norm="l2"`` in
	    ``sklearn.feature_extraction.text.TfidfVectorizer``; the smoothed
	    inverse document frequency is ``ln((N + 1) / (df + 1)) + 1``, where
	    ``N`` is the number of rows and ``df`` the per-column nonzero count.

	    Args:
	        X: 2-D count matrix, one row per document and one column per term.
	            May be a dense array-like or a scipy sparse matrix.

	    Returns:
	        The TF-IDF matrix, same shape as ``X``, with each row rescaled by
	        ``normalize(..., norm="l2")``.
	    """
	    # Sparse containers need their own code path: len() is ambiguous for
	    # them, np.count_nonzero does not understand them, and ``*`` may mean a
	    # matrix product rather than an elementwise one.
	    is_sparse = hasattr(X, "tocsr")
	    if not is_sparse:
	        X = np.asarray(X)

	    n_docs = X.shape[0]
	    tf = normalize(X, norm="l1", axis=1)

	    # Document frequency: number of rows with a nonzero entry in each column.
	    if is_sparse:
	        df = np.asarray((X != 0).sum(axis=0)).ravel()
	    else:
	        df = np.count_nonzero(X, axis=0)
	    idf = np.log((n_docs + 1) / (df + 1)) + 1

	    # Scale column j of the term frequencies by idf[j].
	    weighted = tf.multiply(idf) if is_sparse else tf * idf
	    return normalize(weighted, norm="l2")

Ken-Kuroki / dist_matrix.py

Last active August 30, 2019 16:00

Generate a distance matrix from a Newick tree

	from itertools import product
	import pandas as pd
	from ete3 import Tree

	# Load the tree from file (the .nwk extension suggests Newick format).
	t = Tree("some_tree_file.nwk")

	# Leaves and their names, fetched with two separate calls; presumably both
	# come back in the same order -- TODO confirm against the ete3 docs.
	leaves = t.get_leaves()
	leaf_names = t.get_leaf_names()
	# Every ordered (leaf, leaf) combination, self-pairs included. product()
	# returns a one-shot iterator, so it can only be consumed once.
	pair = product(leaves, leaves)

Ken-Kuroki / dist_unifrac.py

Created August 31, 2019 14:03

Generate a distance matrix among samples using the UniFrac distance

	import numpy as np
	from skbio import TreeNode
	from skbio.diversity import beta_diversity

	# Read the reference tree from the gg_13_8_otus directory.
	tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")
	# Six placeholder sample names: sample0 .. sample5.
	sample_ids = [f"sample{i}" for i in range(6)]
	# OTU ids are the first tab-separated field of every line in the taxonomy file.
	# NOTE(review): the line after `with` looks like its body; its indentation
	# appears to have been lost in this listing -- confirm against the original.
	with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
	otu_ids = [each.strip().split("\t")[0] for each in f.readlines()]
	# Random integer counts in [0, 100) as nested lists: one row per sample id,
	# one column per OTU id. No seed is set here, so values differ between runs.
	data = np.random.randint(0, 100, size=(len(sample_ids), len(otu_ids))).tolist()

Ken-Kuroki / randomforest.py

Last active April 21, 2020 04:17

Generic random forest classification to draw a learning curve

	import numpy as np
	import pandas as pd
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve
	import altair as alt

	def learn(X, y, **kwargs): # X needs to be a pandas dataframe
	r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs)
	steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20))
	learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"),

Ken-Kuroki / itol_basic.py

Created October 4, 2019 11:02

Call iTOL API to draw and save a phylogenetic tree

	from itolapi import Itol

	def itol(tree_file: str, save_file: str, save_format: str) -> str:
	if tree_file[-5:] != ".tree" and tree_file[-9:] != ".tree.txt":
	raise Exception("Input tree file name must end with .tree or .tree.txt")
	if save_format not in ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']:
	raise Exception("Unsupported save format")

	itol_uploader = Itol()
	itol_uploader.add_file(tree_file)

Ken-Kuroki / itol_simplebar.py

Last active October 4, 2019 12:46

Generate an iTOL "simple bar" annotation file from a pandas DataFrame

	import numpy as np
	import pandas as pd
	from ete3 import PhyloTree

	def generate_simplebar(df: pd.DataFrame, save_file: str) -> None:
	# df columns must be labels and values.
	output = """DATASET_SIMPLEBAR\n
	SEPARATOR COMMA\n
	DATASET_LABEL,label_simplebar\n
	COLOR,#ff0000\n

Ken-Kuroki / itol_binary.py

Created October 4, 2019 12:46

Generate an iTOL "binary data" annotation file from a pandas DataFrame

	import numpy as np
	import pandas as pd
	from ete3 import PhyloTree

	def generate_binary(df: pd.DataFrame, save_file: str) -> None:
	# df columns must be labels, values, values, ..., values.
	num_cols = df.shape[1]-1
	shapes = ",".join(["1"]*num_cols)
	colors = ",".join(["#ff0000"]*num_cols)
	labels = ",".join([f"{i+1}" for i in range(num_cols)])

Ken-Kuroki / get_gcd.py

Created October 6, 2019 09:12

Calculate GCD

Ken-Kuroki / prime_factorize.py

Created October 6, 2019 13:20

Find prime factors

Ken-Kuroki / itol_colorstrip.py

Created October 7, 2019 11:36

Generate an iTOL "color strip" annotation file from a pandas Series

	import numpy as np
	import pandas as pd
	from ete3 import PhyloTree
	from itertools import cycle

	def generate_colorstrip(labels: pd.Series, save_file: str, palette: list = None) -> None:
	# labels must be a pd.Series whose index are leaf names and values are their labels
	if palette is None:
	palette = ["#4E79A7", "#A0CBE8", "#F28E2B", "#FFBE7D", "#59A14F",
	"#8CD17D", "#B6992D", "#F1CE63", "#499894", "#86BDB6",