Skip to content

Instantly share code, notes, and snippets.

@Ken-Kuroki
Ken-Kuroki / tfidf.py
Created August 26, 2019 02:39
Calculate TF-IDF from a count matrix
import numpy as np
from sklearn.preprocessing import normalize
def tf_idf(X):
    """Compute L2-normalised TF-IDF weights from a dense count matrix.

    Corresponds to ``smooth_idf=True`` and ``norm="l2"`` in
    ``sklearn.feature_extraction.text.TfidfVectorizer``.

    Args:
        X: Dense 2-D array-like of counts. Term frequencies are normalised
            per row and document frequencies are counted per column, so rows
            act as documents and columns as terms.

    Returns:
        A 2-D ``numpy.ndarray`` with the same shape as ``X`` whose rows are
        scaled to unit L2 norm by ``normalize``.
    """
    # Work on one ndarray view so the row count and the non-zero count are
    # taken from the same object, whatever dense array-like was passed in.
    X = np.asarray(X)
    n_docs = X.shape[0]

    # Term frequency: each row rescaled by its L1 norm.
    tf = normalize(X, norm="l1", axis=1)

    # Document frequency: number of rows in which each column is non-zero.
    df = np.count_nonzero(X, axis=0)

    # Smoothed IDF: the +1 in numerator and denominator prevents a division
    # by zero for all-zero columns; the trailing +1 keeps the weight of a
    # term present in every row from collapsing to zero.
    idf = np.log((n_docs + 1) / (df + 1)) + 1

    return normalize(tf * idf, norm="l2")
@Ken-Kuroki
Ken-Kuroki / dist_matrix.py
Last active August 30, 2019 16:00
Generate a distance matrix from a Newick tree
from itertools import product
import pandas as pd
from ete3 import Tree
# Build an ete3 Tree from the Newick file (the path is a placeholder).
t = Tree("some_tree_file.nwk")
# Leaf node objects of the tree.
leaves = t.get_leaves()
# Names of the leaves; presumably in the same order as `leaves` -- TODO confirm.
leaf_names = t.get_leaf_names()
# Lazy iterator over every ordered (leaf, leaf) combination, including each
# leaf paired with itself. It can be consumed only once.
pair = product(leaves, leaves)
@Ken-Kuroki
Ken-Kuroki / dist_unifrac.py
Created August 31, 2019 14:03
Generate a distance matrix among samples using UniFrac distance
import numpy as np
from skbio import TreeNode
from skbio.diversity import beta_diversity
# Reference tree read with scikit-bio.
tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")

# Six synthetic sample identifiers: sample0 .. sample5.
sample_ids = [f"sample{i}" for i in range(6)]

# The first tab-separated field of each taxonomy line is taken as the OTU id.
# Blank lines are skipped so they cannot yield an empty id, which would
# otherwise add a spurious column to the count matrix below.
with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
    otu_ids = [line.strip().split("\t")[0] for line in f if line.strip()]

# Random counts in [0, 100): one row per sample, one column per OTU id.
data = np.random.randint(0, 100, size=(len(sample_ids), len(otu_ids))).tolist()
@Ken-Kuroki
Ken-Kuroki / randomforest.py
Last active April 21, 2020 04:17
Generic random forest classification to draw a learning curve
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve
import altair as alt
def learn(X, y, **kwargs): # X needs to be a pandas dataframe
r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs)
steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20))
learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"),
@Ken-Kuroki
Ken-Kuroki / itol_basic.py
Created October 4, 2019 11:02
Call the iTOL API to draw and save a phylogenetic tree
from itolapi import Itol
def itol(tree_file: str, save_file: str, save_format: str) -> str:
if tree_file[-5:] != ".tree" and tree_file[-9:] != ".tree.txt":
raise Exception("Input tree file name must end with .tree or .tree.txt")
if save_format not in ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']:
raise Exception("Unsupported save format")
itol_uploader = Itol()
itol_uploader.add_file(tree_file)
@Ken-Kuroki
Ken-Kuroki / itol_simplebar.py
Last active October 4, 2019 12:46
Generate an iTOL "simple bar" annotation file from a pandas DataFrame
import numpy as np
import pandas as pd
from ete3 import PhyloTree
def generate_simplebar(df: pd.DataFrame, save_file: str) -> None:
# df columns must be labels and values.
output = """DATASET_SIMPLEBAR\n
SEPARATOR COMMA\n
DATASET_LABEL,label_simplebar\n
COLOR,#ff0000\n
@Ken-Kuroki
Ken-Kuroki / itol_binary.py
Created October 4, 2019 12:46
Generate an iTOL "binary data" annotation file from a pandas DataFrame
import numpy as np
import pandas as pd
from ete3 import PhyloTree
def generate_binary(df: pd.DataFrame, save_file: str) -> None:
# df columns must be labels, values, values, ..., values.
num_cols = df.shape[1]-1
shapes = ",".join(["1"]*num_cols)
colors = ",".join(["#ff0000"]*num_cols)
labels = ",".join([f"{i+1}" for i in range(num_cols)])
@Ken-Kuroki
Ken-Kuroki / get_gcd.py
Created October 6, 2019 09:12
Calculate GCD
def get_gcd(a, b):
    """Return the greatest common divisor of ``a`` and ``b``.

    Uses the iterative Euclidean algorithm. The result is always
    non-negative, and ``get_gcd(x, 0) == get_gcd(0, x) == abs(x)``, so a
    zero argument no longer raises ``ZeroDivisionError`` and negative
    arguments can no longer cause unbounded recursion.

    Args:
        a: First number.
        b: Second number.

    Returns:
        The greatest common divisor of ``abs(a)`` and ``abs(b)``;
        ``0`` when both arguments are zero.
    """
    # Work on magnitudes: with mixed signs, Python's ``%`` takes the sign of
    # the divisor, which kept the remainder from ever shrinking to zero.
    a, b = abs(a), abs(b)

    # No explicit ordering is needed: when a < b the first step yields
    # (b, a), which swaps the operands.
    while b:
        a, b = b, a % b
    return a
@Ken-Kuroki
Ken-Kuroki / prime_factorize.py
Created October 6, 2019 13:20
Find prime factors
def prime_factorize(n):
answers = []
while n % 2 == 0:
answers.append(2)
n //= 2
f = 3
while f**2 <= n: # instead of f <= n**0.5
if n % f == 0:
answers.append(f)
n //= f
@Ken-Kuroki
Ken-Kuroki / itol_colorstrip.py
Created October 7, 2019 11:36
Generate an iTOL "color strip" annotation file from a pandas Series
import numpy as np
import pandas as pd
from ete3 import PhyloTree
from itertools import cycle
def generate_colorstrip(labels: pd.Series, save_file: str, palette: list = None) -> None:
# labels must be a pd.Series whose index are leaf names and values are their labels
if palette is None:
palette = ["#4E79A7", "#A0CBE8", "#F28E2B", "#FFBE7D", "#59A14F",
"#8CD17D", "#B6992D", "#F1CE63", "#499894", "#86BDB6",