This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.preprocessing import normalize | |
def tf_idf(X):
    """Compute L2-normalised TF-IDF weights from a matrix of term counts.

    According to the original author's note, the result corresponds to
    ``sklearn.feature_extraction.text.TfidfVectorizer`` with smoothed IDF
    and ``norm="l2"``.

    Args:
        X: Two-dimensional count matrix. Rows are normalised individually
            and document frequencies are counted down each column, so rows
            are presumably documents and columns terms (confirm against
            the caller). ``X`` must support ``len`` and
            ``np.count_nonzero``.

    Returns:
        Matrix of the same shape as ``X`` whose rows are the TF-IDF vectors
        scaled to unit L2 norm.
    """
    # Term frequency: each row rescaled so that its entries sum to 1.
    term_freq = normalize(X, axis=1, norm="l1")

    # Smoothed inverse document frequency: adding 1 to both the row count
    # and each column's non-zero count keeps the ratio finite for columns
    # that are zero everywhere.
    n_rows = len(X)
    doc_freq = np.count_nonzero(X, axis=0)
    smooth_idf = 1 + np.log((1 + n_rows) / (1 + doc_freq))

    weighted = term_freq * smooth_idf
    return normalize(weighted, norm="l2")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import product | |
import pandas as pd | |
from ete3 import Tree | |
# Load the tree from a Newick file.
t = Tree("some_tree_file.nwk")

# Leaf nodes and their names, as reported by the tree.
leaves = t.get_leaves()
leaf_names = t.get_leaf_names()

# Every ordered (leaf, leaf) combination, including each leaf paired with
# itself. This is a lazy iterator and can be consumed only once.
pair = product(leaves, repeat=2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from skbio import TreeNode | |
from skbio.diversity import beta_diversity | |
# Reference tree read from the unannotated 61-OTU tree file.
tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")

# Six sample identifiers: sample0 .. sample5.
sample_ids = [f"sample{i}" for i in range(6)]

# The OTU id is the first tab-separated field of every taxonomy line.
with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
    otu_ids = [line.strip().partition("\t")[0] for line in f]

# Random integer counts in [0, 100), one row per sample and one column
# per OTU, converted to nested Python lists.
data = np.random.randint(
    0, 100, size=(len(sample_ids), len(otu_ids))
).tolist()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve | |
import altair as alt | |
def learn(X, y, **kwargs): # X needs to be a pandas dataframe | |
r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs) | |
steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20)) | |
learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itolapi import Itol | |
def itol(tree_file: str, save_file: str, save_format: str) -> str: | |
if tree_file[-5:] != ".tree" and tree_file[-9:] != ".tree.txt": | |
raise Exception("Input tree file name must end with .tree or .tree.txt") | |
if save_format not in ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']: | |
raise Exception("Unsupported save format") | |
itol_uploader = Itol() | |
itol_uploader.add_file(tree_file) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from ete3 import PhyloTree | |
def generate_simplebar(df: pd.DataFrame, save_file: str) -> None: | |
# df columns must be labels and values. | |
output = """DATASET_SIMPLEBAR\n | |
SEPARATOR COMMA\n | |
DATASET_LABEL,label_simplebar\n | |
COLOR,#ff0000\n |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from ete3 import PhyloTree | |
def generate_binary(df: pd.DataFrame, save_file: str) -> None: | |
# df columns must be labels, values, values, ..., values. | |
num_cols = df.shape[1]-1 | |
shapes = ",".join(["1"]*num_cols) | |
colors = ",".join(["#ff0000"]*num_cols) | |
labels = ",".join([f"{i+1}" for i in range(num_cols)]) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_gcd(a, b):
    """Return the greatest common divisor of ``a`` and ``b``.

    Uses the Euclidean algorithm iteratively. Argument order does not
    matter, and signs are ignored so the result is never negative.

    Args:
        a: First number.
        b: Second number.

    Returns:
        The largest non-negative value dividing both ``a`` and ``b``.
        ``get_gcd(n, 0)`` and ``get_gcd(0, n)`` return ``abs(n)``;
        ``get_gcd(0, 0)`` returns ``0``.
    """
    # Work on magnitudes: with mixed signs the remainder sequence of the
    # recursive version never reached zero, so it recursed without end.
    a, b = abs(a), abs(b)

    # Looping while ``b`` is non-zero avoids ever evaluating ``a % 0``,
    # which made the previous version raise for a zero argument. If
    # ``a < b`` the first iteration simply swaps the two values.
    while b:
        a, b = b, a % b
    return a
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def prime_factorize(n): | |
answers = [] | |
while n % 2 == 0: | |
answers.append(2) | |
n //= 2 | |
f = 3 | |
while f**2 <= n: # instead of f <= n**0.5 | |
if n % f == 0: | |
answers.append(f) | |
n //= f |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from ete3 import PhyloTree | |
from itertools import cycle | |
def generate_colorstrip(labels: pd.Series, save_file: str, palette: list = None) -> None: | |
# labels must be a pd.Series whose index are leaf names and values are their labels | |
if palette is None: | |
palette = ["#4E79A7", "#A0CBE8", "#F28E2B", "#FFBE7D", "#59A14F", | |
"#8CD17D", "#B6992D", "#F1CE63", "#499894", "#86BDB6", |